├── __init__.py
├── albert_task
│   ├── __init__.py
│   ├── albert
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   ├── modeling.cpython-36.pyc
│   │   │   ├── bert_utils.cpython-36.pyc
│   │   │   ├── optimization.cpython-36.pyc
│   │   │   ├── tokenization.cpython-36.pyc
│   │   │   └── optimization_finetuning.cpython-36.pyc
│   │   ├── create_pretrain_data.sh
│   │   ├── run.sh
│   │   ├── args.py
│   │   ├── test_changes.py
│   │   └── bert_utils.py
│   ├── ner_task
│   │   ├── __init__.py
│   │   ├── run.sh
│   │   ├── __pycache__
│   │   │   ├── model.cpython-36.pyc
│   │   │   ├── metrics.cpython-36.pyc
│   │   │   ├── bilstm_crf.cpython-36.pyc
│   │   │   └── data_helper.cpython-36.pyc
│   │   ├── test.py
│   │   ├── config
│   │   │   └── msraner_config.json
│   │   ├── README.md
│   │   ├── metrics.py
│   │   ├── predict.py
│   │   └── model.py
│   ├── ltr_pair_task
│   │   ├── __init__.py
│   │   ├── run.sh
│   │   ├── __pycache__
│   │   │   ├── model.cpython-36.pyc
│   │   │   ├── metrics.cpython-36.pyc
│   │   │   └── data_helper.cpython-36.pyc
│   │   ├── config.json
│   │   ├── metrics.py
│   │   ├── README.md
│   │   └── trainer.py
│   ├── classifier_task
│   │   ├── __init__.py
│   │   ├── run.sh
│   │   ├── __pycache__
│   │   │   ├── model.cpython-36.pyc
│   │   │   ├── metrics.cpython-36.pyc
│   │   │   └── data_helper.cpython-36.pyc
│   │   ├── test.py
│   │   ├── config
│   │   │   ├── inews_config.json
│   │   │   ├── tnews_config.json
│   │   │   └── thucnews_config.json
│   │   ├── README.md
│   │   ├── predict.py
│   │   ├── model.py
│   │   ├── metrics.py
│   │   └── trainer.py
│   ├── sentence_pair_task
│   │   ├── __init__.py
│   │   ├── run.sh
│   │   ├── __pycache__
│   │   │   ├── model.cpython-36.pyc
│   │   │   ├── metrics.cpython-36.pyc
│   │   │   └── data_helper.cpython-36.pyc
│   │   ├── test.py
│   │   ├── config
│   │   │   ├── bq_config.json
│   │   │   ├── xnli_config.json
│   │   │   └── lcqmc_config.json
│   │   ├── README.md
│   │   ├── metrics.py
│   │   ├── predict.py
│   │   ├── model.py
│   │   └── trainer.py
│   ├── machine_reading_task
│   │   ├── __init__.py
│   │   ├── config
│   │   │   └── cmrc_config.json
│   │   ├── test.py
│   │   └── trainer.py
│   └── ltr_point_task
│       ├── run.sh
│       ├── config.json
│       ├── README.md
│       └── metrics.py
├── bert_task
│   ├── __init__.py
│   ├── ner_task
│   │   ├── __init__.py
│   │   ├── run.sh
│   │   ├── __pycache__
│   │   │   ├── model.cpython-36.pyc
│   │   │   ├── metrics.cpython-36.pyc
│   │   │   ├── bilstm_crf.cpython-36.pyc
│   │   │   └── data_helper.cpython-36.pyc
│   │   ├── config
│   │   │   └── msraner_config.json
│   │   ├── README.md
│   │   ├── metrics.py
│   │   ├── predict.py
│   │   ├── model.py
│   │   └── trainer.py
│   ├── classifier_task
│   │   ├── __init__.py
│   │   ├── run.sh
│   │   ├── __pycache__
│   │   │   ├── model.cpython-36.pyc
│   │   │   ├── metrics.cpython-36.pyc
│   │   │   └── data_helper.cpython-36.pyc
│   │   ├── config
│   │   │   ├── inews_config.json
│   │   │   ├── tnews_config.json
│   │   │   └── thucnews_config.json
│   │   ├── README.md
│   │   ├── predict.py
│   │   ├── model.py
│   │   ├── metrics.py
│   │   └── trainer.py
│   ├── ltr_pair_task
│   │   ├── __init__.py
│   │   ├── run.sh
│   │   ├── __pycache__
│   │   │   ├── model.cpython-36.pyc
│   │   │   ├── metrics.cpython-36.pyc
│   │   │   └── data_helper.cpython-36.pyc
│   │   ├── config.json
│   │   ├── metrics.py
│   │   ├── README.md
│   │   └── trainer.py
│   ├── ltr_point_task
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── metrics.cpython-36.pyc
│   │   │   ├── model.cpython-36.pyc
│   │   │   └── data_helper.cpython-36.pyc
│   │   ├── config.json
│   │   ├── README.md
│   │   └── metrics.py
│   ├── machine_reading_task
│   │   ├── __init__.py
│   │   ├── run.sh
│   │   ├── __pycache__
│   │   │   ├── model.cpython-36.pyc
│   │   │   ├── metrics.cpython-36.pyc
│   │   │   └── data_helper.cpython-36.pyc
│   │   ├── config
│   │   │   └── cmrc_config.json
│   │   ├── README.md
│   │   ├── trainer.py
│   │   └── model.py
│   ├── sentence_pair_task
│   │   ├── __init__.py
│   │   ├── run.sh
│   │   ├── __pycache__
│   │   │   ├── model.cpython-36.pyc
│   │   │   ├── metrics.cpython-36.pyc
│   │   │   └── data_helper.cpython-36.pyc
│   │   ├── config
│   │   │   ├── bq_config.json
│   │   │   ├── lcqmc_config.json
│   │   │   └── xnli_config.json
│   │   ├── README.md
│   │   ├── metrics.py
│   │   ├── predict.py
│   │   ├── model.py
│   │   └── trainer.py
│   └── bert
│       ├── requirements.txt
│       ├── __init__.py
│       ├── CONTRIBUTING.md
│       ├── optimization_test.py
│       ├── .gitignore
│       ├── sample_text.txt
│       └── tokenization_test.py
└── README.md
/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/albert_task/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/bert_task/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/albert_task/albert/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/albert_task/ner_task/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/bert_task/ner_task/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/albert_task/ltr_pair_task/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/bert_task/classifier_task/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/bert_task/ltr_pair_task/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/bert_task/ltr_point_task/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/albert_task/classifier_task/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/albert_task/sentence_pair_task/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/bert_task/machine_reading_task/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/bert_task/sentence_pair_task/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/albert_task/machine_reading_task/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/albert_task/ltr_point_task/run.sh:
--------------------------------------------------------------------------------
python trainer.py --config_path=config.json
--------------------------------------------------------------------------------
/bert_task/ltr_pair_task/run.sh:
--------------------------------------------------------------------------------
python trainer.py --config_path=config.json
--------------------------------------------------------------------------------
/albert_task/ltr_pair_task/run.sh:
--------------------------------------------------------------------------------
python trainer.py --config_path=config.json
--------------------------------------------------------------------------------
/bert_task/ner_task/run.sh:
--------------------------------------------------------------------------------
python trainer.py --config_path=config/msraner_config.json
--------------------------------------------------------------------------------
/albert_task/ner_task/run.sh:
--------------------------------------------------------------------------------
python trainer.py --config_path=config/msraner_config.json
--------------------------------------------------------------------------------
/bert_task/classifier_task/run.sh:
--------------------------------------------------------------------------------
python trainer.py --config_path=config/tnews_config.json
--------------------------------------------------------------------------------
/albert_task/classifier_task/run.sh:
--------------------------------------------------------------------------------
python trainer.py --config_path=config/tnews_config.json
--------------------------------------------------------------------------------
/bert_task/machine_reading_task/run.sh:
--------------------------------------------------------------------------------
python trainer.py --config_path=config/cmrc_config.json
--------------------------------------------------------------------------------
/bert_task/sentence_pair_task/run.sh:
--------------------------------------------------------------------------------
python trainer.py --config_path=config/bq_config.json
--------------------------------------------------------------------------------
/albert_task/sentence_pair_task/run.sh:
--------------------------------------------------------------------------------
python trainer.py --config_path=config/bq_config.json
--------------------------------------------------------------------------------
/bert_task/bert/requirements.txt:
--------------------------------------------------------------------------------
tensorflow >= 1.11.0   # CPU Version of TensorFlow.
# tensorflow-gpu >= 1.11.0  # GPU version of TensorFlow.
--------------------------------------------------------------------------------
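
Every task above is launched the same way: run.sh passes a JSON config path to that task's trainer.py. The trainers themselves are not part of this listing, so the following is only a minimal sketch of what such an entry point presumably looks like (the flag name --config_path is taken from the run.sh files; everything else is illustrative):

```python
import argparse
import json


def main():
    # Parse the single flag that every run.sh above passes in.
    parser = argparse.ArgumentParser()
    parser.add_argument("--config_path", required=True,
                        help="path to a task config, e.g. config/tnews_config.json")
    args = parser.parse_args()

    # The configs are plain JSON; keys such as learning_rate and
    # batch_size are documented in each task's README.
    with open(args.config_path, "r") as fr:
        config = json.load(fr)

    print("training %s with batch_size=%d" % (config["model_name"],
                                              config["batch_size"]))


if __name__ == "__main__":
    main()
```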
/albert_task/classifier_task/test.py:
--------------------------------------------------------------------------------
import json

from predict import Predictor


with open("config/tnews_config.json", "r") as fr:
    config = json.load(fr)


predictor = Predictor(config)
text = "歼20座舱盖上的两条“花纹”是什么?"
res = predictor.predict(text)
print(res)
--------------------------------------------------------------------------------
/albert_task/sentence_pair_task/test.py:
--------------------------------------------------------------------------------
import json

from predict import Predictor


with open("config/bq_config.json", "r") as fr:
    config = json.load(fr)


predictor = Predictor(config)

text_a = "为什么我无法看到额度"
text_b = "为什么开通了却没有额度"
res = predictor.predict(text_a, text_b)
print(res)
--------------------------------------------------------------------------------
/albert_task/albert/create_pretrain_data.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

BERT_BASE_DIR=./albert_config
python3 create_pretraining_data.py --do_whole_word_mask=True --input_file=data/news_zh_1.txt \
--output_file=data/tf_news_2016_zh_raw_news2016zh_1.tfrecord --vocab_file=$BERT_BASE_DIR/vocab.txt --do_lower_case=True \
--max_seq_length=512 --max_predictions_per_seq=51 --masked_lm_prob=0.10
--------------------------------------------------------------------------------
/albert_task/ner_task/test.py:
--------------------------------------------------------------------------------
import json

from predict import Predictor

with open("config/msraner_config.json", "r") as fr:
    config = json.load(fr)

text = "中 共 中 央 致 中 国 致 公 党 十 一 大 的 贺 词"
text = text.split(" ")
predictor = Predictor(config)
chunks = predictor.predict(text)

for chunk in chunks:
    entity_name, start, end = chunk
    entity = "".join(text[start - 1: end])
    print(entity_name, entity)
--------------------------------------------------------------------------------
/albert_task/ltr_point_task/config.json:
--------------------------------------------------------------------------------
{
  "model_name": "ltr_point",
  "epochs": 5,
  "checkpoint_every": 2000,
  "eval_every": 2000,
  "learning_rate": 3e-5,
  "sequence_length": 64,
  "batch_size": 16,
  "neg_threshold": 0.4,
  "warmup_rate": 0.1,
  "output_path": "output",
  "bert_model_path": "../albert_model/albert_tiny",
  "train_data": "data/lcqmc/train.tsv",
  "eval_data": "data/lcqmc/dev.tsv",
  "ckpt_model_path": "ckpt_model/lcqmc"
}
--------------------------------------------------------------------------------
/bert_task/ltr_point_task/config.json:
--------------------------------------------------------------------------------
{
  "model_name": "ltr_point",
  "epochs": 5,
  "checkpoint_every": 10000,
  "eval_every": 10000,
  "learning_rate": 2e-5,
  "sequence_length": 64,
  "batch_size": 16,
  "neg_threshold": 0.4,
  "warmup_rate": 0.1,
  "output_path": "output",
  "bert_model_path": "../bert_model/chinese_L-12_H-768_A-12",
  "train_data": "data/dssm/train.tsv",
  "eval_data": "data/dssm/dev.tsv",
  "ckpt_model_path": "ckpt_model/"
}
--------------------------------------------------------------------------------
/albert_task/sentence_pair_task/config/bq_config.json:
--------------------------------------------------------------------------------
{
  "model_name": "sentence_pair",
  "epochs": 5,
  "checkpoint_every": 1000,
  "eval_every": 1000,
  "learning_rate": 5e-5,
  "sequence_length": 128,
  "batch_size": 32,
  "num_classes": 1,
  "warmup_rate": 0.1,
  "output_path": "output/bq",
  "bert_model_path": "../albert_model/albert_tiny",
  "train_data": "data/bq/dev.txt",
  "eval_data": "data/bq/dev.txt",
  "ckpt_model_path": "ckpt_model/bq"
}
--------------------------------------------------------------------------------
/albert_task/ltr_pair_task/config.json:
--------------------------------------------------------------------------------
{
  "model_name": "ltr_pair",
  "epochs": 5,
  "checkpoint_every": 500,
  "eval_every": 500,
  "learning_rate": 2e-5,
  "sequence_length": 32,
  "batch_size": 8,
  "num_samples": 2,
  "train_n_tasks": 1000,
  "eval_n_tasks": 500,
  "margin": 0.5,
  "warmup_rate": 0.1,
  "output_path": "output",
  "bert_model_path": "../albert_model/albert_tiny",
  "data": "data/data.json",
  "ckpt_model_path": "ckpt_model/"
}
--------------------------------------------------------------------------------
/bert_task/ltr_point_task/README.md:
--------------------------------------------------------------------------------
#### Config file reference


* model_name: model name
* epochs: number of training epochs
* checkpoint_every: save a checkpoint every this many steps
* eval_every: evaluate the model every this many steps
* learning_rate: learning rate; 2e-5, 5e-5 and 1e-4 are recommended
* sequence_length: sequence length; keep it at 128 or below on a single GPU
* batch_size: keep it at 32 or below on a single GPU
* neg_threshold: margin for negative samples in the contrastive loss
* warmup_rate: warm-up proportion during training; 0.05 or 0.1 is recommended
* output_path: output directory, used to store files such as label_to_index
* bert_model_path: path to the pre-trained model directory
* train_data: training data path
* eval_data: validation data path
* ckpt_model_path: directory where checkpoint files are saved
--------------------------------------------------------------------------------
/bert_task/sentence_pair_task/config/bq_config.json:
--------------------------------------------------------------------------------
{
  "model_name": "sentence_pair",
  "epochs": 5,
  "checkpoint_every": 1000,
  "eval_every": 1000,
  "learning_rate": 5e-5,
  "sequence_length": 128,
  "batch_size": 32,
  "num_classes": 1,
  "warmup_rate": 0.1,
  "output_path": "output/bq",
  "bert_model_path": "../bert_model/chinese_L-12_H-768_A-12",
  "train_data": "data/bq/dev.txt",
  "eval_data": "data/bq/dev.txt",
  "ckpt_model_path": "ckpt_model/bq"
}
--------------------------------------------------------------------------------
/albert_task/classifier_task/config/inews_config.json:
--------------------------------------------------------------------------------
{
  "model_name": "classifier",
  "epochs": 10,
  "checkpoint_every": 1000,
  "eval_every": 1000,
  "learning_rate": 5e-5,
  "sequence_length": 128,
  "batch_size": 32,
  "num_classes": 3,
  "warmup_rate": 0.1,
  "output_path": "output/inews",
  "bert_model_path": "../albert_model/albert_tiny",
  "train_data": "data/inews/dev.txt",
  "eval_data": "data/inews/dev.txt",
  "ckpt_model_path": "ckpt_model/inews"
}
--------------------------------------------------------------------------------
/albert_task/classifier_task/config/tnews_config.json:
--------------------------------------------------------------------------------
{
  "model_name": "classifier",
  "epochs": 10,
  "checkpoint_every": 1000,
  "eval_every": 1000,
  "learning_rate": 5e-5,
  "sequence_length": 128,
  "batch_size": 32,
  "num_classes": 15,
  "warmup_rate": 0.1,
  "output_path": "output/tnews",
  "bert_model_path": "../albert_model/albert_tiny",
  "train_data": "data/tnews/dev.txt",
  "eval_data": "data/tnews/dev.txt",
  "ckpt_model_path": "ckpt_model/tnews"
}
--------------------------------------------------------------------------------
/albert_task/ltr_point_task/README.md:
--------------------------------------------------------------------------------
#### Config file reference


* model_name: model name
* epochs: number of training epochs
* checkpoint_every: save a checkpoint every this many steps
* eval_every: evaluate the model every this many steps
* learning_rate: learning rate; 2e-5, 5e-5 and 1e-4 are recommended
* sequence_length: sequence length; keep it at 128 or below on a single GPU
* batch_size: keep it at 32 or below on a single GPU
* neg_threshold: margin for negative samples in the contrastive loss
* warmup_rate: warm-up proportion during training; 0.05 or 0.1 is recommended
* output_path: output directory, used to store files such as label_to_index
* bert_model_path: path to the pre-trained model directory
* train_data: training data path
* eval_data: validation data path
* ckpt_model_path: directory where checkpoint files are saved
--------------------------------------------------------------------------------
/albert_task/sentence_pair_task/config/xnli_config.json:
--------------------------------------------------------------------------------
{
  "model_name": "sentence_pair",
  "epochs": 5,
  "checkpoint_every": 10000,
  "eval_every": 10000,
  "learning_rate": 2e-5,
  "sequence_length": 128,
  "batch_size": 32,
  "num_classes": 3,
  "warmup_rate": 0.1,
  "output_path": "output/xnli",
  "bert_model_path": "../albert_model/albert_tiny",
  "train_data": "data/xnli/dev.txt",
  "eval_data": "data/xnli/dev.txt",
  "ckpt_model_path": "ckpt_model/xnli"
}
--------------------------------------------------------------------------------
/albert_task/sentence_pair_task/config/lcqmc_config.json:
--------------------------------------------------------------------------------
{
  "model_name": "sentence_pair",
  "epochs": 5,
  "checkpoint_every": 1000,
  "eval_every": 1000,
  "learning_rate": 5e-5,
  "sequence_length": 128,
  "batch_size": 32,
  "num_classes": 1,
  "warmup_rate": 0.1,
  "output_path": "output/lcqmc",
  "bert_model_path": "../albert_model/albert_tiny",
  "train_data": "data/lcqmc/dev.txt",
  "eval_data": "data/lcqmc/dev.txt",
  "ckpt_model_path": "ckpt_model/lcqmc"
}
--------------------------------------------------------------------------------
/bert_task/ltr_pair_task/config.json:
--------------------------------------------------------------------------------
{
  "model_name": "ltr_pair",
  "epochs": 5,
  "checkpoint_every": 500,
  "eval_every": 500,
  "learning_rate": 2e-5,
  "sequence_length": 32,
  "batch_size": 8,
  "num_samples": 2,
  "train_n_tasks": 100000,
  "eval_n_tasks": 500,
  "margin": 0.5,
  "warmup_rate": 0.1,
  "output_path": "output",
  "bert_model_path": "../bert_model/chinese_L-12_H-768_A-12",
  "data": "data/data.json",
  "ckpt_model_path": "ckpt_model/"
}
--------------------------------------------------------------------------------
/bert_task/classifier_task/config/inews_config.json:
--------------------------------------------------------------------------------
{
  "model_name": "classifier",
  "epochs": 10,
  "checkpoint_every": 1000,
  "eval_every": 1000,
  "learning_rate": 5e-5,
  "sequence_length": 128,
  "batch_size": 32,
  "num_classes": 3,
  "warmup_rate": 0.1,
  "output_path": "output/inews",
  "bert_model_path": "../bert_model/chinese_L-12_H-768_A-12",
  "train_data": "data/inews/train.txt",
  "eval_data": "data/inews/dev.txt",
  "ckpt_model_path": "ckpt_model/inews"
}
--------------------------------------------------------------------------------
/bert_task/classifier_task/config/tnews_config.json:
--------------------------------------------------------------------------------
{
  "model_name": "classifier",
  "epochs": 10,
  "checkpoint_every": 1000,
  "eval_every": 1000,
  "learning_rate": 5e-5,
  "sequence_length": 128,
  "batch_size": 32,
  "num_classes": 15,
  "warmup_rate": 0.1,
  "output_path": "output/tnews",
  "bert_model_path": "../bert_model/chinese_L-12_H-768_A-12",
  "train_data": "data/tnews/train.txt",
  "eval_data": "data/tnews/dev.txt",
  "ckpt_model_path": "ckpt_model/tnews"
}
--------------------------------------------------------------------------------
/bert_task/sentence_pair_task/config/lcqmc_config.json:
--------------------------------------------------------------------------------
{
  "model_name": "sentence_pair",
  "epochs": 5,
  "checkpoint_every": 1000,
  "eval_every": 1000,
  "learning_rate": 5e-5,
  "sequence_length": 128,
  "batch_size": 32,
  "num_classes": 1,
  "warmup_rate": 0.1,
  "output_path": "output/lcqmc",
  "bert_model_path": "../bert_model/chinese_L-12_H-768_A-12",
  "train_data": "data/lcqmc/dev.txt",
  "eval_data": "data/lcqmc/dev.txt",
  "ckpt_model_path": "ckpt_model/lcqmc"
}
--------------------------------------------------------------------------------
/bert_task/sentence_pair_task/config/xnli_config.json:
--------------------------------------------------------------------------------
{
  "model_name": "sentence_pair",
  "epochs": 5,
  "checkpoint_every": 10000,
  "eval_every": 10000,
  "learning_rate": 2e-5,
  "sequence_length": 128,
  "batch_size": 32,
  "num_classes": 3,
  "warmup_rate": 0.1,
  "output_path": "output/xnli",
  "bert_model_path": "../bert_model/chinese_L-12_H-768_A-12",
  "train_data": "data/xnli/dev.txt",
  "eval_data": "data/xnli/dev.txt",
  "ckpt_model_path": "ckpt_model/xnli"
}
--------------------------------------------------------------------------------
/albert_task/classifier_task/config/thucnews_config.json:
--------------------------------------------------------------------------------
{
  "model_name": "classifier",
  "epochs": 10,
  "checkpoint_every": 1000,
  "eval_every": 1000,
  "learning_rate": 5e-5,
  "sequence_length": 128,
  "batch_size": 32,
  "num_classes": 14,
  "warmup_rate": 0.1,
  "output_path": "output/thucnews",
  "bert_model_path": "../albert_model/albert_tiny",
  "train_data": "data/thucnews/dev.txt",
  "eval_data": "data/thucnews/dev.txt",
  "ckpt_model_path": "ckpt_model/thucnews"
}
--------------------------------------------------------------------------------
/bert_task/classifier_task/config/thucnews_config.json:
--------------------------------------------------------------------------------
{
  "model_name": "classifier",
  "epochs": 10,
  "checkpoint_every": 1000,
  "eval_every": 1000,
  "learning_rate": 5e-5,
  "sequence_length": 128,
  "batch_size": 32,
  "num_classes": 14,
  "warmup_rate": 0.1,
  "output_path": "output/thucnews",
  "bert_model_path": "../bert_model/chinese_L-12_H-768_A-12",
  "train_data": "data/thucnews/train.txt",
  "eval_data": "data/thucnews/dev.txt",
  "ckpt_model_path": "ckpt_model/thucnews"
}
--------------------------------------------------------------------------------
/albert_task/sentence_pair_task/README.md:
--------------------------------------------------------------------------------
#### Config file reference

##### Using bq_config.json as an example

* model_name: model name
* epochs: number of training epochs
* checkpoint_every: save a checkpoint every this many steps
* eval_every: evaluate the model every this many steps
* learning_rate: learning rate; 2e-5, 5e-5 and 1e-4 are recommended
* sequence_length: sequence length; keep it at 128 or below on a single GPU
* batch_size: keep it at 32 or below on a single GPU
* num_classes: number of classes; set it to 1 for binary classification
* warmup_rate: warm-up proportion during training; 0.05 or 0.1 is recommended
* output_path: output directory, used to store files such as label_to_index
* bert_model_path: path to the pre-trained model directory
* train_data: training data path
* eval_data: validation data path
* ckpt_model_path: directory where checkpoint files are saved
--------------------------------------------------------------------------------
/bert_task/sentence_pair_task/README.md:
--------------------------------------------------------------------------------
#### Config file reference

##### Using bq_config.json as an example

* model_name: model name
* epochs: number of training epochs
* checkpoint_every: save a checkpoint every this many steps
* eval_every: evaluate the model every this many steps
* learning_rate: learning rate; 2e-5, 5e-5 and 1e-4 are recommended
* sequence_length: sequence length; keep it at 128 or below on a single GPU
* batch_size: keep it at 32 or below on a single GPU
* num_classes: number of classes; set it to 1 for binary classification
* warmup_rate: warm-up proportion during training; 0.05 or 0.1 is recommended
* output_path: output directory, used to store files such as label_to_index
* bert_model_path: path to the pre-trained model directory
* train_data: training data path
* eval_data: validation data path
* ckpt_model_path: directory where checkpoint files are saved
--------------------------------------------------------------------------------
/albert_task/classifier_task/README.md:
--------------------------------------------------------------------------------
#### Config file reference

##### Using inews_config.json as an example

* model_name: model name
* epochs: number of training epochs
* checkpoint_every: save a checkpoint every this many steps
* eval_every: evaluate the model every this many steps
* learning_rate: learning rate; 2e-5, 5e-5 and 1e-4 are recommended
* sequence_length: sequence length; keep it at 128 or below on a single GPU
* batch_size: keep it at 32 or below on a single GPU
* num_classes: number of text-classification classes; set it to 1 for binary classification
* warmup_rate: warm-up proportion during training; 0.05 or 0.1 is recommended
* output_path: output directory, used to store files such as label_to_index
* bert_model_path: path to the pre-trained model directory
* train_data: training data path
* eval_data: validation data path
* ckpt_model_path: directory where checkpoint files are saved
--------------------------------------------------------------------------------
/bert_task/classifier_task/README.md:
--------------------------------------------------------------------------------
#### Config file reference

##### Using inews_config.json as an example

* model_name: model name
* epochs: number of training epochs
* checkpoint_every: save a checkpoint every this many steps
* eval_every: evaluate the model every this many steps
* learning_rate: learning rate; 2e-5, 5e-5 and 1e-4 are recommended
* sequence_length: sequence length; keep it at 128 or below on a single GPU
* batch_size: keep it at 32 or below on a single GPU
* num_classes: number of text-classification classes; set it to 1 for binary classification
* warmup_rate: warm-up proportion during training; 0.05 or 0.1 is recommended
* output_path: output directory, used to store files such as label_to_index
* bert_model_path: path to the pre-trained model directory
* train_data: training data path
* eval_data: validation data path
* ckpt_model_path: directory where checkpoint files are saved
--------------------------------------------------------------------------------
/albert_task/ltr_pair_task/metrics.py:
--------------------------------------------------------------------------------
"""
Evaluation metrics.
"""
from sklearn.metrics import roc_auc_score


def mean(item: list) -> float:
    """
    Compute the mean of the elements in a list.
    :param item: list of numbers
    :return: the mean, or 0 for an empty list
    """
    res = sum(item) / len(item) if len(item) > 0 else 0
    return res


def accuracy(pred_ys):
    """
    Ranking accuracy: a prediction is counted as correct when it equals 0.
    :param pred_ys: list of predicted ranks
    :return: accuracy rounded to 4 decimal places
    """
    correct = 0
    for pred_y in pred_ys:
        if pred_y == 0:
            correct += 1

    return round(correct / len(pred_ys), 4)
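
Several READMEs above recommend warmup_rate values of 0.05 or 0.1 without spelling out what warm-up does. In the BERT-style optimizer shipped with the bert/albert sources, it means the learning rate ramps up linearly from 0 over the first warmup_rate fraction of training steps before the usual linear decay applies. A plain-Python sketch of that schedule (the function name is illustrative; it is not a function from this repo):

```python
def bert_learning_rate(init_lr, step, num_train_steps, warmup_rate=0.1):
    """Linear warm-up over the first warmup_rate fraction of steps,
    then linear (polynomial, power=1) decay to 0 over the whole run,
    as in BERT's optimization.py."""
    num_warmup_steps = int(num_train_steps * warmup_rate)
    if step < num_warmup_steps:
        # Ramp the learning rate up from 0 during warm-up.
        return init_lr * float(step) / max(1, num_warmup_steps)
    # Afterwards, decay linearly toward 0 at the end of training.
    return init_lr * (1.0 - float(step) / num_train_steps)
```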
--------------------------------------------------------------------------------
/albert_task/ner_task/config/msraner_config.json:
--------------------------------------------------------------------------------
{
  "model_name": "ner",
  "epochs": 5,
  "checkpoint_every": 1000,
  "eval_every": 1000,
  "learning_rate": 2e-5,
  "sequence_length": 128,
  "ner_layers": [128],
  "ner_hidden_sizes": [128],
  "batch_size": 16,
  "num_classes": 7,
  "keep_prob": 0.9,
  "warmup_rate": 0.1,
  "output_path": "output",
  "bert_model_path": "../albert_model/albert_tiny",
  "train_data": "data/msraner/test.txt",
  "eval_data": "data/msraner/test.txt",
  "ckpt_model_path": "ckpt_model/"
}
--------------------------------------------------------------------------------
/bert_task/ltr_pair_task/metrics.py:
--------------------------------------------------------------------------------
"""
Evaluation metrics.
"""
from sklearn.metrics import roc_auc_score


def mean(item: list) -> float:
    """
    Compute the mean of the elements in a list.
    :param item: list of numbers
    :return: the mean, or 0 for an empty list
    """
    res = sum(item) / len(item) if len(item) > 0 else 0
    return res


def accuracy(pred_ys):
    """
    Ranking accuracy: a prediction is counted as correct when it equals 0.
    :param pred_ys: list of predicted ranks
    :return: accuracy rounded to 4 decimal places
    """
    correct = 0
    for pred_y in pred_ys:
        if pred_y == 0:
            correct += 1

    return round(correct / len(pred_ys), 4)
--------------------------------------------------------------------------------
/albert_task/ltr_pair_task/README.md:
--------------------------------------------------------------------------------
#### Config file reference


* model_name: model name
* epochs: number of training epochs
* checkpoint_every: save a checkpoint every this many steps
* eval_every: evaluate the model every this many steps
* learning_rate: learning rate; 2e-5, 5e-5 and 1e-4 are recommended
* sequence_length: sequence length; keep it at 128 or below on a single GPU
* batch_size: keep it at 32 or below on a single GPU
* num_samples: number of samples drawn per query
* train_n_tasks: number of training tasks sampled per epoch
* eval_n_tasks: number of validation tasks sampled per evaluation
* margin: margin in the triplet loss; values between 0.5 and 0.7 are recommended
* warmup_rate: warm-up proportion during training; 0.05 or 0.1 is recommended
* output_path: output directory, used to store files such as label_to_index
* bert_model_path: path to the pre-trained model directory
* data: data path
* ckpt_model_path: directory where checkpoint files are saved
--------------------------------------------------------------------------------
/bert_task/ltr_pair_task/README.md:
--------------------------------------------------------------------------------
#### Config file reference


* model_name: model name
* epochs: number of training epochs
* checkpoint_every: save a checkpoint every this many steps
* eval_every: evaluate the model every this many steps
* learning_rate: learning rate; 2e-5, 5e-5 and 1e-4 are recommended
* sequence_length: sequence length; keep it at 128 or below on a single GPU
* batch_size: keep it at 32 or below on a single GPU
* num_samples: number of samples drawn per query
* train_n_tasks: number of training tasks sampled per epoch
* eval_n_tasks: number of validation tasks sampled per evaluation
* margin: margin in the triplet loss; values between 0.5 and 0.7 are recommended
* warmup_rate: warm-up proportion during training; 0.05 or 0.1 is recommended
* output_path: output directory, used to store files such as label_to_index
* bert_model_path: path to the pre-trained model directory
* data: data path
* ckpt_model_path: directory where checkpoint files are saved
--------------------------------------------------------------------------------
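
The ltr_pair READMEs above describe a pair-wise setup scored with a triplet-loss margin between one positive and several negatives per query. The actual model.py is not part of this listing, so this is only a generic TensorFlow sketch of the margin term they describe (illustrative function name and shapes):

```python
import tensorflow as tf


def triplet_loss(pos_sim, neg_sim, margin=0.5):
    """Hinge-style triplet loss on similarity scores.

    pos_sim: [batch] similarity(query, positive sample)
    neg_sim: [batch] similarity(query, negative sample)
    margin: the positive must beat the negative by at least this much
    """
    return tf.reduce_mean(tf.maximum(0.0, margin - (pos_sim - neg_sim)))
```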
"../bert_model/chinese_L-12_H-768_A-12", 16 | "train_data": "data/msraner/test.txt", 17 | "eval_data": "data/msraner/test.txt", 18 | "ckpt_model_path": "ckpt_model/" 19 | } -------------------------------------------------------------------------------- /bert_task/bert/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /albert_task/ner_task/README.md: -------------------------------------------------------------------------------- 1 | #### config文件解读 2 | 3 | ##### albert + bilstm + crf 4 | ##### 以inews_config.json为例 5 | 6 | * model_name:模型名称 7 | * epochs:迭代epoch的数量 8 | * checkpoint_every:间隔多少步保存一次模型 9 | * eval_every:间隔多少步验证一次模型 10 | * learning_rate:学习速率,推荐2e-5, 5e-5, 1e-4 11 | * sequence_length:序列长度,单GPU时不要超过128 12 | * batch_size:单GPU时不要超过32 13 | * ner_layers:lstm中的隐层大小 14 | * ner_hidden_sizes:bilstm-ner中的全连接层中的隐层大小 15 | * keep_prob:bilstm-ner中全连接层中的dropout比例,该值等于1-dropout rate 16 | * num_classes:文本分类的类别数量 17 | * warmup_rate:训练时的预热比例,建议0.05, 0.1 18 | * output_path:输出文件夹,用来存储label_to_index等文件 19 | * bert_model_path:预训练模型文件夹路径 20 | * train_data:训练数据路径 21 | * eval_data:验证数据路径 22 | * ckpt_model_path:checkpoint模型文件保存路径 -------------------------------------------------------------------------------- /bert_task/ner_task/README.md: -------------------------------------------------------------------------------- 1 | #### config文件解读 2 | 3 | ##### albert + bilstm + crf 4 | ##### 以inews_config.json为例 5 | 6 | * model_name:模型名称 7 | * epochs:迭代epoch的数量 8 | * checkpoint_every:间隔多少步保存一次模型 9 | * eval_every:间隔多少步验证一次模型 10 | * learning_rate:学习速率,推荐2e-5, 5e-5, 1e-4 11 | * sequence_length:序列长度,单GPU时不要超过128 12 | * batch_size:单GPU时不要超过32 13 | * ner_layers:lstm中的隐层大小 14 | * ner_hidden_sizes:bilstm-ner中的全连接层中的隐层大小 15 | * keep_prob:bilstm-ner中全连接层中的dropout比例,该值等于1-dropout rate 16 | * num_classes:文本分类的类别数量 17 | * warmup_rate:训练时的预热比例,建议0.05, 0.1 18 | * output_path:输出文件夹,用来存储label_to_index等文件 19 | * bert_model_path:预训练模型文件夹路径 20 | * train_data:训练数据路径 21 | * eval_data:验证数据路径 22 | * ckpt_model_path:checkpoint模型文件保存路径 -------------------------------------------------------------------------------- /albert_task/machine_reading_task/config/cmrc_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name": "machine_reading", 3 | "epochs": 10, 4 | "checkpoint_every": 1000, 5 | "eval_every": 1000, 6 | "learning_rate": 5e-5, 7 | "max_length": 512, 8 | "doc_stride": 128, 9 | "query_length": 64, 10 | "max_answer_length": 30, 11 | "n_best_size": 20, 12 | "batch_size": 32, 13 | "warmup_rate": 0.1, 14 | "output_path": "output/cmrc2018", 15 | "output_predictions_path": "output/cmrc2018/predictions.json", 16 | "output_nbest_path": "output/cmrc2018/nbset.json", 17 | "bert_model_path": 
"../albert_model/albert_tiny", 18 | "train_data": "data/cmrc2018/cmrc2018_train.json", 19 | "eval_data": "data/cmrc2018/cmrc2018_dev.json", 20 | "ckpt_model_path": "ckpt_model/cmrc2018" 21 | } -------------------------------------------------------------------------------- /bert_task/machine_reading_task/config/cmrc_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name": "machine_reading", 3 | "epochs": 10, 4 | "checkpoint_every": 1000, 5 | "eval_every": 1000, 6 | "learning_rate": 2e-5, 7 | "max_length": 512, 8 | "doc_stride": 128, 9 | "query_length": 64, 10 | "max_answer_length": 30, 11 | "n_best_size": 20, 12 | "batch_size": 8, 13 | "warmup_rate": 0.1, 14 | "output_path": "output/cmrc2018", 15 | "output_predictions_path": "output/cmrc2018/predictions.json", 16 | "output_nbest_path": "output/cmrc2018/nbset.json", 17 | "bert_model_path": "../bert_model/chinese_L-12_H-768_A-12", 18 | "train_data": "data/cmrc2018/cmrc2018_train.json", 19 | "eval_data": "data/cmrc2018/cmrc2018_dev.json", 20 | "ckpt_model_path": "ckpt_model/cmrc2018" 21 | } -------------------------------------------------------------------------------- /bert_task/machine_reading_task/README.md: -------------------------------------------------------------------------------- 1 | #### config文件解读 2 | 3 | ##### 以cmrc_config.json为例 4 | 5 | * model_name:模型名称 6 | * epochs:迭代epoch的数量 7 | * checkpoint_every:间隔多少步保存一次模型 8 | * eval_every:间隔多少步验证一次模型 9 | * learning_rate:学习速率,推荐2e-5, 5e-5, 1e-4 10 | * max_length:输入到模型中的最大长度,建议设置为512 11 | * doc_stride:对于context长度较长的时候会分成多个doc,采用滑动窗口的形式分doc,这个是滑动窗口的大小,建议设为128 12 | * query_length:输入的问题的最大长度 13 | * max_answer_length:生成的回答的最大长度 14 | * n_best_size:获取分数最高的前n个 15 | * batch_size:单GPU时不要超过32 16 | * num_classes:文本分类的类别数量 17 | * warmup_rate:训练时的预热比例,建议0.05, 0.1 18 | * output_path:输出文件夹,用来存储label_to_index等文件 19 | * output_predictions_path:训练时在验证集上预测的最佳结果保存路径 20 | * output_nbest_path:训练时在验证集上预测的n个最佳结果的保存路径 21 | * bert_model_path:预训练模型文件夹路径 22 | * train_data:训练数据路径 23 | * eval_data:验证数据路径 24 | * ckpt_model_path:checkpoint模型文件保存路径 -------------------------------------------------------------------------------- /albert_task/albert/run.sh: -------------------------------------------------------------------------------- 1 | python run_classifier.py --task_name=lcqmc --do_train=true --do_eval=true --data_dir=../task_data/lcqmc --vocab_file=pre_trained_model/albert_tiny/vocab.txt --bert_config_file=pre_trained_model/albert_tiny/albert_config_tiny.json --max_seq_length=128 --train_batch_size=64 --learning_rate=1e-4 --num_train_epochs=5 --output_dir=output/lcqmc --init_checkpoint=pre_trained_model/albert_tiny/albert_model.ckpt 2 | 3 | 4 | python run_classifier.py --task_name=tnews --do_train=true --do_eval=true --data_dir=../task_data/tnews --vocab_file=pre_trained_model/albert_large/vocab.txt --bert_config_file=pre_trained_model/albert_large/albert_config_large.json --max_seq_length=128 --train_batch_size=8 --learning_rate=2e-5 --num_train_epochs=5 --output_dir=output/tnews --init_checkpoint=pre_trained_model/albert_large/albert_model.ckpt 5 | -------------------------------------------------------------------------------- /albert_task/albert/args.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | 4 | tf.logging.set_verbosity(tf.logging.INFO) 5 | 6 | file_path = os.path.dirname(__file__) 7 | 8 | 9 | #模型目录 10 | model_dir = os.path.join(file_path, 
/albert_task/albert/args.py:
--------------------------------------------------------------------------------
import os
import tensorflow as tf

tf.logging.set_verbosity(tf.logging.INFO)

file_path = os.path.dirname(__file__)


# model directory
model_dir = os.path.join(file_path, 'albert_lcqmc_checkpoints/')

# config file
config_name = os.path.join(file_path, 'albert_config/albert_config.json')
# checkpoint file name
ckpt_name = os.path.join(model_dir, 'model.ckpt')
# output directory
output_dir = os.path.join(file_path, 'albert_lcqmc_checkpoints/')
# vocab file path
vocab_file = os.path.join(file_path, 'albert_config/vocab.txt')
# data directory
data_dir = os.path.join(file_path, 'data/')

num_train_epochs = 10
batch_size = 128
learning_rate = 0.00005

# GPU memory fraction
gpu_memory_fraction = 0.8

# by default, take the output of the second-to-last layer as the sentence vector
layer_indexes = [-2]

# maximum sequence length; consider lowering this value for single short texts
max_seq_len = 128

# graph file name
graph_file = os.path.join(file_path, 'albert_lcqmc_checkpoints/graph')
--------------------------------------------------------------------------------
/albert_task/machine_reading_task/test.py:
--------------------------------------------------------------------------------
import json
from predict import Predictor


with open("config/cmrc_config.json", "r") as fr:
    config = json.load(fr)


predictor = Predictor(config)
query = "锣鼓经运用的程式是什么?"
context = "锣鼓经是大陆传统器乐及戏曲里面常用的打击乐记谱方法,以中文字的声音模拟敲击乐的声音,纪录打击乐的各种不同的" \
          "演奏方法。常用的节奏型称为「锣鼓点」。而锣鼓是戏曲节奏的支柱,除了加强演员身段动作的节奏感,也作为音乐的引子" \
          "和尾声,提示音乐的板式和速度,以及作为唱腔和念白的伴奏,令诗句的韵律更加抑扬顿锉,段落分明。锣鼓的运用有约定" \
          "俗成的程式,依照角色行当的身份、性格、情绪以及环境,配合相应的锣鼓点。锣鼓亦可以模仿大自然的音响效果," \
          "如雷电、波浪等等。戏曲锣鼓所运用的敲击乐器主要分为鼓、锣、钹和板四类型:鼓类包括有单皮鼓(板鼓)、大鼓、" \
          "大堂鼓(唐鼓)、小堂鼓、怀鼓、花盆鼓等;锣类有大锣、小锣(手锣)、钲锣、筛锣、马锣、镗锣、云锣;钹类有铙钹、" \
          "大钹、小钹、水钹、齐钹、镲钹、铰子、碰钟等;打拍子用的檀板、木鱼、梆子等。因为京剧的锣鼓通常由四位乐师负责," \
          "又称为四大件,领奏的师傅称为:「鼓佬」,其职责有如西方乐队的指挥,负责控制速度以及利用各种手势提示乐师演奏不" \
          "同的锣鼓点。粤剧吸收了部份京剧的锣鼓,但以木鱼和沙的代替了京剧的板和鼓,作为打拍子的主要乐器。以下是京剧、" \
          "昆剧和粤剧锣鼓中乐器对应的口诀用字:"

answer = predictor.predict(query, context)
print(answer)
--------------------------------------------------------------------------------
/bert_task/bert/CONTRIBUTING.md:
--------------------------------------------------------------------------------
# How to Contribute

BERT needs to maintain permanent compatibility with the pre-trained model files,
so we do not plan to make any major changes to this library (other than what was
promised in the README). However, we can accept small patches related to
re-factoring and documentation. To submit contributions, there are just a few
small guidelines you need to follow.

## Contributor License Agreement

Contributions to this project must be accompanied by a Contributor License
Agreement. You (or your employer) retain the copyright to your contribution;
this simply gives us permission to use and redistribute your contributions as
part of the project. Head over to https://cla.developers.google.com/ to see
your current agreements on file or to sign a new one.

You generally only need to submit a CLA once, so if you've already submitted one
(even if it was for a different project), you probably don't need to do it
again.

## Code reviews

All submissions, including submissions by project members, require review. We
use GitHub pull requests for this purpose. Consult
[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
information on using pull requests.

## Community Guidelines

This project follows
[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
### Applying BERT and ALBERT to downstream tasks
#### This project provides easy-to-use training and prediction modes that can be deployed directly, and it is easy to extend to any downstream task

#### The contents of the albert_task and bert_task folders are essentially the same
* albert_task/albert is the ALBERT source code
* albert_task/albert_model contains four ALBERT models: albert_tiny, albert_base, albert_large, albert_xlarge
* bert_task/bert is the BERT source code
* bert_task/bert_model contains the Chinese BERT model
* Download the ALBERT pre-trained models into albert_task and the BERT pre-trained models into bert_task
* The pre-trained model paths can be configured in the xxx_config.json files

#### Five task families are currently provided: classifier, sentence pair, ner, learning to rank (pair wise) and machine reading. The benchmark datasets come from ChineseGLUE
* classifier includes tnews, inews, thucnews
* sentence pair includes bq, lcqmc, xnli
* ner includes msraner
* learning to rank (pair wise) uses data from the **基于Adversarial Attack的问题等价性判别比赛** (question-equivalence judgment under adversarial attack) competition on biendata
* machine reading includes cmrc2018

#### Each task follows the same structure
* config: per-task configuration files, covering training parameters, data paths and model save paths
* data_helper.py: data preprocessing
* metrics.py: evaluation metrics
* model.py: the model; BERT can easily be combined with downstream network layers here
* trainer.py: model training
* predict.py: prediction code; just instantiate the Predictor class and call its predict method

#### Training data formats
##### Text classification
* title\tcontent\tlabel: some datasets have a title, others only a body; title, content and label are separated by a tab (\t).
##### Sentence pair
* sentence A\tsentence B\tlabel: the two sentences and the label are likewise tab-separated.
##### NER
###### We use BIO tagging; BIOS, BIEO or BIEOS tagging also works. Tokens and tags in the input are separated by \t.
* 慕 名 前 来 品 尝 玉 峰 茶 , 领 略 茶 文 化 的 人 越 来 越 多 。\o o o o o o B-ns I-ns o o o o o o o o o o o o o o
##### Reading comprehension
* context: the context for extractive reading comprehension
* question: the question
* answer: the answer, a span extracted from the context
* start_position: start position of the answer
* end_position: end position of the answer
##### learning_to_rank
* point wise: randomly sample positive and negative pairs to form a pair-classification problem; the data format is the same as for sentence pairs.
* pair wise: given a query, draw one positive sample similar to the query and several negative samples dissimilar to it.
#### Training
* Run the shell script under each task: sh run.sh. Different models can be trained simply by switching the config file

#### Prediction
* Run the test.py file under each task in albert_task to make predictions; bert_task works the same way as albert_task.
--------------------------------------------------------------------------------
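
The README above specifies the on-disk data formats but not a parser. Assuming the tab separator it describes, a minimal reader for the classification and sentence-pair files could look like this (the project's real reader is each task's data_helper.py, which is not part of this listing; the function name is illustrative):

```python
def read_tsv_rows(path, n_fields=3):
    """Read lines of n_fields tab-separated columns,
    e.g. title\tcontent\tlabel or sentence_a\tsentence_b\tlabel."""
    rows = []
    with open(path, "r", encoding="utf-8") as fr:
        for line in fr:
            parts = line.rstrip("\n").split("\t")
            if len(parts) == n_fields:  # skip malformed lines
                rows.append(tuple(parts))
    return rows
```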
15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import optimization 20 | import tensorflow as tf 21 | 22 | 23 | class OptimizationTest(tf.test.TestCase): 24 | 25 | def test_adam(self): 26 | with self.test_session() as sess: 27 | w = tf.get_variable( 28 | "w", 29 | shape=[3], 30 | initializer=tf.constant_initializer([0.1, -0.2, -0.1])) 31 | x = tf.constant([0.4, 0.2, -0.5]) 32 | loss = tf.reduce_mean(tf.square(x - w)) 33 | tvars = tf.trainable_variables() 34 | grads = tf.gradients(loss, tvars) 35 | global_step = tf.train.get_or_create_global_step() 36 | optimizer = optimization.AdamWeightDecayOptimizer(learning_rate=0.2) 37 | train_op = optimizer.apply_gradients(zip(grads, tvars), global_step) 38 | init_op = tf.group(tf.global_variables_initializer(), 39 | tf.local_variables_initializer()) 40 | sess.run(init_op) 41 | for _ in range(100): 42 | sess.run(train_op) 43 | w_np = sess.run(w) 44 | self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2) 45 | 46 | 47 | if __name__ == "__main__": 48 | tf.test.main() 49 | -------------------------------------------------------------------------------- /bert_task/bert/.gitignore: -------------------------------------------------------------------------------- 1 | # Initially taken from Github's Python gitignore file 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # IPython 79 | profile_default/ 80 | ipython_config.py 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # celery beat schedule file 86 | celerybeat-schedule 87 | 88 | # SageMath parsed files 89 | *.sage.py 90 | 91 | # Environments 92 | .env 93 | .venv 94 | env/ 95 | venv/ 96 | ENV/ 97 | env.bak/ 98 | venv.bak/ 99 | 100 | # Spyder project settings 101 | .spyderproject 102 | .spyproject 103 | 104 | # Rope project settings 105 | .ropeproject 106 | 107 | # mkdocs documentation 108 | /site 109 | 110 | # mypy 111 | .mypy_cache/ 112 | .dmypy.json 113 | dmypy.json 114 | 115 | # Pyre type checker 116 | .pyre/ 117 | -------------------------------------------------------------------------------- /albert_task/albert/test_changes.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import tensorflow as tf 3 | from modeling import embedding_lookup_factorized,transformer_model 4 | import os 5 | 6 | """ 7 | 测试albert主要的改进点:词嵌入的因式分解、层间参数共享、段落间连贯性 8 | test main change of albert from bert 9 | """ 10 | batch_size = 2048 11 | sequence_length = 512 12 | vocab_size = 30000 13 | hidden_size = 1024 14 | num_attention_heads = int(hidden_size / 64) 15 | 16 | def get_total_parameters(): 17 | """ 18 | get total parameters of a graph 19 | :return: 20 | """ 21 | total_parameters = 0 22 | for variable in tf.trainable_variables(): 23 | # shape is an array of tf.Dimension 24 | shape = variable.get_shape() 25 | # print(shape) 26 | # print(len(shape)) 27 | variable_parameters = 1 28 | for dim in shape: 29 | # print(dim) 30 | variable_parameters *= dim.value 31 | # print(variable_parameters) 32 | total_parameters += variable_parameters 33 | return total_parameters 34 | 35 | def test_factorized_embedding(): 36 | """ 37 | test of Factorized embedding parameterization 38 | :return: 39 | """ 40 | input_ids=tf.zeros((batch_size, sequence_length),dtype=tf.int32) 41 | output, embedding_table, embedding_table_2=embedding_lookup_factorized(input_ids,vocab_size,hidden_size) 42 | print("output:",output) 43 | 44 | def test_share_parameters(): 45 | """ 46 | test of share parameters across all layers: how many parameter after share parameter across layers of transformer. 47 | :return: 48 | """ 49 | def total_parameters_transformer(share_parameter_across_layers): 50 | input_tensor=tf.zeros((batch_size, sequence_length, hidden_size),dtype=tf.float32) 51 | print("transformer_model. 
input:",input_tensor) 52 | transformer_result=transformer_model(input_tensor,hidden_size=hidden_size,num_attention_heads=num_attention_heads,share_parameter_across_layers=share_parameter_across_layers) 53 | print("transformer_result:",transformer_result) 54 | total_parameters=get_total_parameters() 55 | print('total_parameters(not share):',total_parameters) 56 | 57 | share_parameter_across_layers=False 58 | total_parameters_transformer(share_parameter_across_layers) # total parameters, not share: 125,976,576 = 125 million 59 | 60 | tf.reset_default_graph() # Clears the default graph stack and resets the global default graph 61 | share_parameter_across_layers=True 62 | total_parameters_transformer(share_parameter_across_layers) # total parameters, share: 10,498,048 = 10.5 million 63 | 64 | def test_sentence_order_prediction(): 65 | """ 66 | sentence order prediction. 67 | 68 | check method of create_instances_from_document_albert from create_pretrining_data.py 69 | 70 | :return: 71 | """ 72 | # 添加运行权限 73 | os.system("chmod +x create_pretrain_data.sh") 74 | 75 | os.system("./create_pretrain_data.sh") 76 | 77 | 78 | # 1.test of Factorized embedding parameterization 79 | #test_factorized_embedding() 80 | 81 | # 2. test of share parameters across all layers: how many parameter after share parameter across layers of transformer. 82 | # before share parameter: 125,976,576; after share parameter: 83 | #test_share_parameters() 84 | 85 | # 3. test of sentence order prediction(SOP) 86 | test_sentence_order_prediction() 87 | 88 | -------------------------------------------------------------------------------- /bert_task/ner_task/metrics.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | 定义性能指标函数 4 | """ 5 | 6 | 7 | def mean(item): 8 | return sum(item) / len(item) 9 | 10 | 11 | def get_chunk_type(index, index_to_label): 12 | """ 13 | 对实体的标签进行分割,返回实体的位置和实体的名称 14 | """ 15 | label_name = index_to_label[index] 16 | label_class, label_type = label_name.split("-") 17 | 18 | return label_name, label_class, label_type 19 | 20 | 21 | def get_chunk(sequence, label_to_index): 22 | """ 23 | 给定一个标注序列,将实体和位置组合起来,放置在一个列表中 24 | """ 25 | unentry = [label_to_index["o"]] 26 | index_to_label = {index: label for label, index in label_to_index.items()} 27 | chunks = [] 28 | chunk_type, chunk_start = None, None 29 | for index, label in enumerate(sequence): 30 | if label in unentry: 31 | # 如果非实体词 32 | if chunk_type is None: 33 | # 若chunk_type为None,表明上一个词是非实体,继续跳过 34 | continue 35 | else: 36 | # 若chunkType非None,则上面的是一个实体,而当前非实体,则将上一个实体chunk加入到chunks中 37 | # 主要为序列中的这种情况,O,B-PER,I-PER,O 这也是最常见的情况 38 | chunk = (chunk_type, chunk_start, index-1) 39 | chunks.append(chunk) 40 | chunk_type, chunk_start = None, None 41 | 42 | if label not in unentry: 43 | # 如果是实体词,在这里的label是索引表示的label 44 | label_name, label_chunk_class, label_chunk_type = get_chunk_type(label, index_to_label) 45 | if chunk_type is None: 46 | # 若当前chunk_type为None,则表明上一个词是非实体词 47 | chunk_type, chunk_start = label_chunk_type, index 48 | elif label_chunk_type == chunk_type: 49 | # 若实体类型和上一个相同,则做如下判断 50 | if index == (len(sequence) - 1): 51 | # 若当前词是序列中的最后一个词,则直接返回chunk 52 | chunk = (chunk_type, chunk_start, index) 53 | chunks.append(chunk) 54 | 55 | # 若出现两个相同的实体连在一块,则做如下操作 56 | elif label_chunk_class == "B": 57 | chunk = (chunk_type, chunk_start, index - 1) 58 | chunks.append(chunk) 59 | chunk_type, chunk_start = label_chunk_type, index 60 | else: 61 | # 若当前非最后一个词,则跳过 62 | continue 63 | elif label_chunk_type != chunk_type: 
64 | # 若当前词和上一个词类型不同,则将上一个实体chunk加入到chunks中,接着继续下一个chunk 65 | # 主要体现在两个实体相连的序列中,如B-PER,I-PER,B-LOC,I-LOC 66 | chunk = (chunk_type, chunk_start, index-1) 67 | chunks.append(chunk) 68 | chunk_type, chunk_start = label_chunk_type, index 69 | 70 | return chunks 71 | 72 | 73 | def gen_metrics(true_y, pred_y, label_to_index): 74 | """ 75 | 生成f1值,recall, precision 76 | precision = 识别的正确实体数/识别出的实体数 77 | recall = 识别的正确实体数/样本的实体数 78 | """ 79 | correct_preds = 0 # 识别出的正确实体数 80 | all_preds = 0 # 识别出的实体数 81 | all_trues = 0 # 样本的真实实体数 82 | 83 | true_chunks = get_chunk(true_y.tolist(), label_to_index) 84 | pred_chunks = get_chunk(pred_y.tolist(), label_to_index) 85 | correct_preds += len(set(true_chunks) & set(pred_chunks)) 86 | all_preds += len(pred_chunks) 87 | all_trues += len(true_chunks) 88 | 89 | precision = correct_preds / all_preds if correct_preds > 0 else 0 90 | recall = correct_preds / all_trues if correct_preds > 0 else 0 91 | f1 = 2 * precision * recall / (precision + recall) if correct_preds > 0 else 0 92 | 93 | return round(f1, 4), round(precision, 4), round(recall, 4) -------------------------------------------------------------------------------- /albert_task/ner_task/metrics.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | 定义性能指标函数 4 | """ 5 | 6 | 7 | def mean(item): 8 | return sum(item) / len(item) 9 | 10 | 11 | def get_chunk_type(index, index_to_label): 12 | """ 13 | 对实体的标签进行分割,返回实体的位置和实体的名称 14 | """ 15 | label_name = index_to_label[index] 16 | label_class, label_type = label_name.split("-") 17 | 18 | return label_name, label_class, label_type 19 | 20 | 21 | def get_chunk(sequence, label_to_index): 22 | """ 23 | 给定一个标注序列,将实体和位置组合起来,放置在一个列表中 24 | """ 25 | unentry = [label_to_index["o"]] 26 | index_to_label = {index: label for label, index in label_to_index.items()} 27 | chunks = [] 28 | chunk_type, chunk_start = None, None 29 | for index, label in enumerate(sequence): 30 | if label in unentry: 31 | # 如果非实体词 32 | if chunk_type is None: 33 | # 若chunk_type为None,表明上一个词是非实体,继续跳过 34 | continue 35 | else: 36 | # 若chunkType非None,则上面的是一个实体,而当前非实体,则将上一个实体chunk加入到chunks中 37 | # 主要为序列中的这种情况,O,B-PER,I-PER,O 这也是最常见的情况 38 | chunk = (chunk_type, chunk_start, index-1) 39 | chunks.append(chunk) 40 | chunk_type, chunk_start = None, None 41 | 42 | if label not in unentry: 43 | # 如果是实体词,在这里的label是索引表示的label 44 | label_name, label_chunk_class, label_chunk_type = get_chunk_type(label, index_to_label) 45 | if chunk_type is None: 46 | # 若当前chunk_type为None,则表明上一个词是非实体词 47 | chunk_type, chunk_start = label_chunk_type, index 48 | elif label_chunk_type == chunk_type: 49 | # 若实体类型和上一个相同,则做如下判断 50 | if index == (len(sequence) - 1): 51 | # 若当前词是序列中的最后一个词,则直接返回chunk 52 | chunk = (chunk_type, chunk_start, index) 53 | chunks.append(chunk) 54 | 55 | # 若出现两个相同的实体连在一块,则做如下操作 56 | elif label_chunk_class == "B": 57 | chunk = (chunk_type, chunk_start, index - 1) 58 | chunks.append(chunk) 59 | chunk_type, chunk_start = label_chunk_type, index 60 | else: 61 | # 若当前非最后一个词,则跳过 62 | continue 63 | elif label_chunk_type != chunk_type: 64 | # 若当前词和上一个词类型不同,则将上一个实体chunk加入到chunks中,接着继续下一个chunk 65 | # 主要体现在两个实体相连的序列中,如B-PER,I-PER,B-LOC,I-LOC 66 | chunk = (chunk_type, chunk_start, index-1) 67 | chunks.append(chunk) 68 | chunk_type, chunk_start = label_chunk_type, index 69 | 70 | return chunks 71 | 72 | 73 | def gen_metrics(true_y, pred_y, label_to_index): 74 | """ 75 | 生成f1值,recall, precision 76 | precision = 识别的正确实体数/识别出的实体数 77 | recall = 识别的正确实体数/样本的实体数 78 | """ 79 | correct_preds = 0 
# 识别出的正确实体数 80 | all_preds = 0 # 识别出的实体数 81 | all_trues = 0 # 样本的真实实体数 82 | 83 | true_chunks = get_chunk(true_y.tolist(), label_to_index) 84 | pred_chunks = get_chunk(pred_y.tolist(), label_to_index) 85 | correct_preds += len(set(true_chunks) & set(pred_chunks)) 86 | all_preds += len(pred_chunks) 87 | all_trues += len(true_chunks) 88 | 89 | precision = correct_preds / all_preds if correct_preds > 0 else 0 90 | recall = correct_preds / all_trues if correct_preds > 0 else 0 91 | f1 = 2 * precision * recall / (precision + recall) if correct_preds > 0 else 0 92 | 93 | return round(f1, 4), round(precision, 4), round(recall, 4) -------------------------------------------------------------------------------- /bert_task/classifier_task/predict.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | sys.path.append(os.path.abspath(os.path.dirname(os.getcwd()))) 5 | 6 | import tensorflow as tf 7 | from model import BertClassifier 8 | from bert import tokenization 9 | 10 | 11 | class Predictor(object): 12 | def __init__(self, config): 13 | self.model = None 14 | self.config = config 15 | 16 | self.output_path = config["output_path"] 17 | self.vocab_path = os.path.join(config["bert_model_path"], "vocab.txt") 18 | self.label_to_index = self.load_vocab() 19 | self.index_to_label = {value: key for key, value in self.label_to_index.items()} 20 | self.word_vectors = None 21 | self.sequence_length = self.config["sequence_length"] 22 | 23 | # 创建模型 24 | self.create_model() 25 | # 加载计算图 26 | self.load_graph() 27 | 28 | def load_vocab(self): 29 | # 将词汇-索引映射表加载出来 30 | 31 | with open(os.path.join(self.output_path, "label_to_index.json"), "r") as f: 32 | label_to_index = json.load(f) 33 | 34 | return label_to_index 35 | 36 | def padding(self, input_id, input_mask, segment_id): 37 | """ 38 | 对序列进行补全 39 | :param input_id: 40 | :param input_mask: 41 | :param segment_id: 42 | :return: 43 | """ 44 | 45 | if len(input_id) < self.sequence_length: 46 | pad_input_id = input_id + [0] * (self.sequence_length - len(input_id)) 47 | pad_input_mask = input_mask + [0] * (self.sequence_length - len(input_mask)) 48 | pad_segment_id = segment_id + [0] * (self.sequence_length - len(segment_id)) 49 | else: 50 | pad_input_id = input_id[:self.sequence_length] 51 | pad_input_mask = input_mask[:self.sequence_length] 52 | pad_segment_id = segment_id[:self.sequence_length] 53 | 54 | return pad_input_id, pad_input_mask, pad_segment_id 55 | 56 | def sentence_to_idx(self, text): 57 | """ 58 | 将分词后的句子转换成idx表示 59 | :return: 60 | """ 61 | tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_path, do_lower_case=True) 62 | 63 | text = tokenization.convert_to_unicode(text) 64 | tokens = tokenizer.tokenize(text) 65 | tokens = ["[CLS]"] + tokens + ["[SEP]"] 66 | input_id = tokenizer.convert_tokens_to_ids(tokens) 67 | input_mask = [1] * len(input_id) 68 | segment_id = [0] * len(input_id) 69 | 70 | input_id, input_mask, segment_id = self.padding(input_id, input_mask, segment_id) 71 | 72 | return [input_id], [input_mask], [segment_id] 73 | 74 | def load_graph(self): 75 | """ 76 | 加载计算图 77 | :return: 78 | """ 79 | self.sess = tf.Session() 80 | ckpt = tf.train.get_checkpoint_state(self.config["ckpt_model_path"]) 81 | if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path): 82 | print('Reloading model parameters..') 83 | self.model.saver.restore(self.sess, ckpt.model_checkpoint_path) 84 | else: 85 | raise ValueError('No such 
file:[{}]'.format(self.config["ckpt_model_path"])) 86 | 87 | def create_model(self): 88 | """ 89 | 根据config文件选择对应的模型,并初始化 90 | :return: 91 | """ 92 | self.model = BertClassifier(config=self.config, is_training=False) 93 | 94 | def predict(self, text): 95 | """ 96 | 给定分词后的句子,预测其分类结果 97 | :param text: 98 | :return: 99 | """ 100 | input_ids, input_masks, segment_ids = self.sentence_to_idx(text) 101 | 102 | prediction = self.model.infer(self.sess, 103 | dict(input_ids=input_ids, 104 | input_masks=input_masks, 105 | segment_ids=segment_ids)).tolist()[0] 106 | label = self.index_to_label[prediction] 107 | return label 108 | 109 | 110 | -------------------------------------------------------------------------------- /albert_task/classifier_task/predict.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | sys.path.append(os.path.abspath(os.path.dirname(os.getcwd()))) 5 | 6 | import tensorflow as tf 7 | from model import AlbertClassifier 8 | from albert import tokenization 9 | 10 | 11 | class Predictor(object): 12 | def __init__(self, config): 13 | self.model = None 14 | self.config = config 15 | 16 | self.output_path = config["output_path"] 17 | self.vocab_path = os.path.join(config["bert_model_path"], "vocab.txt") 18 | self.label_to_index = self.load_vocab() 19 | self.index_to_label = {value: key for key, value in self.label_to_index.items()} 20 | self.word_vectors = None 21 | self.sequence_length = self.config["sequence_length"] 22 | 23 | # 创建模型 24 | self.create_model() 25 | # 加载计算图 26 | self.load_graph() 27 | 28 | def load_vocab(self): 29 | # 将词汇-索引映射表加载出来 30 | 31 | with open(os.path.join(self.output_path, "label_to_index.json"), "r") as f: 32 | label_to_index = json.load(f) 33 | 34 | return label_to_index 35 | 36 | def padding(self, input_id, input_mask, segment_id): 37 | """ 38 | 对序列进行补全 39 | :param input_id: 40 | :param input_mask: 41 | :param segment_id: 42 | :return: 43 | """ 44 | 45 | if len(input_id) < self.sequence_length: 46 | pad_input_id = input_id + [0] * (self.sequence_length - len(input_id)) 47 | pad_input_mask = input_mask + [0] * (self.sequence_length - len(input_mask)) 48 | pad_segment_id = segment_id + [0] * (self.sequence_length - len(segment_id)) 49 | else: 50 | pad_input_id = input_id[:self.sequence_length] 51 | pad_input_mask = input_mask[:self.sequence_length] 52 | pad_segment_id = segment_id[:self.sequence_length] 53 | 54 | return pad_input_id, pad_input_mask, pad_segment_id 55 | 56 | def sentence_to_idx(self, text): 57 | """ 58 | 将分词后的句子转换成idx表示 59 | :return: 60 | """ 61 | tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_path, do_lower_case=True) 62 | 63 | text = tokenization.convert_to_unicode(text) 64 | tokens = tokenizer.tokenize(text) 65 | tokens = ["[CLS]"] + tokens + ["[SEP]"] 66 | input_id = tokenizer.convert_tokens_to_ids(tokens) 67 | input_mask = [1] * len(input_id) 68 | segment_id = [0] * len(input_id) 69 | 70 | input_id, input_mask, segment_id = self.padding(input_id, input_mask, segment_id) 71 | 72 | return [input_id], [input_mask], [segment_id] 73 | 74 | def load_graph(self): 75 | """ 76 | 加载计算图 77 | :return: 78 | """ 79 | self.sess = tf.Session() 80 | ckpt = tf.train.get_checkpoint_state(self.config["ckpt_model_path"]) 81 | if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path): 82 | print('Reloading model parameters..') 83 | self.model.saver.restore(self.sess, ckpt.model_checkpoint_path) 84 | else: 85 | raise ValueError('No such 
file:[{}]'.format(self.config["ckpt_model_path"])) 86 | 87 | def create_model(self): 88 | """ 89 | 根据config文件选择对应的模型,并初始化 90 | :return: 91 | """ 92 | self.model = AlbertClassifier(config=self.config, is_training=False) 93 | 94 | def predict(self, text): 95 | """ 96 | 给定分词后的句子,预测其分类结果 97 | :param text: 98 | :return: 99 | """ 100 | input_ids, input_masks, segment_ids = self.sentence_to_idx(text) 101 | 102 | prediction = self.model.infer(self.sess, 103 | dict(input_ids=input_ids, 104 | input_masks=input_masks, 105 | segment_ids=segment_ids)).tolist()[0] 106 | label = self.index_to_label[prediction] 107 | return label 108 | 109 | 110 | -------------------------------------------------------------------------------- /bert_task/ner_task/predict.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | sys.path.append(os.path.abspath(os.path.dirname(os.getcwd()))) 5 | 6 | import tensorflow as tf 7 | from model import BertNer 8 | from bert import tokenization 9 | from metrics import get_chunk 10 | 11 | 12 | class Predictor(object): 13 | def __init__(self, config): 14 | self.model = None 15 | self.config = config 16 | 17 | self.output_path = config["output_path"] 18 | self.vocab_path = os.path.join(config["bert_model_path"], "vocab.txt") 19 | self.label_to_index = self.load_vocab() 20 | self.word_vectors = None 21 | self.sequence_length = self.config["sequence_length"] 22 | 23 | # 创建模型 24 | self.create_model() 25 | # 加载计算图 26 | self.load_graph() 27 | 28 | def load_vocab(self): 29 | # 将词汇-索引映射表加载出来 30 | 31 | with open(os.path.join(self.output_path, "label_to_index.json"), "r") as f: 32 | label_to_index = json.load(f) 33 | 34 | return label_to_index 35 | 36 | def padding(self, input_id, input_mask, segment_id): 37 | """ 38 | 对序列进行补全 39 | :param input_id: 40 | :param input_mask: 41 | :param segment_id: 42 | :return: 43 | """ 44 | 45 | if len(input_id) < self.sequence_length: 46 | pad_input_id = input_id + [0] * (self.sequence_length - len(input_id)) 47 | pad_input_mask = input_mask + [0] * (self.sequence_length - len(input_mask)) 48 | pad_segment_id = segment_id + [0] * (self.sequence_length - len(segment_id)) 49 | sequence_len = len(input_id) 50 | else: 51 | pad_input_id = input_id[:self.sequence_length] 52 | pad_input_mask = input_mask[:self.sequence_length] 53 | pad_segment_id = segment_id[:self.sequence_length] 54 | sequence_len = self.sequence_length 55 | 56 | return pad_input_id, pad_input_mask, pad_segment_id, sequence_len 57 | 58 | def sentence_to_idx(self, text): 59 | """ 60 | 将分词后的句子转换成idx表示 61 | :return: 62 | """ 63 | tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_path, do_lower_case=True) 64 | 65 | tokens = [] 66 | for token in text: 67 | token = tokenizer.tokenize(token) 68 | tokens.extend(token) 69 | 70 | tokens = ["[CLS]"] + tokens + ["[SEP]"] 71 | input_id = tokenizer.convert_tokens_to_ids(tokens) 72 | 73 | input_mask = [1] * len(input_id) 74 | segment_id = [0] * len(input_id) 75 | 76 | input_id, input_mask, segment_id, sequence_len = self.padding(input_id, input_mask, segment_id) 77 | 78 | return [input_id], [input_mask], [segment_id], [sequence_len] 79 | 80 | def load_graph(self): 81 | """ 82 | 加载计算图 83 | :return: 84 | """ 85 | self.sess = tf.Session() 86 | ckpt = tf.train.get_checkpoint_state(self.config["ckpt_model_path"]) 87 | if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path): 88 | print('Reloading model parameters..') 89 | self.model.saver.restore(self.sess, 
ckpt.model_checkpoint_path) 90 | else: 91 | raise ValueError('No such file:[{}]'.format(self.config["ckpt_model_path"])) 92 | 93 | def create_model(self): 94 | """ 95 | 根据config文件选择对应的模型,并初始化 96 | :return: 97 | """ 98 | self.model = BertNer(config=self.config, is_training=False) 99 | 100 | def predict(self, text): 101 | """ 102 | 给定分词后的句子,预测其分类结果 103 | :param text: 104 | :return: 105 | """ 106 | input_ids, input_masks, segment_ids, sequence_len = self.sentence_to_idx(text) 107 | 108 | prediction = self.model.infer(self.sess, 109 | dict(input_ids=input_ids, 110 | input_masks=input_masks, 111 | segment_ids=segment_ids, 112 | sequence_len=sequence_len)).tolist() 113 | print(prediction) 114 | chunks = get_chunk(prediction, self.label_to_index) 115 | return chunks 116 | 117 | 118 | -------------------------------------------------------------------------------- /albert_task/ner_task/predict.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | sys.path.append(os.path.abspath(os.path.dirname(os.getcwd()))) 5 | 6 | import tensorflow as tf 7 | from model import ALBertNer 8 | from albert import tokenization 9 | from metrics import get_chunk 10 | 11 | 12 | class Predictor(object): 13 | def __init__(self, config): 14 | self.model = None 15 | self.config = config 16 | 17 | self.output_path = config["output_path"] 18 | self.vocab_path = os.path.join(config["bert_model_path"], "vocab.txt") 19 | self.label_to_index = self.load_vocab() 20 | self.word_vectors = None 21 | self.sequence_length = self.config["sequence_length"] 22 | 23 | # 创建模型 24 | self.create_model() 25 | # 加载计算图 26 | self.load_graph() 27 | 28 | def load_vocab(self): 29 | # 将词汇-索引映射表加载出来 30 | 31 | with open(os.path.join(self.output_path, "label_to_index.json"), "r") as f: 32 | label_to_index = json.load(f) 33 | 34 | return label_to_index 35 | 36 | def padding(self, input_id, input_mask, segment_id): 37 | """ 38 | 对序列进行补全 39 | :param input_id: 40 | :param input_mask: 41 | :param segment_id: 42 | :return: 43 | """ 44 | 45 | if len(input_id) < self.sequence_length: 46 | pad_input_id = input_id + [0] * (self.sequence_length - len(input_id)) 47 | pad_input_mask = input_mask + [0] * (self.sequence_length - len(input_mask)) 48 | pad_segment_id = segment_id + [0] * (self.sequence_length - len(segment_id)) 49 | sequence_len = len(input_id) 50 | else: 51 | pad_input_id = input_id[:self.sequence_length] 52 | pad_input_mask = input_mask[:self.sequence_length] 53 | pad_segment_id = segment_id[:self.sequence_length] 54 | sequence_len = self.sequence_length 55 | 56 | return pad_input_id, pad_input_mask, pad_segment_id, sequence_len 57 | 58 | def sentence_to_idx(self, text): 59 | """ 60 | 将分词后的句子转换成idx表示 61 | :return: 62 | """ 63 | tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_path, do_lower_case=True) 64 | 65 | tokens = [] 66 | for token in text: 67 | token = tokenizer.tokenize(token) 68 | tokens.extend(token) 69 | 70 | tokens = ["[CLS]"] + tokens + ["[SEP]"] 71 | input_id = tokenizer.convert_tokens_to_ids(tokens) 72 | 73 | input_mask = [1] * len(input_id) 74 | segment_id = [0] * len(input_id) 75 | 76 | input_id, input_mask, segment_id, sequence_len = self.padding(input_id, input_mask, segment_id) 77 | 78 | return [input_id], [input_mask], [segment_id], [sequence_len] 79 | 80 | def load_graph(self): 81 | """ 82 | 加载计算图 83 | :return: 84 | """ 85 | self.sess = tf.Session() 86 | ckpt = tf.train.get_checkpoint_state(self.config["ckpt_model_path"]) 87 | if ckpt and 
tf.train.checkpoint_exists(ckpt.model_checkpoint_path): 88 | print('Reloading model parameters..') 89 | self.model.saver.restore(self.sess, ckpt.model_checkpoint_path) 90 | else: 91 | raise ValueError('No such file:[{}]'.format(self.config["ckpt_model_path"])) 92 | 93 | def create_model(self): 94 | """ 95 | 根据config文件选择对应的模型,并初始化 96 | :return: 97 | """ 98 | self.model = ALBertNer(config=self.config, is_training=False) 99 | 100 | def predict(self, text): 101 | """ 102 | 给定分词后的句子,预测其分类结果 103 | :param text: 104 | :return: 105 | """ 106 | input_ids, input_masks, segment_ids, sequence_len = self.sentence_to_idx(text) 107 | 108 | prediction = self.model.infer(self.sess, 109 | dict(input_ids=input_ids, 110 | input_masks=input_masks, 111 | segment_ids=segment_ids, 112 | sequence_len=sequence_len)).tolist() 113 | print(prediction) 114 | chunks = get_chunk(prediction, self.label_to_index) 115 | return chunks 116 | 117 | 118 | -------------------------------------------------------------------------------- /bert_task/bert/sample_text.txt: -------------------------------------------------------------------------------- 1 | This text is included to make sure Unicode is handled properly: 力加勝北区ᴵᴺᵀᵃছজটডণত 2 | Text should be one-sentence-per-line, with empty lines between documents. 3 | This sample text is public domain and was randomly selected from Project Guttenberg. 4 | 5 | The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors. 6 | Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity. 7 | Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them. 8 | "Cass" Beard had risen early that morning, but not with a view to discovery. 9 | A leak in his cabin roof,--quite consistent with his careless, improvident habits,--had roused him at 4 A. M., with a flooded "bunk" and wet blankets. 10 | The chips from his wood pile refused to kindle a fire to dry his bed-clothes, and he had recourse to a more provident neighbor's to supply the deficiency. 11 | This was nearly opposite. 12 | Mr. Cassius crossed the highway, and stopped suddenly. 13 | Something glittered in the nearest red pool before him. 14 | Gold, surely! 15 | But, wonderful to relate, not an irregular, shapeless fragment of crude ore, fresh from Nature's crucible, but a bit of jeweler's handicraft in the form of a plain gold ring. 16 | Looking at it more attentively, he saw that it bore the inscription, "May to Cass." 17 | Like most of his fellow gold-seekers, Cass was superstitious. 18 | 19 | The fountain of classic wisdom, Hypatia herself. 20 | As the ancient sage--the name is unimportant to a monk--pumped water nightly that he might study by day, so I, the guardian of cloaks and parasols, at the sacred doors of her lecture-room, imbibe celestial knowledge. 21 | From my youth I felt in me a soul above the matter-entangled herd. 22 | She revealed to me the glorious fact, that I am a spark of Divinity itself. 23 | A fallen star, I am, sir!' 
continued he, pensively, stroking his lean stomach--'a fallen star!--fallen, if the dignity of philosophy will allow of the simile, among the hogs of the lower world--indeed, even into the hog-bucket itself. Well, after all, I will show you the way to the Archbishop's. 24 | There is a philosophic pleasure in opening one's treasures to the modest young. 25 | Perhaps you will assist me by carrying this basket of fruit?' And the little man jumped up, put his basket on Philammon's head, and trotted off up a neighbouring street. 26 | Philammon followed, half contemptuous, half wondering at what this philosophy might be, which could feed the self-conceit of anything so abject as his ragged little apish guide; 27 | but the novel roar and whirl of the street, the perpetual stream of busy faces, the line of curricles, palanquins, laden asses, camels, elephants, which met and passed him, and squeezed him up steps and into doorways, as they threaded their way through the great Moon-gate into the ample street beyond, drove everything from his mind but wondering curiosity, and a vague, helpless dread of that great living wilderness, more terrible than any dead wilderness of sand which he had left behind. 28 | Already he longed for the repose, the silence of the Laura--for faces which knew him and smiled upon him; but it was too late to turn back now. 29 | His guide held on for more than a mile up the great main street, crossed in the centre of the city, at right angles, by one equally magnificent, at each end of which, miles away, appeared, dim and distant over the heads of the living stream of passengers, the yellow sand-hills of the desert; 30 | while at the end of the vista in front of them gleamed the blue harbour, through a network of countless masts. 31 | At last they reached the quay at the opposite end of the street; 32 | and there burst on Philammon's astonished eyes a vast semicircle of blue sea, ringed with palaces and towers. 33 | He stopped involuntarily; and his little guide stopped also, and looked askance at the young monk, to watch the effect which that grand panorama should produce on him. 
34 | -------------------------------------------------------------------------------- /bert_task/classifier_task/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.append(os.path.dirname(os.getcwd())) 4 | import tensorflow as tf 5 | 6 | from bert import modeling 7 | from bert import optimization 8 | 9 | 10 | class BertClassifier(object): 11 | def __init__(self, config, is_training=True, num_train_step=None, num_warmup_step=None): 12 | self.__bert_config_path = os.path.join(config["bert_model_path"], "bert_config.json") 13 | self.__num_classes = config["num_classes"] 14 | self.__learning_rate = config["learning_rate"] 15 | self.__is_training = is_training 16 | self.__num_train_step = num_train_step 17 | self.__num_warmup_step = num_warmup_step 18 | 19 | self.input_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_ids') 20 | self.input_masks = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_mask') 21 | self.segment_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name='segment_ids') 22 | self.label_ids = tf.placeholder(dtype=tf.int32, shape=[None], name="label_ids") 23 | 24 | self.built_model() 25 | self.init_saver() 26 | 27 | def built_model(self): 28 | bert_config = modeling.BertConfig.from_json_file(self.__bert_config_path) 29 | 30 | model = modeling.BertModel(config=bert_config, 31 | is_training=self.__is_training, 32 | input_ids=self.input_ids, 33 | input_mask=self.input_masks, 34 | token_type_ids=self.segment_ids, 35 | use_one_hot_embeddings=False) 36 | output_layer = model.get_pooled_output() 37 | 38 | hidden_size = output_layer.shape[-1].value 39 | if self.__is_training: 40 | # I.e., 0.1 dropout 41 | output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) 42 | 43 | with tf.name_scope("output"): 44 | output_weights = tf.get_variable( 45 | "output_weights", [self.__num_classes, hidden_size], 46 | initializer=tf.truncated_normal_initializer(stddev=0.02)) 47 | 48 | output_bias = tf.get_variable( 49 | "output_bias", [self.__num_classes], initializer=tf.zeros_initializer()) 50 | 51 | logits = tf.matmul(output_layer, output_weights, transpose_b=True) 52 | logits = tf.nn.bias_add(logits, output_bias) 53 | self.predictions = tf.argmax(logits, axis=-1, name="predictions") 54 | 55 | if self.__is_training: 56 | 57 | with tf.name_scope("loss"): 58 | losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=self.label_ids) 59 | self.loss = tf.reduce_mean(losses, name="loss") 60 | 61 | with tf.name_scope('train_op'): 62 | self.train_op = optimization.create_optimizer( 63 | self.loss, self.__learning_rate, self.__num_train_step, self.__num_warmup_step, use_tpu=False) 64 | 65 | def init_saver(self): 66 | self.saver = tf.train.Saver(tf.global_variables()) 67 | 68 | def train(self, sess, batch): 69 | """ 70 | 训练模型 71 | :param sess: tf的会话对象 72 | :param batch: batch数据 73 | :return: 损失和预测结果 74 | """ 75 | 76 | feed_dict = {self.input_ids: batch["input_ids"], 77 | self.input_masks: batch["input_masks"], 78 | self.segment_ids: batch["segment_ids"], 79 | self.label_ids: batch["label_ids"]} 80 | 81 | # 训练模型 82 | _, loss, predictions = sess.run([self.train_op, self.loss, self.predictions], feed_dict=feed_dict) 83 | return loss, predictions 84 | 85 | def eval(self, sess, batch): 86 | """ 87 | 验证模型 88 | :param sess: tf中的会话对象 89 | :param batch: batch数据 90 | :return: 损失和预测结果 91 | """ 92 | feed_dict = {self.input_ids: batch["input_ids"], 93 | self.input_masks: 
batch["input_masks"], 94 | self.segment_ids: batch["segment_ids"], 95 | self.label_ids: batch["label_ids"]} 96 | 97 | loss, predictions = sess.run([self.loss, self.predictions], feed_dict=feed_dict) 98 | return loss, predictions 99 | 100 | def infer(self, sess, batch): 101 | """ 102 | 预测新数据 103 | :param sess: tf中的会话对象 104 | :param batch: batch数据 105 | :return: 预测结果 106 | """ 107 | feed_dict = {self.input_ids: batch["input_ids"], 108 | self.input_masks: batch["input_masks"], 109 | self.segment_ids: batch["segment_ids"]} 110 | 111 | predict = sess.run(self.predictions, feed_dict=feed_dict) 112 | 113 | return predict 114 | -------------------------------------------------------------------------------- /bert_task/ltr_pair_task/trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import random 4 | import argparse 5 | import sys 6 | 7 | sys.path.append(os.path.dirname(os.getcwd())) 8 | import tensorflow as tf 9 | from bert import modeling 10 | from model import BertPairLTR 11 | from data_helper import TrainData 12 | from metrics import mean, accuracy 13 | 14 | 15 | class Trainer(object): 16 | def __init__(self, args): 17 | self.args = args 18 | with open(args.config_path, "r") as fr: 19 | self.config = json.load(fr) 20 | self.__bert_checkpoint_path = os.path.join(self.config["bert_model_path"], "bert_model.ckpt") 21 | 22 | # 加载数据集 23 | self.data_obj = self.load_data() 24 | self.queries = self.data_obj.gen_data(self.config["data"]) 25 | 26 | print("train data size: {}".format(len(self.queries))) 27 | 28 | num_train_steps = int(self.config["train_n_tasks"] / self.config["batch_size"] * self.config["epochs"]) 29 | num_warmup_steps = int(num_train_steps * self.config["warmup_rate"]) 30 | # 初始化模型对象 31 | self.model = self.create_model(num_train_steps, num_warmup_steps) 32 | 33 | def load_data(self): 34 | """ 35 | 创建数据对象 36 | :return: 37 | """ 38 | # 生成训练集对象并生成训练数据 39 | data_obj = TrainData(self.config) 40 | return data_obj 41 | 42 | def create_model(self, num_train_step, num_warmup_step): 43 | """ 44 | 根据config文件选择对应的模型,并初始化 45 | :return: 46 | """ 47 | model = BertPairLTR(config=self.config, num_train_step=num_train_step, num_warmup_step=num_warmup_step) 48 | return model 49 | 50 | def train(self): 51 | with tf.Session() as sess: 52 | tvars = tf.trainable_variables() 53 | (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint( 54 | tvars, self.__bert_checkpoint_path) 55 | print("init bert model params") 56 | tf.train.init_from_checkpoint(self.__bert_checkpoint_path, assignment_map) 57 | print("init bert model params done") 58 | sess.run(tf.variables_initializer(tf.global_variables())) 59 | 60 | current_step = 0 61 | 62 | for epoch in range(self.config["epochs"]): 63 | print("----- Epoch {}/{} -----".format(epoch + 1, self.config["epochs"])) 64 | t_in_ids_a, t_in_masks_a, t_seg_ids_a, t_in_ids_b, t_in_masks_b, t_seg_ids_b = \ 65 | self.data_obj.gen_task_samples(self.queries, self.config["train_n_tasks"]) 66 | 67 | for batch in self.data_obj.next_batch(t_in_ids_a, t_in_masks_a, t_seg_ids_a, 68 | t_in_ids_b, t_in_masks_b, t_seg_ids_b): 69 | loss, predictions = self.model.train(sess, batch) 70 | acc = accuracy(predictions) 71 | print("train: step: {}, loss: {}, acc: {}".format(current_step, loss, acc)) 72 | 73 | current_step += 1 74 | if self.data_obj and current_step % self.config["checkpoint_every"] == 0: 75 | e_in_ids_a, e_in_masks_a, e_seg_ids_a, e_in_ids_b, e_in_masks_b, e_seg_ids_b = \ 76 | 
self.data_obj.gen_task_samples(self.queries, self.config["eval_n_tasks"]) 77 | eval_losses = [] 78 | eval_accs = [] 79 | 80 | for eval_batch in self.data_obj.next_batch(e_in_ids_a, e_in_masks_a, e_seg_ids_a, 81 | e_in_ids_b, e_in_masks_b, e_seg_ids_b): 82 | eval_loss, eval_predictions = self.model.eval(sess, eval_batch) 83 | 84 | eval_losses.append(eval_loss) 85 | 86 | acc = accuracy(eval_predictions) 87 | eval_accs.append(acc) 88 | 89 | print("\n") 90 | print("eval: loss: {}, acc: {}".format(mean(eval_losses), mean(eval_accs))) 91 | print("\n") 92 | 93 | if self.config["ckpt_model_path"]: 94 | save_path = self.config["ckpt_model_path"] 95 | if not os.path.exists(save_path): 96 | os.makedirs(save_path) 97 | model_save_path = os.path.join(save_path, self.config["model_name"]) 98 | self.model.saver.save(sess, model_save_path, global_step=current_step) 99 | 100 | 101 | if __name__ == "__main__": 102 | # 读取用户在命令行输入的信息 103 | parser = argparse.ArgumentParser() 104 | parser.add_argument("--config_path", help="config path of model") 105 | args = parser.parse_args() 106 | trainer = Trainer(args) 107 | trainer.train() 108 | -------------------------------------------------------------------------------- /albert_task/ltr_pair_task/trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import random 4 | import argparse 5 | import sys 6 | 7 | sys.path.append(os.path.dirname(os.getcwd())) 8 | import tensorflow as tf 9 | from albert import modeling 10 | from model import ALBertPairLTR 11 | from data_helper import TrainData 12 | from metrics import mean, accuracy 13 | 14 | 15 | class Trainer(object): 16 | def __init__(self, args): 17 | self.args = args 18 | with open(args.config_path, "r") as fr: 19 | self.config = json.load(fr) 20 | self.__bert_checkpoint_path = os.path.join(self.config["bert_model_path"], "albert_model.ckpt") 21 | 22 | # 加载数据集 23 | self.data_obj = self.load_data() 24 | self.queries = self.data_obj.gen_data(self.config["data"]) 25 | 26 | print("train data size: {}".format(len(self.queries))) 27 | 28 | num_train_steps = int(self.config["train_n_tasks"] / self.config["batch_size"] * self.config["epochs"]) 29 | num_warmup_steps = int(num_train_steps * self.config["warmup_rate"]) 30 | # 初始化模型对象 31 | self.model = self.create_model(num_train_steps, num_warmup_steps) 32 | 33 | def load_data(self): 34 | """ 35 | 创建数据对象 36 | :return: 37 | """ 38 | # 生成训练集对象并生成训练数据 39 | data_obj = TrainData(self.config) 40 | return data_obj 41 | 42 | def create_model(self, num_train_step, num_warmup_step): 43 | """ 44 | 根据config文件选择对应的模型,并初始化 45 | :return: 46 | """ 47 | model = ALBertPairLTR(config=self.config, num_train_step=num_train_step, num_warmup_step=num_warmup_step) 48 | return model 49 | 50 | def train(self): 51 | with tf.Session() as sess: 52 | tvars = tf.trainable_variables() 53 | (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint( 54 | tvars, self.__bert_checkpoint_path) 55 | print("init bert model params") 56 | tf.train.init_from_checkpoint(self.__bert_checkpoint_path, assignment_map) 57 | print("init bert model params done") 58 | sess.run(tf.variables_initializer(tf.global_variables())) 59 | 60 | current_step = 0 61 | 62 | for epoch in range(self.config["epochs"]): 63 | print("----- Epoch {}/{} -----".format(epoch + 1, self.config["epochs"])) 64 | t_in_ids_a, t_in_masks_a, t_seg_ids_a, t_in_ids_b, t_in_masks_b, t_seg_ids_b = \ 65 | self.data_obj.gen_task_samples(self.queries, 
self.config["train_n_tasks"]) 66 | 67 | for batch in self.data_obj.next_batch(t_in_ids_a, t_in_masks_a, t_seg_ids_a, 68 | t_in_ids_b, t_in_masks_b, t_seg_ids_b): 69 | loss, predictions = self.model.train(sess, batch) 70 | acc = accuracy(predictions) 71 | print("train: step: {}, loss: {}, acc: {}".format(current_step, loss, acc)) 72 | 73 | current_step += 1 74 | if self.data_obj and current_step % self.config["checkpoint_every"] == 0: 75 | e_in_ids_a, e_in_masks_a, e_seg_ids_a, e_in_ids_b, e_in_masks_b, e_seg_ids_b = \ 76 | self.data_obj.gen_task_samples(self.queries, self.config["eval_n_tasks"]) 77 | eval_losses = [] 78 | eval_accs = [] 79 | 80 | for eval_batch in self.data_obj.next_batch(e_in_ids_a, e_in_masks_a, e_seg_ids_a, 81 | e_in_ids_b, e_in_masks_b, e_seg_ids_b): 82 | eval_loss, eval_predictions = self.model.eval(sess, eval_batch) 83 | 84 | eval_losses.append(eval_loss) 85 | 86 | acc = accuracy(eval_predictions) 87 | eval_accs.append(acc) 88 | 89 | print("\n") 90 | print("eval: loss: {}, acc: {}".format(mean(eval_losses), mean(eval_accs))) 91 | print("\n") 92 | 93 | if self.config["ckpt_model_path"]: 94 | save_path = self.config["ckpt_model_path"] 95 | if not os.path.exists(save_path): 96 | os.makedirs(save_path) 97 | model_save_path = os.path.join(save_path, self.config["model_name"]) 98 | self.model.saver.save(sess, model_save_path, global_step=current_step) 99 | 100 | 101 | if __name__ == "__main__": 102 | # 读取用户在命令行输入的信息 103 | parser = argparse.ArgumentParser() 104 | parser.add_argument("--config_path", help="config path of model") 105 | args = parser.parse_args() 106 | trainer = Trainer(args) 107 | trainer.train() 108 | -------------------------------------------------------------------------------- /albert_task/ltr_point_task/metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | 定义各类性能指标 3 | """ 4 | from sklearn.metrics import roc_auc_score 5 | 6 | 7 | def mean(item: list) -> float: 8 | """ 9 | 计算列表中元素的平均值 10 | :param item: 列表对象 11 | :return: 12 | """ 13 | res = sum(item) / len(item) if len(item) > 0 else 0 14 | return res 15 | 16 | 17 | def accuracy(pred_y, true_y): 18 | """ 19 | 计算二类和多类的准确率 20 | :param pred_y: 预测结果 21 | :param true_y: 真实结果 22 | :return: 23 | """ 24 | if isinstance(pred_y[0], list): 25 | pred_y = [item[0] for item in pred_y] 26 | corr = 0 27 | for i in range(len(pred_y)): 28 | if pred_y[i] == true_y[i]: 29 | corr += 1 30 | acc = corr / len(pred_y) if len(pred_y) > 0 else 0 31 | return acc 32 | 33 | 34 | def binary_auc(pred_y, true_y): 35 | """ 36 | 二类别的auc值 37 | :param pred_y: 预测结果 38 | :param true_y: 真实结果 39 | :return: 40 | """ 41 | auc = roc_auc_score(true_y, pred_y) 42 | return auc 43 | 44 | 45 | def binary_precision(pred_y, true_y, positive=1): 46 | """ 47 | 二类的精确率计算 48 | :param pred_y: 预测结果 49 | :param true_y: 真实结果 50 | :param positive: 正例的索引表示 51 | :return: 52 | """ 53 | corr = 0 54 | pred_corr = 0 55 | for i in range(len(pred_y)): 56 | if pred_y[i] == positive: 57 | pred_corr += 1 58 | if pred_y[i] == true_y[i]: 59 | corr += 1 60 | 61 | prec = corr / pred_corr if pred_corr > 0 else 0 62 | return prec 63 | 64 | 65 | def binary_recall(pred_y, true_y, positive=1): 66 | """ 67 | 二类的召回率 68 | :param pred_y: 预测结果 69 | :param true_y: 真实结果 70 | :param positive: 正例的索引表示 71 | :return: 72 | """ 73 | corr = 0 74 | true_corr = 0 75 | for i in range(len(pred_y)): 76 | if true_y[i] == positive: 77 | true_corr += 1 78 | if pred_y[i] == true_y[i]: 79 | corr += 1 80 | 81 | rec = corr / true_corr if true_corr > 0 else 0 
82 | return rec 83 | 84 | 85 | def binary_f_beta(pred_y, true_y, beta=1.0, positive=1): 86 | """ 87 | 二类的f beta值 88 | :param pred_y: 预测结果 89 | :param true_y: 真实结果 90 | :param beta: beta值 91 | :param positive: 正例的索引表示 92 | :return: 93 | """ 94 | precision = binary_precision(pred_y, true_y, positive) 95 | recall = binary_recall(pred_y, true_y, positive) 96 | try: 97 | f_b = (1 + beta * beta) * precision * recall / (beta * beta * precision + recall) 98 | except: 99 | f_b = 0 100 | return f_b 101 | 102 | 103 | def multi_precision(pred_y, true_y, labels): 104 | """ 105 | 多类的精确率 106 | :param pred_y: 预测结果 107 | :param true_y: 真实结果 108 | :param labels: 标签列表 109 | :return: 110 | """ 111 | if isinstance(pred_y[0], list): 112 | pred_y = [item[0] for item in pred_y] 113 | 114 | precisions = [binary_precision(pred_y, true_y, label) for label in labels] 115 | prec = mean(precisions) 116 | return prec 117 | 118 | 119 | def multi_recall(pred_y, true_y, labels): 120 | """ 121 | 多类的召回率 122 | :param pred_y: 预测结果 123 | :param true_y: 真实结果 124 | :param labels: 标签列表 125 | :return: 126 | """ 127 | if isinstance(pred_y[0], list): 128 | pred_y = [item[0] for item in pred_y] 129 | 130 | recalls = [binary_recall(pred_y, true_y, label) for label in labels] 131 | rec = mean(recalls) 132 | return rec 133 | 134 | 135 | def multi_f_beta(pred_y, true_y, labels, beta=1.0): 136 | """ 137 | 多类的f beta值 138 | :param pred_y: 预测结果 139 | :param true_y: 真实结果 140 | :param labels: 标签列表 141 | :param beta: beta值 142 | :return: 143 | """ 144 | if isinstance(pred_y[0], list): 145 | pred_y = [item[0] for item in pred_y] 146 | 147 | f_betas = [binary_f_beta(pred_y, true_y, beta, label) for label in labels] 148 | f_beta = mean(f_betas) 149 | return f_beta 150 | 151 | 152 | def get_binary_metrics(pred_y, true_y, f_beta=1.0): 153 | """ 154 | 得到二分类的性能指标 155 | :param pred_y: 156 | :param true_y: 157 | :param f_beta: 158 | :return: 159 | """ 160 | acc = accuracy(pred_y, true_y) 161 | auc = binary_auc(pred_y, true_y) 162 | recall = binary_recall(pred_y, true_y) 163 | precision = binary_precision(pred_y, true_y) 164 | f_beta = binary_f_beta(pred_y, true_y, f_beta) 165 | return acc, auc, recall, precision, f_beta 166 | 167 | 168 | def get_multi_metrics(pred_y, true_y, labels, f_beta=1.0): 169 | """ 170 | 得到多分类的性能指标 171 | :param pred_y: 172 | :param true_y: 173 | :param labels: 174 | :param f_beta: 175 | :return: 176 | """ 177 | acc = accuracy(pred_y, true_y) 178 | recall = multi_recall(pred_y, true_y, labels) 179 | precision = multi_precision(pred_y, true_y, labels) 180 | f_beta = multi_f_beta(pred_y, true_y, labels, f_beta) 181 | return acc, recall, precision, f_beta -------------------------------------------------------------------------------- /bert_task/classifier_task/metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | 定义各类性能指标 3 | """ 4 | from sklearn.metrics import roc_auc_score 5 | 6 | 7 | def mean(item: list) -> float: 8 | """ 9 | 计算列表中元素的平均值 10 | :param item: 列表对象 11 | :return: 12 | """ 13 | res = sum(item) / len(item) if len(item) > 0 else 0 14 | return res 15 | 16 | 17 | def accuracy(pred_y, true_y): 18 | """ 19 | 计算二类和多类的准确率 20 | :param pred_y: 预测结果 21 | :param true_y: 真实结果 22 | :return: 23 | """ 24 | if isinstance(pred_y[0], list): 25 | pred_y = [item[0] for item in pred_y] 26 | corr = 0 27 | for i in range(len(pred_y)): 28 | if pred_y[i] == true_y[i]: 29 | corr += 1 30 | acc = corr / len(pred_y) if len(pred_y) > 0 else 0 31 | return acc 32 | 33 | 34 | def binary_auc(pred_y, 
true_y): 35 | """ 36 | 二类别的auc值 37 | :param pred_y: 预测结果 38 | :param true_y: 真实结果 39 | :return: 40 | """ 41 | auc = roc_auc_score(true_y, pred_y) 42 | return auc 43 | 44 | 45 | def binary_precision(pred_y, true_y, positive=1): 46 | """ 47 | 二类的精确率计算 48 | :param pred_y: 预测结果 49 | :param true_y: 真实结果 50 | :param positive: 正例的索引表示 51 | :return: 52 | """ 53 | corr = 0 54 | pred_corr = 0 55 | for i in range(len(pred_y)): 56 | if pred_y[i] == positive: 57 | pred_corr += 1 58 | if pred_y[i] == true_y[i]: 59 | corr += 1 60 | 61 | prec = corr / pred_corr if pred_corr > 0 else 0 62 | return prec 63 | 64 | 65 | def binary_recall(pred_y, true_y, positive=1): 66 | """ 67 | 二类的召回率 68 | :param pred_y: 预测结果 69 | :param true_y: 真实结果 70 | :param positive: 正例的索引表示 71 | :return: 72 | """ 73 | corr = 0 74 | true_corr = 0 75 | for i in range(len(pred_y)): 76 | if true_y[i] == positive: 77 | true_corr += 1 78 | if pred_y[i] == true_y[i]: 79 | corr += 1 80 | 81 | rec = corr / true_corr if true_corr > 0 else 0 82 | return rec 83 | 84 | 85 | def binary_f_beta(pred_y, true_y, beta=1.0, positive=1): 86 | """ 87 | 二类的f beta值 88 | :param pred_y: 预测结果 89 | :param true_y: 真实结果 90 | :param beta: beta值 91 | :param positive: 正例的索引表示 92 | :return: 93 | """ 94 | precision = binary_precision(pred_y, true_y, positive) 95 | recall = binary_recall(pred_y, true_y, positive) 96 | try: 97 | f_b = (1 + beta * beta) * precision * recall / (beta * beta * precision + recall) 98 | except: 99 | f_b = 0 100 | return f_b 101 | 102 | 103 | def multi_precision(pred_y, true_y, labels): 104 | """ 105 | 多类的精确率 106 | :param pred_y: 预测结果 107 | :param true_y: 真实结果 108 | :param labels: 标签列表 109 | :return: 110 | """ 111 | if isinstance(pred_y[0], list): 112 | pred_y = [item[0] for item in pred_y] 113 | 114 | precisions = [binary_precision(pred_y, true_y, label) for label in labels] 115 | prec = mean(precisions) 116 | return prec 117 | 118 | 119 | def multi_recall(pred_y, true_y, labels): 120 | """ 121 | 多类的召回率 122 | :param pred_y: 预测结果 123 | :param true_y: 真实结果 124 | :param labels: 标签列表 125 | :return: 126 | """ 127 | if isinstance(pred_y[0], list): 128 | pred_y = [item[0] for item in pred_y] 129 | 130 | recalls = [binary_recall(pred_y, true_y, label) for label in labels] 131 | rec = mean(recalls) 132 | return rec 133 | 134 | 135 | def multi_f_beta(pred_y, true_y, labels, beta=1.0): 136 | """ 137 | 多类的f beta值 138 | :param pred_y: 预测结果 139 | :param true_y: 真实结果 140 | :param labels: 标签列表 141 | :param beta: beta值 142 | :return: 143 | """ 144 | if isinstance(pred_y[0], list): 145 | pred_y = [item[0] for item in pred_y] 146 | 147 | f_betas = [binary_f_beta(pred_y, true_y, beta, label) for label in labels] 148 | f_beta = mean(f_betas) 149 | return f_beta 150 | 151 | 152 | def get_binary_metrics(pred_y, true_y, f_beta=1.0): 153 | """ 154 | 得到二分类的性能指标 155 | :param pred_y: 156 | :param true_y: 157 | :param f_beta: 158 | :return: 159 | """ 160 | acc = accuracy(pred_y, true_y) 161 | auc = binary_auc(pred_y, true_y) 162 | recall = binary_recall(pred_y, true_y) 163 | precision = binary_precision(pred_y, true_y) 164 | f_beta = binary_f_beta(pred_y, true_y, f_beta) 165 | return acc, auc, recall, precision, f_beta 166 | 167 | 168 | def get_multi_metrics(pred_y, true_y, labels, f_beta=1.0): 169 | """ 170 | 得到多分类的性能指标 171 | :param pred_y: 172 | :param true_y: 173 | :param labels: 174 | :param f_beta: 175 | :return: 176 | """ 177 | acc = accuracy(pred_y, true_y) 178 | recall = multi_recall(pred_y, true_y, labels) 179 | precision = multi_precision(pred_y, true_y, labels) 
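    # Note: multi_recall and multi_precision macro-average the per-label binary
    # scores, so every label in `labels` counts equally regardless of its support
    # in true_y.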
180 | f_beta = multi_f_beta(pred_y, true_y, labels, f_beta) 181 | return acc, recall, precision, f_beta -------------------------------------------------------------------------------- /bert_task/ltr_point_task/metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | 定义各类性能指标 3 | """ 4 | from sklearn.metrics import roc_auc_score 5 | 6 | 7 | def mean(item: list) -> float: 8 | """ 9 | 计算列表中元素的平均值 10 | :param item: 列表对象 11 | :return: 12 | """ 13 | res = sum(item) / len(item) if len(item) > 0 else 0 14 | return res 15 | 16 | 17 | def accuracy(pred_y, true_y): 18 | """ 19 | 计算二类和多类的准确率 20 | :param pred_y: 预测结果 21 | :param true_y: 真实结果 22 | :return: 23 | """ 24 | if isinstance(pred_y[0], list): 25 | pred_y = [item[0] for item in pred_y] 26 | corr = 0 27 | for i in range(len(pred_y)): 28 | if pred_y[i] == true_y[i]: 29 | corr += 1 30 | acc = corr / len(pred_y) if len(pred_y) > 0 else 0 31 | return acc 32 | 33 | 34 | def binary_auc(pred_y, true_y): 35 | """ 36 | 二类别的auc值 37 | :param pred_y: 预测结果 38 | :param true_y: 真实结果 39 | :return: 40 | """ 41 | auc = roc_auc_score(true_y, pred_y) 42 | return auc 43 | 44 | 45 | def binary_precision(pred_y, true_y, positive=1): 46 | """ 47 | 二类的精确率计算 48 | :param pred_y: 预测结果 49 | :param true_y: 真实结果 50 | :param positive: 正例的索引表示 51 | :return: 52 | """ 53 | corr = 0 54 | pred_corr = 0 55 | for i in range(len(pred_y)): 56 | if pred_y[i] == positive: 57 | pred_corr += 1 58 | if pred_y[i] == true_y[i]: 59 | corr += 1 60 | 61 | prec = corr / pred_corr if pred_corr > 0 else 0 62 | return prec 63 | 64 | 65 | def binary_recall(pred_y, true_y, positive=1): 66 | """ 67 | 二类的召回率 68 | :param pred_y: 预测结果 69 | :param true_y: 真实结果 70 | :param positive: 正例的索引表示 71 | :return: 72 | """ 73 | corr = 0 74 | true_corr = 0 75 | for i in range(len(pred_y)): 76 | if true_y[i] == positive: 77 | true_corr += 1 78 | if pred_y[i] == true_y[i]: 79 | corr += 1 80 | 81 | rec = corr / true_corr if true_corr > 0 else 0 82 | return rec 83 | 84 | 85 | def binary_f_beta(pred_y, true_y, beta=1.0, positive=1): 86 | """ 87 | 二类的f beta值 88 | :param pred_y: 预测结果 89 | :param true_y: 真实结果 90 | :param beta: beta值 91 | :param positive: 正例的索引表示 92 | :return: 93 | """ 94 | precision = binary_precision(pred_y, true_y, positive) 95 | recall = binary_recall(pred_y, true_y, positive) 96 | try: 97 | f_b = (1 + beta * beta) * precision * recall / (beta * beta * precision + recall) 98 | except: 99 | f_b = 0 100 | return f_b 101 | 102 | 103 | def multi_precision(pred_y, true_y, labels): 104 | """ 105 | 多类的精确率 106 | :param pred_y: 预测结果 107 | :param true_y: 真实结果 108 | :param labels: 标签列表 109 | :return: 110 | """ 111 | if isinstance(pred_y[0], list): 112 | pred_y = [item[0] for item in pred_y] 113 | 114 | precisions = [binary_precision(pred_y, true_y, label) for label in labels] 115 | prec = mean(precisions) 116 | return prec 117 | 118 | 119 | def multi_recall(pred_y, true_y, labels): 120 | """ 121 | 多类的召回率 122 | :param pred_y: 预测结果 123 | :param true_y: 真实结果 124 | :param labels: 标签列表 125 | :return: 126 | """ 127 | if isinstance(pred_y[0], list): 128 | pred_y = [item[0] for item in pred_y] 129 | 130 | recalls = [binary_recall(pred_y, true_y, label) for label in labels] 131 | rec = mean(recalls) 132 | return rec 133 | 134 | 135 | def multi_f_beta(pred_y, true_y, labels, beta=1.0): 136 | """ 137 | 多类的f beta值 138 | :param pred_y: 预测结果 139 | :param true_y: 真实结果 140 | :param labels: 标签列表 141 | :param beta: beta值 142 | :return: 143 | """ 144 | if isinstance(pred_y[0], list): 145 
| pred_y = [item[0] for item in pred_y] 146 | 147 | f_betas = [binary_f_beta(pred_y, true_y, beta, label) for label in labels] 148 | f_beta = mean(f_betas) 149 | return f_beta 150 | 151 | 152 | def get_binary_metrics(pred_y, true_y, f_beta=1.0): 153 | """ 154 | 得到二分类的性能指标 155 | :param pred_y: 156 | :param true_y: 157 | :param f_beta: 158 | :return: 159 | """ 160 | acc = accuracy(pred_y, true_y) 161 | auc = binary_auc(pred_y, true_y) 162 | recall = binary_recall(pred_y, true_y) 163 | precision = binary_precision(pred_y, true_y) 164 | f_beta = binary_f_beta(pred_y, true_y, f_beta) 165 | return acc, auc, recall, precision, f_beta 166 | 167 | 168 | def get_multi_metrics(pred_y, true_y, labels, f_beta=1.0): 169 | """ 170 | 得到多分类的性能指标 171 | :param pred_y: 172 | :param true_y: 173 | :param labels: 174 | :param f_beta: 175 | :return: 176 | """ 177 | acc = accuracy(pred_y, true_y) 178 | recall = multi_recall(pred_y, true_y, labels) 179 | precision = multi_precision(pred_y, true_y, labels) 180 | f_beta = multi_f_beta(pred_y, true_y, labels, f_beta) 181 | return acc, recall, precision, f_beta -------------------------------------------------------------------------------- /bert_task/sentence_pair_task/metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | 定义各类性能指标 3 | """ 4 | from sklearn.metrics import roc_auc_score 5 | 6 | 7 | def mean(item: list) -> float: 8 | """ 9 | 计算列表中元素的平均值 10 | :param item: 列表对象 11 | :return: 12 | """ 13 | res = sum(item) / len(item) if len(item) > 0 else 0 14 | return res 15 | 16 | 17 | def accuracy(pred_y, true_y): 18 | """ 19 | 计算二类和多类的准确率 20 | :param pred_y: 预测结果 21 | :param true_y: 真实结果 22 | :return: 23 | """ 24 | if isinstance(pred_y[0], list): 25 | pred_y = [item[0] for item in pred_y] 26 | corr = 0 27 | for i in range(len(pred_y)): 28 | if pred_y[i] == true_y[i]: 29 | corr += 1 30 | acc = corr / len(pred_y) if len(pred_y) > 0 else 0 31 | return acc 32 | 33 | 34 | def binary_auc(pred_y, true_y): 35 | """ 36 | 二类别的auc值 37 | :param pred_y: 预测结果 38 | :param true_y: 真实结果 39 | :return: 40 | """ 41 | auc = roc_auc_score(true_y, pred_y) 42 | return auc 43 | 44 | 45 | def binary_precision(pred_y, true_y, positive=1): 46 | """ 47 | 二类的精确率计算 48 | :param pred_y: 预测结果 49 | :param true_y: 真实结果 50 | :param positive: 正例的索引表示 51 | :return: 52 | """ 53 | corr = 0 54 | pred_corr = 0 55 | for i in range(len(pred_y)): 56 | if pred_y[i] == positive: 57 | pred_corr += 1 58 | if pred_y[i] == true_y[i]: 59 | corr += 1 60 | 61 | prec = corr / pred_corr if pred_corr > 0 else 0 62 | return prec 63 | 64 | 65 | def binary_recall(pred_y, true_y, positive=1): 66 | """ 67 | 二类的召回率 68 | :param pred_y: 预测结果 69 | :param true_y: 真实结果 70 | :param positive: 正例的索引表示 71 | :return: 72 | """ 73 | corr = 0 74 | true_corr = 0 75 | for i in range(len(pred_y)): 76 | if true_y[i] == positive: 77 | true_corr += 1 78 | if pred_y[i] == true_y[i]: 79 | corr += 1 80 | 81 | rec = corr / true_corr if true_corr > 0 else 0 82 | return rec 83 | 84 | 85 | def binary_f_beta(pred_y, true_y, beta=1.0, positive=1): 86 | """ 87 | 二类的f beta值 88 | :param pred_y: 预测结果 89 | :param true_y: 真实结果 90 | :param beta: beta值 91 | :param positive: 正例的索引表示 92 | :return: 93 | """ 94 | precision = binary_precision(pred_y, true_y, positive) 95 | recall = binary_recall(pred_y, true_y, positive) 96 | try: 97 | f_b = (1 + beta * beta) * precision * recall / (beta * beta * precision + recall) 98 | except: 99 | f_b = 0 100 | return f_b 101 | 102 | 103 | def multi_precision(pred_y, true_y, labels): 104 
| """ 105 | 多类的精确率 106 | :param pred_y: 预测结果 107 | :param true_y: 真实结果 108 | :param labels: 标签列表 109 | :return: 110 | """ 111 | if isinstance(pred_y[0], list): 112 | pred_y = [item[0] for item in pred_y] 113 | 114 | precisions = [binary_precision(pred_y, true_y, label) for label in labels] 115 | prec = mean(precisions) 116 | return prec 117 | 118 | 119 | def multi_recall(pred_y, true_y, labels): 120 | """ 121 | 多类的召回率 122 | :param pred_y: 预测结果 123 | :param true_y: 真实结果 124 | :param labels: 标签列表 125 | :return: 126 | """ 127 | if isinstance(pred_y[0], list): 128 | pred_y = [item[0] for item in pred_y] 129 | 130 | recalls = [binary_recall(pred_y, true_y, label) for label in labels] 131 | rec = mean(recalls) 132 | return rec 133 | 134 | 135 | def multi_f_beta(pred_y, true_y, labels, beta=1.0): 136 | """ 137 | 多类的f beta值 138 | :param pred_y: 预测结果 139 | :param true_y: 真实结果 140 | :param labels: 标签列表 141 | :param beta: beta值 142 | :return: 143 | """ 144 | if isinstance(pred_y[0], list): 145 | pred_y = [item[0] for item in pred_y] 146 | 147 | f_betas = [binary_f_beta(pred_y, true_y, beta, label) for label in labels] 148 | f_beta = mean(f_betas) 149 | return f_beta 150 | 151 | 152 | def get_binary_metrics(pred_y, true_y, f_beta=1.0): 153 | """ 154 | 得到二分类的性能指标 155 | :param pred_y: 156 | :param true_y: 157 | :param f_beta: 158 | :return: 159 | """ 160 | acc = accuracy(pred_y, true_y) 161 | auc = binary_auc(pred_y, true_y) 162 | recall = binary_recall(pred_y, true_y) 163 | precision = binary_precision(pred_y, true_y) 164 | f_beta = binary_f_beta(pred_y, true_y, f_beta) 165 | return acc, auc, recall, precision, f_beta 166 | 167 | 168 | def get_multi_metrics(pred_y, true_y, labels, f_beta=1.0): 169 | """ 170 | 得到多分类的性能指标 171 | :param pred_y: 172 | :param true_y: 173 | :param labels: 174 | :param f_beta: 175 | :return: 176 | """ 177 | acc = accuracy(pred_y, true_y) 178 | recall = multi_recall(pred_y, true_y, labels) 179 | precision = multi_precision(pred_y, true_y, labels) 180 | f_beta = multi_f_beta(pred_y, true_y, labels, f_beta) 181 | return acc, recall, precision, f_beta -------------------------------------------------------------------------------- /albert_task/classifier_task/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.append(os.path.dirname(os.getcwd())) 4 | 5 | import tensorflow as tf 6 | 7 | from albert import modeling 8 | from albert import optimization_finetuning as optimization 9 | 10 | 11 | class AlbertClassifier(object): 12 | def __init__(self, config, is_training=True, num_train_step=None, num_warmup_step=None): 13 | self.__bert_config_path = os.path.join(config["bert_model_path"], "albert_config.json") 14 | self.__num_classes = config["num_classes"] 15 | self.__learning_rate = config["learning_rate"] 16 | self.__is_training = is_training 17 | self.__num_train_step = num_train_step 18 | self.__num_warmup_step = num_warmup_step 19 | 20 | self.config = config 21 | 22 | self.input_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_ids') 23 | self.input_masks = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_mask') 24 | self.segment_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name='segment_ids') 25 | self.label_ids = tf.placeholder(dtype=tf.int32, shape=[None], name="label_ids") 26 | 27 | self.built_model() 28 | self.init_saver() 29 | 30 | def built_model(self): 31 | bert_config = modeling.BertConfig.from_json_file(self.__bert_config_path) 32 | 33 | model = 
modeling.BertModel(config=bert_config, 34 | is_training=self.__is_training, 35 | input_ids=self.input_ids, 36 | input_mask=self.input_masks, 37 | token_type_ids=self.segment_ids, 38 | use_one_hot_embeddings=False) 39 | output_layer = model.get_pooled_output() 40 | 41 | hidden_size = output_layer.shape[-1].value 42 | if self.__is_training: 43 | # I.e., 0.1 dropout 44 | output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) 45 | 46 | with tf.name_scope("output"): 47 | output_weights = tf.get_variable( 48 | "output_weights", [self.__num_classes, hidden_size], 49 | initializer=tf.truncated_normal_initializer(stddev=0.02)) 50 | 51 | output_bias = tf.get_variable( 52 | "output_bias", [self.__num_classes], initializer=tf.zeros_initializer()) 53 | 54 | logits = tf.matmul(output_layer, output_weights, transpose_b=True) 55 | logits = tf.nn.bias_add(logits, output_bias) 56 | self.predictions = tf.argmax(logits, axis=-1, name="predictions") 57 | 58 | if self.__is_training: 59 | 60 | with tf.name_scope("loss"): 61 | losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=self.label_ids) 62 | self.loss = tf.reduce_mean(losses, name="loss") 63 | 64 | with tf.name_scope('train_op'): 65 | self.train_op = optimization.create_optimizer( 66 | self.loss, self.__learning_rate, self.__num_train_step, self.__num_warmup_step, use_tpu=False) 67 | 68 | def init_saver(self): 69 | self.saver = tf.train.Saver(tf.global_variables()) 70 | 71 | def train(self, sess, batch): 72 | """ 73 | 训练模型 74 | :param sess: tf的会话对象 75 | :param batch: batch数据 76 | :return: 损失和预测结果 77 | """ 78 | 79 | feed_dict = {self.input_ids: batch["input_ids"], 80 | self.input_masks: batch["input_masks"], 81 | self.segment_ids: batch["segment_ids"], 82 | self.label_ids: batch["label_ids"]} 83 | 84 | # 训练模型 85 | _, loss, predictions = sess.run([self.train_op, self.loss, self.predictions], feed_dict=feed_dict) 86 | return loss, predictions 87 | 88 | def eval(self, sess, batch): 89 | """ 90 | 验证模型 91 | :param sess: tf中的会话对象 92 | :param batch: batch数据 93 | :return: 损失和预测结果 94 | """ 95 | feed_dict = {self.input_ids: batch["input_ids"], 96 | self.input_masks: batch["input_masks"], 97 | self.segment_ids: batch["segment_ids"], 98 | self.label_ids: batch["label_ids"]} 99 | 100 | loss, predictions = sess.run([self.loss, self.predictions], feed_dict=feed_dict) 101 | return loss, predictions 102 | 103 | def infer(self, sess, batch): 104 | """ 105 | 预测新数据 106 | :param sess: tf中的会话对象 107 | :param batch: batch数据 108 | :return: 预测结果 109 | """ 110 | feed_dict = {self.input_ids: batch["input_ids"], 111 | self.input_masks: batch["input_masks"], 112 | self.segment_ids: batch["segment_ids"]} 113 | 114 | predict = sess.run(self.predictions, feed_dict=feed_dict) 115 | 116 | return predict 117 | -------------------------------------------------------------------------------- /albert_task/classifier_task/metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | 定义各类性能指标 3 | """ 4 | from sklearn.metrics import roc_auc_score 5 | 6 | 7 | def mean(item: list) -> float: 8 | """ 9 | 计算列表中元素的平均值 10 | :param item: 列表对象 11 | :return: 12 | """ 13 | res = sum(item) / len(item) if len(item) > 0 else 0 14 | return res 15 | 16 | 17 | def accuracy(pred_y, true_y): 18 | """ 19 | 计算二类和多类的准确率 20 | :param pred_y: 预测结果 21 | :param true_y: 真实结果 22 | :return: 23 | """ 24 | if isinstance(pred_y[0], list): 25 | pred_y = [item[0] for item in pred_y] 26 | corr = 0 27 | for i in range(len(pred_y)): 28 | if pred_y[i] == true_y[i]: 
29 | corr += 1 30 | acc = corr / len(pred_y) if len(pred_y) > 0 else 0 31 | return acc 32 | 33 | 34 | def binary_auc(pred_y, true_y): 35 | """ 36 | 二类别的auc值 37 | :param pred_y: 预测结果 38 | :param true_y: 真实结果 39 | :return: 40 | """ 41 | auc = roc_auc_score(true_y, pred_y) 42 | return auc 43 | 44 | 45 | def binary_precision(pred_y, true_y, positive=1): 46 | """ 47 | 二类的精确率计算 48 | :param pred_y: 预测结果 49 | :param true_y: 真实结果 50 | :param positive: 正例的索引表示 51 | :return: 52 | """ 53 | corr = 0 54 | pred_corr = 0 55 | for i in range(len(pred_y)): 56 | if pred_y[i] == positive: 57 | pred_corr += 1 58 | if pred_y[i] == true_y[i]: 59 | corr += 1 60 | 61 | prec = corr / pred_corr if pred_corr > 0 else 0 62 | return prec 63 | 64 | 65 | def binary_recall(pred_y, true_y, positive=1): 66 | """ 67 | 二类的召回率 68 | :param pred_y: 预测结果 69 | :param true_y: 真实结果 70 | :param positive: 正例的索引表示 71 | :return: 72 | """ 73 | corr = 0 74 | true_corr = 0 75 | for i in range(len(pred_y)): 76 | if true_y[i] == positive: 77 | true_corr += 1 78 | if pred_y[i] == true_y[i]: 79 | corr += 1 80 | 81 | rec = corr / true_corr if true_corr > 0 else 0 82 | return rec 83 | 84 | 85 | def binary_f_beta(pred_y, true_y, beta=1.0, positive=1): 86 | """ 87 | 二类的f beta值 88 | :param pred_y: 预测结果 89 | :param true_y: 真实结果 90 | :param beta: beta值 91 | :param positive: 正例的索引表示 92 | :return: 93 | """ 94 | precision = binary_precision(pred_y, true_y, positive) 95 | recall = binary_recall(pred_y, true_y, positive) 96 | try: 97 | f_b = (1 + beta * beta) * precision * recall / (beta * beta * precision + recall) 98 | except: 99 | f_b = 0 100 | return f_b 101 | 102 | 103 | def multi_precision(pred_y, true_y, labels): 104 | """ 105 | 多类的精确率 106 | :param pred_y: 预测结果 107 | :param true_y: 真实结果 108 | :param labels: 标签列表 109 | :return: 110 | """ 111 | if isinstance(pred_y[0], list): 112 | pred_y = [item[0] for item in pred_y] 113 | 114 | precisions = [binary_precision(pred_y, true_y, label) for label in labels] 115 | prec = mean(precisions) 116 | return prec 117 | 118 | 119 | def multi_recall(pred_y, true_y, labels): 120 | """ 121 | 多类的召回率 122 | :param pred_y: 预测结果 123 | :param true_y: 真实结果 124 | :param labels: 标签列表 125 | :return: 126 | """ 127 | if isinstance(pred_y[0], list): 128 | pred_y = [item[0] for item in pred_y] 129 | 130 | recalls = [binary_recall(pred_y, true_y, label) for label in labels] 131 | rec = mean(recalls) 132 | return rec 133 | 134 | 135 | def multi_f_beta(pred_y, true_y, labels, beta=1.0): 136 | """ 137 | 多类的f beta值 138 | :param pred_y: 预测结果 139 | :param true_y: 真实结果 140 | :param labels: 标签列表 141 | :param beta: beta值 142 | :return: 143 | """ 144 | if isinstance(pred_y[0], list): 145 | pred_y = [item[0] for item in pred_y] 146 | 147 | f_betas = [binary_f_beta(pred_y, true_y, beta, label) for label in labels] 148 | f_beta = mean(f_betas) 149 | return f_beta 150 | 151 | 152 | def get_binary_metrics(pred_y, true_y, f_beta=1.0): 153 | """ 154 | 得到二分类的性能指标 155 | :param pred_y: 156 | :param true_y: 157 | :param f_beta: 158 | :return: 159 | """ 160 | pred_y = pred_y.tolist() 161 | acc = accuracy(pred_y, true_y) 162 | auc = binary_auc(pred_y, true_y) 163 | recall = binary_recall(pred_y, true_y) 164 | precision = binary_precision(pred_y, true_y) 165 | f_beta = binary_f_beta(pred_y, true_y, f_beta) 166 | return acc, auc, recall, precision, f_beta 167 | 168 | 169 | def get_multi_metrics(pred_y, true_y, labels, f_beta=1.0): 170 | """ 171 | 得到多分类的性能指标 172 | :param pred_y: 173 | :param true_y: 174 | :param labels: 175 | :param f_beta: 176 | :return: 
177 | """ 178 | pred_y = pred_y.tolist() 179 | acc = accuracy(pred_y, true_y) 180 | recall = multi_recall(pred_y, true_y, labels) 181 | precision = multi_precision(pred_y, true_y, labels) 182 | f_beta = multi_f_beta(pred_y, true_y, labels, f_beta) 183 | return acc, recall, precision, f_beta -------------------------------------------------------------------------------- /albert_task/sentence_pair_task/metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | 定义各类性能指标 3 | """ 4 | from sklearn.metrics import roc_auc_score 5 | 6 | 7 | def mean(item: list) -> float: 8 | """ 9 | 计算列表中元素的平均值 10 | :param item: 列表对象 11 | :return: 12 | """ 13 | res = sum(item) / len(item) if len(item) > 0 else 0 14 | return res 15 | 16 | 17 | def accuracy(pred_y, true_y): 18 | """ 19 | 计算二类和多类的准确率 20 | :param pred_y: 预测结果 21 | :param true_y: 真实结果 22 | :return: 23 | """ 24 | 25 | if isinstance(pred_y[0], list): 26 | pred_y = [item[0] for item in pred_y] 27 | corr = 0 28 | for i in range(len(pred_y)): 29 | if pred_y[i] == true_y[i]: 30 | corr += 1 31 | acc = corr / len(pred_y) if len(pred_y) > 0 else 0 32 | return acc 33 | 34 | 35 | def binary_auc(pred_y, true_y): 36 | """ 37 | 二类别的auc值 38 | :param pred_y: 预测结果 39 | :param true_y: 真实结果 40 | :return: 41 | """ 42 | auc = roc_auc_score(true_y, pred_y) 43 | return auc 44 | 45 | 46 | def binary_precision(pred_y, true_y, positive=1): 47 | """ 48 | 二类的精确率计算 49 | :param pred_y: 预测结果 50 | :param true_y: 真实结果 51 | :param positive: 正例的索引表示 52 | :return: 53 | """ 54 | corr = 0 55 | pred_corr = 0 56 | for i in range(len(pred_y)): 57 | if pred_y[i] == positive: 58 | pred_corr += 1 59 | if pred_y[i] == true_y[i]: 60 | corr += 1 61 | 62 | prec = corr / pred_corr if pred_corr > 0 else 0 63 | return prec 64 | 65 | 66 | def binary_recall(pred_y, true_y, positive=1): 67 | """ 68 | 二类的召回率 69 | :param pred_y: 预测结果 70 | :param true_y: 真实结果 71 | :param positive: 正例的索引表示 72 | :return: 73 | """ 74 | corr = 0 75 | true_corr = 0 76 | for i in range(len(pred_y)): 77 | if true_y[i] == positive: 78 | true_corr += 1 79 | if pred_y[i] == true_y[i]: 80 | corr += 1 81 | 82 | rec = corr / true_corr if true_corr > 0 else 0 83 | return rec 84 | 85 | 86 | def binary_f_beta(pred_y, true_y, beta=1.0, positive=1): 87 | """ 88 | 二类的f beta值 89 | :param pred_y: 预测结果 90 | :param true_y: 真实结果 91 | :param beta: beta值 92 | :param positive: 正例的索引表示 93 | :return: 94 | """ 95 | precision = binary_precision(pred_y, true_y, positive) 96 | recall = binary_recall(pred_y, true_y, positive) 97 | try: 98 | f_b = (1 + beta * beta) * precision * recall / (beta * beta * precision + recall) 99 | except: 100 | f_b = 0 101 | return f_b 102 | 103 | 104 | def multi_precision(pred_y, true_y, labels): 105 | """ 106 | 多类的精确率 107 | :param pred_y: 预测结果 108 | :param true_y: 真实结果 109 | :param labels: 标签列表 110 | :return: 111 | """ 112 | if isinstance(pred_y[0], list): 113 | pred_y = [item[0] for item in pred_y] 114 | 115 | precisions = [binary_precision(pred_y, true_y, label) for label in labels] 116 | prec = mean(precisions) 117 | return prec 118 | 119 | 120 | def multi_recall(pred_y, true_y, labels): 121 | """ 122 | 多类的召回率 123 | :param pred_y: 预测结果 124 | :param true_y: 真实结果 125 | :param labels: 标签列表 126 | :return: 127 | """ 128 | if isinstance(pred_y[0], list): 129 | pred_y = [item[0] for item in pred_y] 130 | 131 | recalls = [binary_recall(pred_y, true_y, label) for label in labels] 132 | rec = mean(recalls) 133 | return rec 134 | 135 | 136 | def multi_f_beta(pred_y, true_y, labels, 
beta=1.0): 137 | """ 138 | 多类的f beta值 139 | :param pred_y: 预测结果 140 | :param true_y: 真实结果 141 | :param labels: 标签列表 142 | :param beta: beta值 143 | :return: 144 | """ 145 | if isinstance(pred_y[0].tolist(), list): 146 | pred_y = [item[0] for item in pred_y] 147 | 148 | f_betas = [binary_f_beta(pred_y, true_y, beta, label) for label in labels] 149 | f_beta = mean(f_betas) 150 | return f_beta 151 | 152 | 153 | def get_binary_metrics(pred_y, true_y, f_beta=1.0): 154 | """ 155 | 得到二分类的性能指标 156 | :param pred_y: 157 | :param true_y: 158 | :param f_beta: 159 | :return: 160 | """ 161 | pred_y = pred_y.tolist() 162 | acc = accuracy(pred_y, true_y) 163 | auc = binary_auc(pred_y, true_y) 164 | recall = binary_recall(pred_y, true_y) 165 | precision = binary_precision(pred_y, true_y) 166 | f_beta = binary_f_beta(pred_y, true_y, f_beta) 167 | return acc, auc, recall, precision, f_beta 168 | 169 | 170 | def get_multi_metrics(pred_y, true_y, labels, f_beta=1.0): 171 | """ 172 | 得到多分类的性能指标 173 | :param pred_y: 174 | :param true_y: 175 | :param labels: 176 | :param f_beta: 177 | :return: 178 | """ 179 | pred_y = pred_y.tolist() 180 | acc = accuracy(pred_y, true_y) 181 | recall = multi_recall(pred_y, true_y, labels) 182 | precision = multi_precision(pred_y, true_y, labels) 183 | f_beta = multi_f_beta(pred_y, true_y, labels, f_beta) 184 | return acc, recall, precision, f_beta -------------------------------------------------------------------------------- /bert_task/bert/tokenization_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
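The multi-class helpers in the metrics.py modules above are macro averages: each score is computed per label as a one-vs-rest binary metric and then averaged with mean(). As a quick cross-check (a sketch with made-up toy labels, not repository code), the results should agree with scikit-learn's macro-averaged scores; sklearn is already a dependency here via roc_auc_score:

from sklearn.metrics import fbeta_score, precision_score, recall_score

true_y = [0, 2, 2, 1, 0]
pred_y = [0, 1, 2, 1, 0]
labels = [0, 1, 2]

# Macro averaging: one-vs-rest score per label, then the plain mean.
# These should match multi_precision / multi_recall / multi_f_beta above.
macro_p = precision_score(true_y, pred_y, labels=labels, average="macro")
macro_r = recall_score(true_y, pred_y, labels=labels, average="macro")
macro_f = fbeta_score(true_y, pred_y, beta=1.0, labels=labels, average="macro")
print(macro_p, macro_r, macro_f)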
15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import os 20 | import tempfile 21 | import tokenization 22 | import six 23 | import tensorflow as tf 24 | 25 | 26 | class TokenizationTest(tf.test.TestCase): 27 | 28 | def test_full_tokenizer(self): 29 | vocab_tokens = [ 30 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", 31 | "##ing", "," 32 | ] 33 | with tempfile.NamedTemporaryFile(delete=False) as vocab_writer: 34 | if six.PY2: 35 | vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) 36 | else: 37 | vocab_writer.write("".join( 38 | [x + "\n" for x in vocab_tokens]).encode("utf-8")) 39 | 40 | vocab_file = vocab_writer.name 41 | 42 | tokenizer = tokenization.FullTokenizer(vocab_file) 43 | os.unlink(vocab_file) 44 | 45 | tokens = tokenizer.tokenize(u"UNwant\u00E9d,running") 46 | self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) 47 | 48 | self.assertAllEqual( 49 | tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) 50 | 51 | def test_chinese(self): 52 | tokenizer = tokenization.BasicTokenizer() 53 | 54 | self.assertAllEqual( 55 | tokenizer.tokenize(u"ah\u535A\u63A8zz"), 56 | [u"ah", u"\u535A", u"\u63A8", u"zz"]) 57 | 58 | def test_basic_tokenizer_lower(self): 59 | tokenizer = tokenization.BasicTokenizer(do_lower_case=True) 60 | 61 | self.assertAllEqual( 62 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), 63 | ["hello", "!", "how", "are", "you", "?"]) 64 | self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"]) 65 | 66 | def test_basic_tokenizer_no_lower(self): 67 | tokenizer = tokenization.BasicTokenizer(do_lower_case=False) 68 | 69 | self.assertAllEqual( 70 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? 
"), 71 | ["HeLLo", "!", "how", "Are", "yoU", "?"]) 72 | 73 | def test_wordpiece_tokenizer(self): 74 | vocab_tokens = [ 75 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", 76 | "##ing" 77 | ] 78 | 79 | vocab = {} 80 | for (i, token) in enumerate(vocab_tokens): 81 | vocab[token] = i 82 | tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) 83 | 84 | self.assertAllEqual(tokenizer.tokenize(""), []) 85 | 86 | self.assertAllEqual( 87 | tokenizer.tokenize("unwanted running"), 88 | ["un", "##want", "##ed", "runn", "##ing"]) 89 | 90 | self.assertAllEqual( 91 | tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) 92 | 93 | def test_convert_tokens_to_ids(self): 94 | vocab_tokens = [ 95 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", 96 | "##ing" 97 | ] 98 | 99 | vocab = {} 100 | for (i, token) in enumerate(vocab_tokens): 101 | vocab[token] = i 102 | 103 | self.assertAllEqual( 104 | tokenization.convert_tokens_to_ids( 105 | vocab, ["un", "##want", "##ed", "runn", "##ing"]), [7, 4, 5, 8, 9]) 106 | 107 | def test_is_whitespace(self): 108 | self.assertTrue(tokenization._is_whitespace(u" ")) 109 | self.assertTrue(tokenization._is_whitespace(u"\t")) 110 | self.assertTrue(tokenization._is_whitespace(u"\r")) 111 | self.assertTrue(tokenization._is_whitespace(u"\n")) 112 | self.assertTrue(tokenization._is_whitespace(u"\u00A0")) 113 | 114 | self.assertFalse(tokenization._is_whitespace(u"A")) 115 | self.assertFalse(tokenization._is_whitespace(u"-")) 116 | 117 | def test_is_control(self): 118 | self.assertTrue(tokenization._is_control(u"\u0005")) 119 | 120 | self.assertFalse(tokenization._is_control(u"A")) 121 | self.assertFalse(tokenization._is_control(u" ")) 122 | self.assertFalse(tokenization._is_control(u"\t")) 123 | self.assertFalse(tokenization._is_control(u"\r")) 124 | self.assertFalse(tokenization._is_control(u"\U0001F4A9")) 125 | 126 | def test_is_punctuation(self): 127 | self.assertTrue(tokenization._is_punctuation(u"-")) 128 | self.assertTrue(tokenization._is_punctuation(u"$")) 129 | self.assertTrue(tokenization._is_punctuation(u"`")) 130 | self.assertTrue(tokenization._is_punctuation(u".")) 131 | 132 | self.assertFalse(tokenization._is_punctuation(u"A")) 133 | self.assertFalse(tokenization._is_punctuation(u" ")) 134 | 135 | 136 | if __name__ == "__main__": 137 | tf.test.main() 138 | -------------------------------------------------------------------------------- /bert_task/sentence_pair_task/predict.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | sys.path.append(os.path.abspath(os.path.dirname(os.getcwd()))) 5 | 6 | import tensorflow as tf 7 | from model import BertSentencePair 8 | from bert import tokenization 9 | 10 | 11 | class Predictor(object): 12 | def __init__(self, config): 13 | self.model = None 14 | self.config = config 15 | 16 | self.output_path = config["output_path"] 17 | self.vocab_path = os.path.join(config["bert_model_path"], "vocab.txt") 18 | self.label_to_index = self.load_vocab() 19 | self.index_to_label = {value: key for key, value in self.label_to_index.items()} 20 | self.word_vectors = None 21 | self.sequence_length = self.config["sequence_length"] 22 | 23 | # 创建模型 24 | self.create_model() 25 | # 加载计算图 26 | self.load_graph() 27 | 28 | def load_vocab(self): 29 | # 将词汇-索引映射表加载出来 30 | 31 | with open(os.path.join(self.output_path, "label_to_index.json"), "r") as f: 32 | label_to_index = json.load(f) 33 | 34 | return 
label_to_index 35 | 36 | def _truncate_seq_pair(self, tokens_a, tokens_b, max_len): 37 | """Truncates a sequence pair in place to the maximum length.""" 38 | 39 | # This is a simple heuristic which will always truncate the longer sequence 40 | # one token at a time. This makes more sense than truncating an equal percent 41 | # of tokens from each, since if one sequence is very short then each token 42 | # that's truncated likely contains more information than a longer sequence. 43 | while True: 44 | total_length = len(tokens_a) + len(tokens_b) 45 | if total_length <= max_len: 46 | break 47 | if len(tokens_a) > len(tokens_b): 48 | tokens_a.pop() 49 | else: 50 | tokens_b.pop() 51 | 52 | def padding(self, input_id, input_mask, segment_id): 53 | """ 54 | 对序列进行补全 55 | :param input_id: 56 | :param input_mask: 57 | :param segment_id: 58 | :return: 59 | """ 60 | 61 | if len(input_id) < self.sequence_length: 62 | pad_input_id = input_id + [0] * (self.sequence_length - len(input_id)) 63 | pad_input_mask = input_mask + [0] * (self.sequence_length - len(input_mask)) 64 | pad_segment_id = segment_id + [0] * (self.sequence_length - len(segment_id)) 65 | else: 66 | pad_input_id = input_id[:self.sequence_length] 67 | pad_input_mask = input_mask[:self.sequence_length] 68 | pad_segment_id = segment_id[:self.sequence_length] 69 | 70 | return pad_input_id, pad_input_mask, pad_segment_id 71 | 72 | def sentence_to_idx(self, text_a, text_b): 73 | """ 74 | 将分词后的句子转换成idx表示 75 | :return: 76 | """ 77 | tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_path, do_lower_case=True) 78 | 79 | text_a = tokenization.convert_to_unicode(text_a) 80 | text_b = tokenization.convert_to_unicode(text_b) 81 | tokens_a = tokenizer.tokenize(text_a) 82 | tokens_b = tokenizer.tokenize(text_b) 83 | 84 | # 判断两条序列组合在一起长度是否超过最大长度 85 | self._truncate_seq_pair(tokens_a, tokens_b, self.sequence_length - 3) 86 | 87 | tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"] 88 | input_id = tokenizer.convert_tokens_to_ids(tokens) 89 | input_mask = [1] * len(input_id) 90 | segment_id = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1) 91 | 92 | input_id, input_mask, segment_id = self.padding(input_id, input_mask, segment_id) 93 | 94 | return [input_id], [input_mask], [segment_id] 95 | 96 | def load_graph(self): 97 | """ 98 | 加载计算图 99 | :return: 100 | """ 101 | self.sess = tf.Session() 102 | ckpt = tf.train.get_checkpoint_state(self.config["ckpt_model_path"]) 103 | if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path): 104 | print('Reloading model parameters..') 105 | self.model.saver.restore(self.sess, ckpt.model_checkpoint_path) 106 | else: 107 | raise ValueError('No such file:[{}]'.format(self.config["ckpt_model_path"])) 108 | 109 | def create_model(self): 110 | """ 111 | 根据config文件选择对应的模型,并初始化 112 | :return: 113 | """ 114 | self.model = BertSentencePair(config=self.config, is_training=False) 115 | 116 | def predict(self, text_a, text_b): 117 | """ 118 | 给定分词后的句子,预测其分类结果 119 | :param text_a: 120 | :param text_b: 121 | :return: 122 | """ 123 | input_id, input_mask, segment_id = self.sentence_to_idx(text_a, text_b) 124 | 125 | prediction = self.model.infer(self.sess, 126 | dict(input_ids=input_id, 127 | input_masks=input_mask, 128 | segment_ids=segment_id)).tolist()[0][0] 129 | label = self.index_to_label[prediction] 130 | return label 131 | 132 | 133 | -------------------------------------------------------------------------------- /albert_task/sentence_pair_task/predict.py: 
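Both sentence-pair predictors (the BERT one above and the ALBERT one that follows) build model inputs the same way: the token pair is truncated to sequence_length - 3 so that [CLS] and the two [SEP] markers fit, segment ids mark which sentence each position belongs to, and all three sequences are right-padded with zeros. A minimal self-contained sketch of that encoding, with a hypothetical toy vocab dict standing in for the real WordPiece vocabulary:

def encode_pair(tokens_a, tokens_b, max_len, vocab):
    # Reserve 3 positions for [CLS] and the two [SEP] markers; trim the
    # longer sequence one token at a time (same heuristic as _truncate_seq_pair).
    while len(tokens_a) + len(tokens_b) > max_len - 3:
        longer = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
        longer.pop()
    tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
    input_ids = [vocab[token] for token in tokens]
    input_mask = [1] * len(input_ids)  # 1 marks real tokens, 0 marks padding
    segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
    pad = max_len - len(input_ids)     # right-pad everything to max_len
    return input_ids + [0] * pad, input_mask + [0] * pad, segment_ids + [0] * pad

vocab = {"[CLS]": 101, "[SEP]": 102, "hello": 7, "world": 8}  # toy stand-in
ids, mask, segments = encode_pair(["hello"], ["world"], 8, vocab)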
-------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | sys.path.append(os.path.abspath(os.path.dirname(os.getcwd()))) 5 | 6 | import tensorflow as tf 7 | from model import AlbertSentencePair 8 | from albert import tokenization 9 | 10 | 11 | class Predictor(object): 12 | def __init__(self, config): 13 | self.model = None 14 | self.config = config 15 | 16 | self.output_path = config["output_path"] 17 | self.vocab_path = os.path.join(config["bert_model_path"], "vocab.txt") 18 | self.label_to_index = self.load_vocab() 19 | self.index_to_label = {value: key for key, value in self.label_to_index.items()} 20 | self.word_vectors = None 21 | self.sequence_length = self.config["sequence_length"] 22 | 23 | # 创建模型 24 | self.create_model() 25 | # 加载计算图 26 | self.load_graph() 27 | 28 | def load_vocab(self): 29 | # 将词汇-索引映射表加载出来 30 | 31 | with open(os.path.join(self.output_path, "label_to_index.json"), "r") as f: 32 | label_to_index = json.load(f) 33 | 34 | return label_to_index 35 | 36 | def _truncate_seq_pair(self, tokens_a, tokens_b, max_len): 37 | """Truncates a sequence pair in place to the maximum length.""" 38 | 39 | # This is a simple heuristic which will always truncate the longer sequence 40 | # one token at a time. This makes more sense than truncating an equal percent 41 | # of tokens from each, since if one sequence is very short then each token 42 | # that's truncated likely contains more information than a longer sequence. 43 | while True: 44 | total_length = len(tokens_a) + len(tokens_b) 45 | if total_length <= max_len: 46 | break 47 | if len(tokens_a) > len(tokens_b): 48 | tokens_a.pop() 49 | else: 50 | tokens_b.pop() 51 | 52 | def padding(self, input_id, input_mask, segment_id): 53 | """ 54 | 对序列进行补全 55 | :param input_id: 56 | :param input_mask: 57 | :param segment_id: 58 | :return: 59 | """ 60 | 61 | if len(input_id) < self.sequence_length: 62 | pad_input_id = input_id + [0] * (self.sequence_length - len(input_id)) 63 | pad_input_mask = input_mask + [0] * (self.sequence_length - len(input_mask)) 64 | pad_segment_id = segment_id + [0] * (self.sequence_length - len(segment_id)) 65 | else: 66 | pad_input_id = input_id[:self.sequence_length] 67 | pad_input_mask = input_mask[:self.sequence_length] 68 | pad_segment_id = segment_id[:self.sequence_length] 69 | 70 | return pad_input_id, pad_input_mask, pad_segment_id 71 | 72 | def sentence_to_idx(self, text_a, text_b): 73 | """ 74 | 将分词后的句子转换成idx表示 75 | :return: 76 | """ 77 | tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_path, do_lower_case=True) 78 | 79 | text_a = tokenization.convert_to_unicode(text_a) 80 | text_b = tokenization.convert_to_unicode(text_b) 81 | tokens_a = tokenizer.tokenize(text_a) 82 | tokens_b = tokenizer.tokenize(text_b) 83 | 84 | # 判断两条序列组合在一起长度是否超过最大长度 85 | self._truncate_seq_pair(tokens_a, tokens_b, self.sequence_length - 3) 86 | 87 | tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"] 88 | input_id = tokenizer.convert_tokens_to_ids(tokens) 89 | input_mask = [1] * len(input_id) 90 | segment_id = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1) 91 | 92 | input_id, input_mask, segment_id = self.padding(input_id, input_mask, segment_id) 93 | 94 | return [input_id], [input_mask], [segment_id] 95 | 96 | def load_graph(self): 97 | """ 98 | 加载计算图 99 | :return: 100 | """ 101 | self.sess = tf.Session() 102 | ckpt = tf.train.get_checkpoint_state(self.config["ckpt_model_path"]) 103 | if ckpt and 
tf.train.checkpoint_exists(ckpt.model_checkpoint_path): 104 | print('Reloading model parameters..') 105 | self.model.saver.restore(self.sess, ckpt.model_checkpoint_path) 106 | else: 107 | raise ValueError('No such file:[{}]'.format(self.config["ckpt_model_path"])) 108 | 109 | def create_model(self): 110 | """ 111 | 根据config文件选择对应的模型,并初始化 112 | :return: 113 | """ 114 | self.model = AlbertSentencePair(config=self.config, is_training=False) 115 | 116 | def predict(self, text_a, text_b): 117 | """ 118 | 给定分词后的句子,预测其分类结果 119 | :param text_a: 120 | :param text_b: 121 | :return: 122 | """ 123 | input_id, input_mask, segment_id = self.sentence_to_idx(text_a, text_b) 124 | 125 | prediction = self.model.infer(self.sess, 126 | dict(input_ids=input_id, 127 | input_masks=input_mask, 128 | segment_ids=segment_id)).tolist()[0][0] 129 | label = self.index_to_label[prediction] 130 | return label 131 | 132 | 133 | -------------------------------------------------------------------------------- /bert_task/sentence_pair_task/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.append(os.path.dirname(os.getcwd())) 4 | import tensorflow as tf 5 | 6 | from bert import modeling 7 | from bert import optimization 8 | 9 | 10 | class BertSentencePair(object): 11 | def __init__(self, config, is_training=True, num_train_step=None, num_warmup_step=None): 12 | self.__bert_config_path = os.path.join(config["bert_model_path"], "bert_config.json") 13 | self.__num_classes = config["num_classes"] 14 | self.__learning_rate = config["learning_rate"] 15 | self.__is_training = is_training 16 | self.__num_train_step = num_train_step 17 | self.__num_warmup_step = num_warmup_step 18 | 19 | self.input_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_ids') 20 | self.input_masks = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_mask') 21 | self.segment_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name='segment_ids') 22 | self.label_ids = tf.placeholder(dtype=tf.int32, shape=[None], name="label_ids") 23 | 24 | self.built_model() 25 | self.init_saver() 26 | 27 | def built_model(self): 28 | bert_config = modeling.BertConfig.from_json_file(self.__bert_config_path) 29 | 30 | model = modeling.BertModel(config=bert_config, 31 | is_training=self.__is_training, 32 | input_ids=self.input_ids, 33 | input_mask=self.input_masks, 34 | token_type_ids=self.segment_ids, 35 | use_one_hot_embeddings=False) 36 | output_layer = model.get_pooled_output() 37 | 38 | hidden_size = output_layer.shape[-1].value 39 | if self.__is_training: 40 | # I.e., 0.1 dropout 41 | output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) 42 | 43 | with tf.name_scope("output"): 44 | output_weights = tf.get_variable( 45 | "output_weights", [self.__num_classes, hidden_size], 46 | initializer=tf.truncated_normal_initializer(stddev=0.02)) 47 | 48 | output_bias = tf.get_variable( 49 | "output_bias", [self.__num_classes], initializer=tf.zeros_initializer()) 50 | 51 | logits = tf.matmul(output_layer, output_weights, transpose_b=True) 52 | logits = tf.nn.bias_add(logits, output_bias) 53 | if self.__num_classes == 1: 54 | self.predictions = tf.cast(tf.greater_equal(logits, 0.0), dtype=tf.int32, name="predictions") 55 | else: 56 | self.predictions = tf.argmax(logits, axis=-1, name="predictions") 57 | 58 | if self.__is_training: 59 | 60 | with tf.name_scope("loss"): 61 | if self.__num_classes == 1: 62 | losses = 
tf.nn.sigmoid_cross_entropy_with_logits(logits=tf.reshape(logits, [-1]), 63 | labels=tf.cast(self.label_ids, dtype=tf.float32,)) 64 | else: 65 | losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=self.label_ids) 66 | self.loss = tf.reduce_mean(losses, name="loss") 67 | 68 | with tf.name_scope('train_op'): 69 | self.train_op = optimization.create_optimizer( 70 | self.loss, self.__learning_rate, self.__num_train_step, self.__num_warmup_step, use_tpu=False) 71 | 72 | def init_saver(self): 73 | self.saver = tf.train.Saver(tf.global_variables()) 74 | 75 | def train(self, sess, batch): 76 | """ 77 | 训练模型 78 | :param sess: tf的会话对象 79 | :param batch: batch数据 80 | :return: 损失和预测结果 81 | """ 82 | 83 | feed_dict = {self.input_ids: batch["input_ids"], 84 | self.input_masks: batch["input_masks"], 85 | self.segment_ids: batch["segment_ids"], 86 | self.label_ids: batch["label_ids"]} 87 | 88 | # 训练模型 89 | _, loss, predictions = sess.run([self.train_op, self.loss, self.predictions], feed_dict=feed_dict) 90 | return loss, predictions 91 | 92 | def eval(self, sess, batch): 93 | """ 94 | 验证模型 95 | :param sess: tf中的会话对象 96 | :param batch: batch数据 97 | :return: 损失和预测结果 98 | """ 99 | feed_dict = {self.input_ids: batch["input_ids"], 100 | self.input_masks: batch["input_masks"], 101 | self.segment_ids: batch["segment_ids"], 102 | self.label_ids: batch["label_ids"]} 103 | 104 | loss, predictions = sess.run([self.loss, self.predictions], feed_dict=feed_dict) 105 | return loss, predictions 106 | 107 | def infer(self, sess, batch): 108 | """ 109 | 预测新数据 110 | :param sess: tf中的会话对象 111 | :param batch: batch数据 112 | :return: 预测结果 113 | """ 114 | feed_dict = {self.input_ids: batch["input_ids"], 115 | self.input_masks: batch["input_masks"], 116 | self.segment_ids: batch["segment_ids"]} 117 | 118 | predict = sess.run(self.predictions, feed_dict=feed_dict) 119 | 120 | return predict 121 | -------------------------------------------------------------------------------- /albert_task/sentence_pair_task/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.append(os.path.dirname(os.getcwd())) 4 | import tensorflow as tf 5 | 6 | from albert import modeling 7 | from albert import optimization 8 | 9 | 10 | class AlbertSentencePair(object): 11 | def __init__(self, config, is_training=True, num_train_step=None, num_warmup_step=None): 12 | self.__bert_config_path = os.path.join(config["bert_model_path"], "albert_config.json") 13 | self.__num_classes = config["num_classes"] 14 | self.__learning_rate = config["learning_rate"] 15 | self.__is_training = is_training 16 | self.__num_train_step = num_train_step 17 | self.__num_warmup_step = num_warmup_step 18 | 19 | self.input_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_ids') 20 | self.input_masks = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_mask') 21 | self.segment_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name='segment_ids') 22 | self.label_ids = tf.placeholder(dtype=tf.int32, shape=[None], name="label_ids") 23 | 24 | self.built_model() 25 | self.init_saver() 26 | 27 | def built_model(self): 28 | bert_config = modeling.BertConfig.from_json_file(self.__bert_config_path) 29 | 30 | model = modeling.BertModel(config=bert_config, 31 | is_training=self.__is_training, 32 | input_ids=self.input_ids, 33 | input_mask=self.input_masks, 34 | token_type_ids=self.segment_ids, 35 | use_one_hot_embeddings=False) 36 | output_layer = 
model.get_pooled_output() 37 | 38 | hidden_size = output_layer.shape[-1].value 39 | if self.__is_training: 40 | # I.e., 0.1 dropout 41 | output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) 42 | 43 | with tf.name_scope("output"): 44 | output_weights = tf.get_variable( 45 | "output_weights", [self.__num_classes, hidden_size], 46 | initializer=tf.truncated_normal_initializer(stddev=0.02)) 47 | 48 | output_bias = tf.get_variable( 49 | "output_bias", [self.__num_classes], initializer=tf.zeros_initializer()) 50 | 51 | logits = tf.matmul(output_layer, output_weights, transpose_b=True) 52 | logits = tf.nn.bias_add(logits, output_bias) 53 | if self.__num_classes == 1: 54 | self.predictions = tf.cast(tf.greater_equal(logits, 0.0), dtype=tf.int32, name="predictions") 55 | else: 56 | self.predictions = tf.argmax(logits, axis=-1, name="predictions") 57 | 58 | if self.__is_training: 59 | with tf.name_scope("loss"): 60 | if self.__num_classes == 1: 61 | losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=tf.reshape(logits, [-1]), 62 | labels=tf.cast(self.label_ids, dtype=tf.float32)) 63 | else: 64 | losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=self.label_ids) 65 | self.loss = tf.reduce_mean(losses, name="loss") 66 | 67 | with tf.name_scope('train_op'): 68 | self.train_op = optimization.create_optimizer( 69 | self.loss, self.__learning_rate, self.__num_train_step, self.__num_warmup_step, use_tpu=False) 70 | 71 | def init_saver(self): 72 | self.saver = tf.train.Saver(tf.global_variables()) 73 | 74 | def train(self, sess, batch): 75 | """ 76 | 训练模型 77 | :param sess: tf的会话对象 78 | :param batch: batch数据 79 | :return: 损失和预测结果 80 | """ 81 | 82 | feed_dict = {self.input_ids: batch["input_ids"], 83 | self.input_masks: batch["input_masks"], 84 | self.segment_ids: batch["segment_ids"], 85 | self.label_ids: batch["label_ids"]} 86 | 87 | # 训练模型 88 | _, loss, predictions = sess.run([self.train_op, self.loss, self.predictions], feed_dict=feed_dict) 89 | return loss, predictions 90 | 91 | def eval(self, sess, batch): 92 | """ 93 | 验证模型 94 | :param sess: tf中的会话对象 95 | :param batch: batch数据 96 | :return: 损失和预测结果 97 | """ 98 | feed_dict = {self.input_ids: batch["input_ids"], 99 | self.input_masks: batch["input_masks"], 100 | self.segment_ids: batch["segment_ids"], 101 | self.label_ids: batch["label_ids"]} 102 | 103 | loss, predictions = sess.run([self.loss, self.predictions], feed_dict=feed_dict) 104 | return loss, predictions 105 | 106 | def infer(self, sess, batch): 107 | """ 108 | 预测新数据 109 | :param sess: tf中的会话对象 110 | :param batch: batch数据 111 | :return: 预测结果 112 | """ 113 | feed_dict = {self.input_ids: batch["input_ids"], 114 | self.input_masks: batch["input_masks"], 115 | self.segment_ids: batch["segment_ids"]} 116 | 117 | predict = sess.run(self.predictions, feed_dict=feed_dict) 118 | 119 | return predict 120 | -------------------------------------------------------------------------------- /albert_task/ner_task/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.append(os.path.dirname(os.getcwd())) 5 | import tensorflow as tf 6 | 7 | from albert import modeling 8 | from albert import optimization_finetuning as optimization 9 | from bilstm_crf import BiLSTMCRF 10 | 11 | 12 | class ALBertNer(object): 13 | def __init__(self, config, is_training=True, num_train_step=None, num_warmup_step=None): 14 | self.__bert_config_path = os.path.join(config["bert_model_path"], "albert_config.json") 15 | 
self.__num_classes = config["num_classes"] 16 | self.__learning_rate = config["learning_rate"] 17 | self.__ner_layers = config["ner_layers"] 18 | self.__ner_hidden_sizes = config["ner_hidden_sizes"] 19 | self.__max_len = config["sequence_length"] 20 | self.__is_training = is_training 21 | self.__num_train_step = num_train_step 22 | self.__num_warmup_step = num_warmup_step 23 | 24 | self.input_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_ids') 25 | self.input_masks = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_mask') 26 | self.segment_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name='segment_ids') 27 | self.label_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name="label_ids") 28 | self.sequence_len = tf.placeholder(dtype=tf.int32, shape=[None], name="sequence_len") 29 | self.keep_prob = tf.placeholder(dtype=tf.float32, shape=None, name="keep_prob") 30 | 31 | self.built_model() 32 | self.init_saver() 33 | 34 | def built_model(self): 35 | bert_config = modeling.BertConfig.from_json_file(self.__bert_config_path) 36 | 37 | model = modeling.BertModel(config=bert_config, 38 | is_training=self.__is_training, 39 | input_ids=self.input_ids, 40 | input_mask=self.input_masks, 41 | token_type_ids=self.segment_ids, 42 | use_one_hot_embeddings=False) 43 | 44 | # 获取bert最后一层的输出 45 | output_layer = model.get_sequence_output() 46 | 47 | if self.__is_training: 48 | output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) 49 | 50 | ner_model = BiLSTMCRF(embedded_chars=output_layer, 51 | hidden_sizes=self.__ner_hidden_sizes, 52 | layers=self.__ner_layers, 53 | keep_prob=self.keep_prob, 54 | num_labels=self.__num_classes, 55 | max_len=self.__max_len, 56 | labels=self.label_ids, 57 | sequence_lens=self.sequence_len, 58 | is_training=self.__is_training) 59 | 60 | self.loss, self.true_y, self.predictions = ner_model.construct_graph() 61 | 62 | if self.__is_training: 63 | with tf.name_scope('train_op'): 64 | self.train_op = optimization.create_optimizer( 65 | self.loss, self.__learning_rate, self.__num_train_step, self.__num_warmup_step, use_tpu=False) 66 | 67 | def init_saver(self): 68 | self.saver = tf.train.Saver(tf.global_variables()) 69 | 70 | def train(self, sess, batch, dropout_rate): 71 | """ 72 | 训练模型 73 | :param sess: tf的会话对象 74 | :param batch: batch数据 75 | :param dropout_rate: dropout rate 76 | :return: 损失和预测结果 77 | """ 78 | 79 | feed_dict = {self.input_ids: batch["input_ids"], 80 | self.input_masks: batch["input_masks"], 81 | self.segment_ids: batch["segment_ids"], 82 | self.label_ids: batch["label_ids"], 83 | self.sequence_len: batch["sequence_len"], 84 | self.keep_prob: dropout_rate} 85 | 86 | # 训练模型 87 | _, loss, true_y, predictions = sess.run([self.train_op, self.loss, self.true_y, self.predictions], 88 | feed_dict=feed_dict) 89 | return loss, true_y, predictions 90 | 91 | def eval(self, sess, batch): 92 | """ 93 | 验证模型 94 | :param sess: tf中的会话对象 95 | :param batch: batch数据 96 | :return: 损失和预测结果 97 | """ 98 | feed_dict = {self.input_ids: batch["input_ids"], 99 | self.input_masks: batch["input_masks"], 100 | self.segment_ids: batch["segment_ids"], 101 | self.label_ids: batch["label_ids"], 102 | self.sequence_len: batch["sequence_len"], 103 | self.keep_prob: 1.0} 104 | 105 | loss, true_y, predictions = sess.run([self.loss, self.true_y, self.predictions], feed_dict=feed_dict) 106 | return loss, true_y, predictions 107 | 108 | def infer(self, sess, batch): 109 | """ 110 | 预测新数据 111 | :param sess: tf中的会话对象 112 | :param batch: batch数据 113 | 
:return: 预测结果 114 | """ 115 | feed_dict = {self.input_ids: batch["input_ids"], 116 | self.input_masks: batch["input_masks"], 117 | self.segment_ids: batch["segment_ids"], 118 | self.sequence_len: batch["sequence_len"], 119 | self.keep_prob: 1.0} 120 | 121 | predict = sess.run(self.predictions, feed_dict=feed_dict) 122 | 123 | return predict 124 | -------------------------------------------------------------------------------- /bert_task/ner_task/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.append(os.path.dirname(os.getcwd())) 5 | import tensorflow as tf 6 | 7 | from bert import modeling 8 | from bert import optimization 9 | from bilstm_crf import BiLSTMCRF 10 | 11 | 12 | class BertNer(object): 13 | def __init__(self, config, is_training=True, num_train_step=None, num_warmup_step=None): 14 | self.__bert_config_path = os.path.join(config["bert_model_path"], "bert_config.json") 15 | self.__num_classes = config["num_classes"] 16 | self.__learning_rate = config["learning_rate"] 17 | self.__ner_layers = config["ner_layers"] 18 | self.__ner_hidden_sizes = config["ner_hidden_sizes"] 19 | self.__max_len = config["sequence_length"] 20 | self.__is_training = is_training 21 | self.__num_train_step = num_train_step 22 | self.__num_warmup_step = num_warmup_step 23 | 24 | self.input_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_ids') 25 | self.input_masks = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_mask') 26 | self.segment_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name='segment_ids') 27 | self.label_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name="label_ids") 28 | self.sequence_len = tf.placeholder(dtype=tf.int32, shape=[None], name="sequence_len") 29 | self.keep_prob = tf.placeholder(dtype=tf.float32, shape=None, name="keep_prob") 30 | 31 | self.built_model() 32 | self.init_saver() 33 | 34 | def built_model(self): 35 | bert_config = modeling.BertConfig.from_json_file(self.__bert_config_path) 36 | 37 | model = modeling.BertModel(config=bert_config, 38 | is_training=self.__is_training, 39 | input_ids=self.input_ids, 40 | input_mask=self.input_masks, 41 | token_type_ids=self.segment_ids, 42 | use_one_hot_embeddings=False) 43 | 44 | # 获取bert最后一层的输出 45 | output_layer = model.get_sequence_output() 46 | 47 | hidden_size = output_layer.shape[-1].value 48 | if self.__is_training: 49 | output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) 50 | 51 | ner_model = BiLSTMCRF(embedded_chars=output_layer, 52 | hidden_sizes=self.__ner_hidden_sizes, 53 | layers=self.__ner_layers, 54 | keep_prob=self.keep_prob, 55 | num_labels=self.__num_classes, 56 | max_len=self.__max_len, 57 | labels=self.label_ids, 58 | sequence_lens=self.sequence_len, 59 | is_training=self.__is_training) 60 | 61 | self.loss, self.true_y, self.predictions = ner_model.construct_graph() 62 | 63 | if self.__is_training: 64 | with tf.name_scope('train_op'): 65 | self.train_op = optimization.create_optimizer( 66 | self.loss, self.__learning_rate, self.__num_train_step, self.__num_warmup_step, use_tpu=False) 67 | 68 | def init_saver(self): 69 | self.saver = tf.train.Saver(tf.global_variables()) 70 | 71 | def train(self, sess, batch, dropout_rate): 72 | """ 73 | 训练模型 74 | :param sess: tf的会话对象 75 | :param batch: batch数据 76 | :param dropout_rate: dropout rate 77 | :return: 损失和预测结果 78 | """ 79 | 80 | feed_dict = {self.input_ids: batch["input_ids"], 81 | self.input_masks: batch["input_masks"], 82 
| self.segment_ids: batch["segment_ids"], 83 | self.label_ids: batch["label_ids"], 84 | self.sequence_len: batch["sequence_len"], 85 | self.keep_prob: dropout_rate} 86 | 87 | # 训练模型 88 | _, loss, true_y, predictions = sess.run([self.train_op, self.loss, self.true_y, self.predictions], 89 | feed_dict=feed_dict) 90 | return loss, true_y, predictions 91 | 92 | def eval(self, sess, batch): 93 | """ 94 | 验证模型 95 | :param sess: tf中的会话对象 96 | :param batch: batch数据 97 | :return: 损失和预测结果 98 | """ 99 | feed_dict = {self.input_ids: batch["input_ids"], 100 | self.input_masks: batch["input_masks"], 101 | self.segment_ids: batch["segment_ids"], 102 | self.label_ids: batch["label_ids"], 103 | self.sequence_len: batch["sequence_len"], 104 | self.keep_prob: 1.0} 105 | 106 | loss, true_y, predictions = sess.run([self.loss, self.true_y, self.predictions], feed_dict=feed_dict) 107 | return loss, true_y, predictions 108 | 109 | def infer(self, sess, batch): 110 | """ 111 | 预测新数据 112 | :param sess: tf中的会话对象 113 | :param batch: batch数据 114 | :return: 预测结果 115 | """ 116 | feed_dict = {self.input_ids: batch["input_ids"], 117 | self.input_masks: batch["input_masks"], 118 | self.segment_ids: batch["segment_ids"], 119 | self.sequence_len: batch["sequence_len"], 120 | self.keep_prob: 1.0} 121 | 122 | predict = sess.run(self.predictions, feed_dict=feed_dict) 123 | 124 | return predict 125 | -------------------------------------------------------------------------------- /albert_task/albert/bert_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import collections 6 | import copy 7 | import json 8 | import math 9 | import re 10 | import six 11 | import tensorflow as tf 12 | 13 | 14 | def get_shape_list(tensor, expected_rank=None, name=None): 15 | """Returns a list of the shape of tensor, preferring static dimensions. 16 | 17 | Args: 18 | tensor: A tf.Tensor object to find the shape of. 19 | expected_rank: (optional) int. The expected rank of `tensor`. If this is 20 | specified and the `tensor` has a different rank, and exception will be 21 | thrown. 22 | name: Optional name of the tensor for the error message. 23 | 24 | Returns: 25 | A list of dimensions of the shape of tensor. All static dimensions will 26 | be returned as python integers, and dynamic dimensions will be returned 27 | as tf.Tensor scalars. 28 | """ 29 | if name is None: 30 | name = tensor.name 31 | 32 | if expected_rank is not None: 33 | assert_rank(tensor, expected_rank, name) 34 | 35 | shape = tensor.shape.as_list() 36 | 37 | non_static_indexes = [] 38 | for (index, dim) in enumerate(shape): 39 | if dim is None: 40 | non_static_indexes.append(index) 41 | 42 | if not non_static_indexes: 43 | return shape 44 | 45 | dyn_shape = tf.shape(tensor) 46 | for index in non_static_indexes: 47 | shape[index] = dyn_shape[index] 48 | return shape 49 | 50 | 51 | def reshape_to_matrix(input_tensor): 52 | """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix).""" 53 | ndims = input_tensor.shape.ndims 54 | if ndims < 2: 55 | raise ValueError("Input tensor must have at least rank 2. 
Shape = %s" % 56 | (input_tensor.shape)) 57 | if ndims == 2: 58 | return input_tensor 59 | 60 | width = input_tensor.shape[-1] 61 | output_tensor = tf.reshape(input_tensor, [-1, width]) 62 | return output_tensor 63 | 64 | 65 | def reshape_from_matrix(output_tensor, orig_shape_list): 66 | """Reshapes a rank 2 tensor back to its original rank >= 2 tensor.""" 67 | if len(orig_shape_list) == 2: 68 | return output_tensor 69 | 70 | output_shape = get_shape_list(output_tensor) 71 | 72 | orig_dims = orig_shape_list[0:-1] 73 | width = output_shape[-1] 74 | 75 | return tf.reshape(output_tensor, orig_dims + [width]) 76 | 77 | 78 | def assert_rank(tensor, expected_rank, name=None): 79 | """Raises an exception if the tensor rank is not of the expected rank. 80 | 81 | Args: 82 | tensor: A tf.Tensor to check the rank of. 83 | expected_rank: Python integer or list of integers, expected rank. 84 | name: Optional name of the tensor for the error message. 85 | 86 | Raises: 87 | ValueError: If the expected shape doesn't match the actual shape. 88 | """ 89 | if name is None: 90 | name = tensor.name 91 | 92 | expected_rank_dict = {} 93 | if isinstance(expected_rank, six.integer_types): 94 | expected_rank_dict[expected_rank] = True 95 | else: 96 | for x in expected_rank: 97 | expected_rank_dict[x] = True 98 | 99 | actual_rank = tensor.shape.ndims 100 | if actual_rank not in expected_rank_dict: 101 | scope_name = tf.get_variable_scope().name 102 | raise ValueError( 103 | "For the tensor `%s` in scope `%s`, the actual rank " 104 | "`%d` (shape = %s) is not equal to the expected rank `%s`" % 105 | (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))) 106 | 107 | 108 | def gather_indexes(sequence_tensor, positions): 109 | """Gathers the vectors at the specific positions over a minibatch.""" 110 | sequence_shape = get_shape_list(sequence_tensor, expected_rank=3) 111 | batch_size = sequence_shape[0] 112 | seq_length = sequence_shape[1] 113 | width = sequence_shape[2] 114 | 115 | flat_offsets = tf.reshape( 116 | tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) 117 | flat_positions = tf.reshape(positions + flat_offsets, [-1]) 118 | flat_sequence_tensor = tf.reshape(sequence_tensor, 119 | [batch_size * seq_length, width]) 120 | output_tensor = tf.gather(flat_sequence_tensor, flat_positions) 121 | return output_tensor 122 | 123 | 124 | # add sequence mask for: 125 | # 1. random shuffle lm modeling---xlnet with random shuffled input 126 | # 2. left2right and right2left language modeling 127 | # 3. 
conditional generation 128 | def generate_seq2seq_mask(attention_mask, mask_sequence, seq_type, **kargs): 129 | if seq_type == 'seq2seq': 130 | if mask_sequence is not None: 131 | seq_shape = get_shape_list(mask_sequence, expected_rank=2) 132 | seq_len = seq_shape[1] 133 | ones = tf.ones((1, seq_len, seq_len)) 134 | a_mask = tf.matrix_band_part(ones, -1, 0) 135 | s_ex12 = tf.expand_dims(tf.expand_dims(mask_sequence, 1), 2) 136 | s_ex13 = tf.expand_dims(tf.expand_dims(mask_sequence, 1), 3) 137 | a_mask = (1 - s_ex13) * (1 - s_ex12) + s_ex13 * a_mask 138 | # generate mask of batch x seq_len x seq_len 139 | a_mask = tf.reshape(a_mask, (-1, seq_len, seq_len)) 140 | out_mask = attention_mask * a_mask 141 | else: 142 | ones = tf.ones_like(attention_mask[:1]) 143 | mask = (tf.matrix_band_part(ones, -1, 0)) 144 | out_mask = attention_mask * mask 145 | else: 146 | out_mask = attention_mask 147 | 148 | return out_mask 149 | -------------------------------------------------------------------------------- /albert_task/machine_reading_task/trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import time 5 | import sys 6 | 7 | sys.path.append(os.path.dirname(os.getcwd())) 8 | import tensorflow as tf 9 | from albert import modeling 10 | from model import AlbertMachineReading 11 | from data_helper import TrainData 12 | from metrics import get_eval, write_predictions 13 | 14 | 15 | class Trainer(object): 16 | def __init__(self, args): 17 | self.args = args 18 | with open(args.config_path, "r") as fr: 19 | self.config = json.load(fr) 20 | self.__bert_checkpoint_path = os.path.join(self.config["bert_model_path"], "albert_model.ckpt") 21 | 22 | # 加载数据集 23 | self.data_obj = self.load_data() 24 | self.t_features = self.data_obj.gen_data(self.config["train_data"]) 25 | 26 | self.e_examples, self.e_features = self.data_obj.gen_data(self.config["eval_data"], is_training=False) 27 | print("train data size: {}".format(len(self.t_features))) 28 | print("eval data size: {}".format(len(self.e_features))) 29 | 30 | num_train_steps = int( 31 | len(self.t_features) / self.config["batch_size"] * self.config["epochs"]) 32 | num_warmup_steps = int(num_train_steps * self.config["warmup_rate"]) 33 | # 初始化模型对象 34 | self.model = self.create_model(num_train_steps, num_warmup_steps) 35 | 36 | def load_data(self): 37 | """ 38 | 创建数据对象 39 | :return: 40 | """ 41 | # 生成训练集对象并生成训练数据 42 | data_obj = TrainData(self.config) 43 | return data_obj 44 | 45 | def create_model(self, num_train_step, num_warmup_step): 46 | """ 47 | 根据config文件选择对应的模型,并初始化 48 | :return: 49 | """ 50 | model = AlbertMachineReading(config=self.config, num_train_step=num_train_step, num_warmup_step=num_warmup_step) 51 | return model 52 | 53 | def train(self): 54 | with tf.Session() as sess: 55 | tvars = tf.trainable_variables() 56 | (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint( 57 | tvars, self.__bert_checkpoint_path) 58 | print("init bert model params") 59 | tf.train.init_from_checkpoint(self.__bert_checkpoint_path, assignment_map) 60 | print("init bert model params done") 61 | sess.run(tf.variables_initializer(tf.global_variables())) 62 | 63 | current_step = 0 64 | start = time.time() 65 | for epoch in range(self.config["epochs"]): 66 | print("----- Epoch {}/{} -----".format(epoch + 1, self.config["epochs"])) 67 | 68 | for batch in self.data_obj.next_batch(self.t_features): 69 | loss, start_logits, end_logits = 
self.model.train(sess, batch) 70 | # print("start: ", start_logits) 71 | # print("end: ", end_logits) 72 | print("train: step: {}, loss: {}".format(current_step, loss)) 73 | 74 | current_step += 1 75 | if self.data_obj and current_step % self.config["checkpoint_every"] == 0: 76 | 77 | all_results = [] 78 | for eval_batch in self.data_obj.next_batch(self.e_features, is_training=False): 79 | start_logits, end_logits = self.model.eval(sess, eval_batch) 80 | 81 | for unique_id, start_logit, end_logit in zip(eval_batch["unique_id"], 82 | start_logits, 83 | end_logits): 84 | all_results.append(dict(unique_id=unique_id, 85 | start_logits=start_logit.tolist(), 86 | end_logits=end_logit.tolist())) 87 | 88 | with open("output/cmrc2018/results.json", "w", encoding="utf8") as fw: 89 | json.dump(all_results, fw, indent=4, ensure_ascii=False) 90 | 91 | write_predictions(all_examples=self.e_examples, 92 | all_features=self.e_features, 93 | all_results=all_results, 94 | n_best_size=self.config["n_best_size"], 95 | max_answer_length=self.config["max_answer_length"], 96 | output_prediction_file=self.config["output_predictions_path"], 97 | output_nbest_file=self.config["output_nbest_path"]) 98 | 99 | result = get_eval(original_file=self.config["eval_data"], 100 | prediction_file=self.config["output_predictions_path"]) 101 | 102 | print("\n") 103 | print("eval: step: {}, f1: {}, em: {}".format(current_step, result["f1"], result["em"])) 104 | print("\n") 105 | 106 | if self.config["ckpt_model_path"]: 107 | save_path = self.config["ckpt_model_path"] 108 | if not os.path.exists(save_path): 109 | os.makedirs(save_path) 110 | model_save_path = os.path.join(save_path, self.config["model_name"]) 111 | self.model.saver.save(sess, model_save_path, global_step=current_step) 112 | 113 | end = time.time() 114 | print("total train time: ", end - start) 115 | 116 | 117 | if __name__ == "__main__": 118 | # 读取用户在命令行输入的信息 119 | parser = argparse.ArgumentParser() 120 | parser.add_argument("--config_path", help="config path of model") 121 | args = parser.parse_args() 122 | trainer = Trainer(args) 123 | trainer.train() 124 | -------------------------------------------------------------------------------- /bert_task/machine_reading_task/trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import time 5 | import collections 6 | import sys 7 | 8 | sys.path.append(os.path.dirname(os.getcwd())) 9 | import tensorflow as tf 10 | from bert import modeling 11 | from model import BertMachineReading 12 | from data_helper import TrainData 13 | from metrics import get_eval, write_predictions 14 | 15 | 16 | class Trainer(object): 17 | def __init__(self, args): 18 | self.args = args 19 | with open(args.config_path, "r") as fr: 20 | self.config = json.load(fr) 21 | self.__bert_checkpoint_path = os.path.join(self.config["bert_model_path"], "bert_model.ckpt") 22 | 23 | # 加载数据集 24 | self.data_obj = self.load_data() 25 | self.t_features = self.data_obj.gen_data(self.config["train_data"]) 26 | 27 | self.e_examples, self.e_features = self.data_obj.gen_data(self.config["eval_data"], is_training=False) 28 | print("train data size: {}".format(len(self.t_features))) 29 | print("eval data size: {}".format(len(self.e_features))) 30 | 31 | num_train_steps = int( 32 | len(self.t_features) / self.config["batch_size"] * self.config["epochs"]) 33 | num_warmup_steps = int(num_train_steps * self.config["warmup_rate"]) 34 | # 初始化模型对象 35 | self.model = 
--------------------------------------------------------------------------------
/bert_task/machine_reading_task/trainer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 | import time
5 | import collections
6 | import sys
7 | 
8 | sys.path.append(os.path.dirname(os.getcwd()))
9 | import tensorflow as tf
10 | from bert import modeling
11 | from model import BertMachineReading
12 | from data_helper import TrainData
13 | from metrics import get_eval, write_predictions
14 | 
15 | 
16 | class Trainer(object):
17 |     def __init__(self, args):
18 |         self.args = args
19 |         with open(args.config_path, "r") as fr:
20 |             self.config = json.load(fr)
21 |         self.__bert_checkpoint_path = os.path.join(self.config["bert_model_path"], "bert_model.ckpt")
22 | 
23 |         # load the datasets
24 |         self.data_obj = self.load_data()
25 |         self.t_features = self.data_obj.gen_data(self.config["train_data"])
26 | 
27 |         self.e_examples, self.e_features = self.data_obj.gen_data(self.config["eval_data"], is_training=False)
28 |         print("train data size: {}".format(len(self.t_features)))
29 |         print("eval data size: {}".format(len(self.e_features)))
30 | 
31 |         num_train_steps = int(
32 |             len(self.t_features) / self.config["batch_size"] * self.config["epochs"])
33 |         num_warmup_steps = int(num_train_steps * self.config["warmup_rate"])
34 |         # initialize the model object
35 |         self.model = self.create_model(num_train_steps, num_warmup_steps)
36 | 
37 |     def load_data(self):
38 |         """
39 |         create the data object
40 |         :return:
41 |         """
42 |         # create the training-data object, which generates the training data
43 |         data_obj = TrainData(self.config)
44 |         return data_obj
45 | 
46 |     def create_model(self, num_train_step, num_warmup_step):
47 |         """
48 |         select the model according to the config file and initialize it
49 |         :return:
50 |         """
51 |         model = BertMachineReading(config=self.config, num_train_step=num_train_step, num_warmup_step=num_warmup_step)
52 |         return model
53 | 
54 |     def train(self):
55 |         with tf.Session() as sess:
56 |             tvars = tf.trainable_variables()
57 |             (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
58 |                 tvars, self.__bert_checkpoint_path)
59 |             print("init bert model params")
60 |             tf.train.init_from_checkpoint(self.__bert_checkpoint_path, assignment_map)
61 |             print("init bert model params done")
62 |             sess.run(tf.variables_initializer(tf.global_variables()))
63 | 
64 |             current_step = 0
65 |             start = time.time()
66 |             for epoch in range(self.config["epochs"]):
67 |                 print("----- Epoch {}/{} -----".format(epoch + 1, self.config["epochs"]))
68 | 
69 |                 for batch in self.data_obj.next_batch(self.t_features):
70 |                     loss, start_logits, end_logits = self.model.train(sess, batch)
71 |                     # print("start: ", start_logits)
72 |                     # print("end: ", end_logits)
73 |                     print("train: step: {}, loss: {}".format(current_step, loss))
74 | 
75 |                     current_step += 1
76 |                     if self.data_obj and current_step % self.config["checkpoint_every"] == 0:
77 | 
78 |                         all_results = []
79 |                         for eval_batch in self.data_obj.next_batch(self.e_features, is_training=False):
80 |                             start_logits, end_logits = self.model.eval(sess, eval_batch)
81 | 
82 |                             for unique_id, start_logit, end_logit in zip(eval_batch["unique_id"],
83 |                                                                          start_logits,
84 |                                                                          end_logits):
85 |                                 all_results.append(dict(unique_id=unique_id,
86 |                                                         start_logits=start_logit.tolist(),
87 |                                                         end_logits=end_logit.tolist()))
88 | 
89 |                         with open("output/cmrc2018/results.json", "w", encoding="utf8") as fw:  # assumes output/cmrc2018/ exists
90 |                             json.dump(all_results, fw, indent=4, ensure_ascii=False)
91 | 
92 |                         write_predictions(all_examples=self.e_examples,
93 |                                           all_features=self.e_features,
94 |                                           all_results=all_results,
95 |                                           n_best_size=self.config["n_best_size"],
96 |                                           max_answer_length=self.config["max_answer_length"],
97 |                                           output_prediction_file=self.config["output_predictions_path"],
98 |                                           output_nbest_file=self.config["output_nbest_path"])
99 | 
100 |                         result = get_eval(original_file=self.config["eval_data"],
101 |                                           prediction_file=self.config["output_predictions_path"])
102 | 
103 |                         print("\n")
104 |                         print("eval: step: {}, f1: {}, em: {}".format(current_step, result["f1"], result["em"]))
105 |                         print("\n")
106 | 
107 |                         if self.config["ckpt_model_path"]:
108 |                             save_path = self.config["ckpt_model_path"]
109 |                             if not os.path.exists(save_path):
110 |                                 os.makedirs(save_path)
111 |                             model_save_path = os.path.join(save_path, self.config["model_name"])
112 |                             self.model.saver.save(sess, model_save_path, global_step=current_step)
113 | 
114 |             end = time.time()
115 |             print("total train time: ", end - start)
116 | 
117 | 
118 | if __name__ == "__main__":
119 |     # parse the command-line arguments
120 |     parser = argparse.ArgumentParser()
121 |     parser.add_argument("--config_path", help="config path of model")
122 |     args = parser.parse_args()
123 |     trainer = Trainer(args)
124 |     trainer.train()
125 | 
--------------------------------------------------------------------------------
/bert_task/classifier_task/trainer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 | import time
5 | import sys
6 | sys.path.append(os.path.dirname(os.getcwd()))
7 | import tensorflow as tf
8 | from bert import modeling
9 | from model import BertClassifier
10 | from data_helper import TrainData
11 | from metrics import mean, get_multi_metrics
12 | 
13 | 
14 | class Trainer(object):
15 |     def __init__(self, args):
16 |         self.args = args
17 |         with open(args.config_path, "r") as fr:
18 |             self.config = json.load(fr)
19 |         self.__bert_checkpoint_path = os.path.join(self.config["bert_model_path"], "bert_model.ckpt")
20 | 
21 |         # load the datasets
22 |         self.data_obj = self.load_data()
23 |         self.t_in_ids, self.t_in_masks, self.t_seg_ids, self.t_lab_ids, lab_to_idx = self.data_obj.gen_data(
24 |             self.config["train_data"])
25 | 
26 |         self.e_in_ids, self.e_in_masks, self.e_seg_ids, self.e_lab_ids, lab_to_idx = self.data_obj.gen_data(
27 |             self.config["eval_data"], is_training=False)
28 |         print("train data size: {}".format(len(self.t_lab_ids)))
29 |         print("eval data size: {}".format(len(self.e_lab_ids)))
30 |         self.label_list = [value for key, value in lab_to_idx.items()]
31 |         print("label numbers: ", len(self.label_list))
32 | 
33 |         num_train_steps = int(
34 |             len(self.t_lab_ids) / self.config["batch_size"] * self.config["epochs"])
35 |         num_warmup_steps = int(num_train_steps * self.config["warmup_rate"])
36 |         # initialize the model object
37 |         self.model = self.create_model(num_train_steps, num_warmup_steps)
38 | 
39 |     def load_data(self):
40 |         """
41 |         create the data object
42 |         :return:
43 |         """
44 |         # create the training-data object, which generates the training data
45 |         data_obj = TrainData(self.config)
46 |         return data_obj
47 | 
48 |     def create_model(self, num_train_step, num_warmup_step):
49 |         """
50 |         select the model according to the config file and initialize it
51 |         :return:
52 |         """
53 |         model = BertClassifier(config=self.config, num_train_step=num_train_step, num_warmup_step=num_warmup_step)
54 |         return model
55 | 
56 |     def train(self):
57 |         with tf.Session() as sess:
58 |             tvars = tf.trainable_variables()
59 |             (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
60 |                 tvars, self.__bert_checkpoint_path)
61 |             print("init bert model params")
62 |             tf.train.init_from_checkpoint(self.__bert_checkpoint_path, assignment_map)
63 |             print("init bert model params done")
64 |             sess.run(tf.variables_initializer(tf.global_variables()))
65 | 
66 |             current_step = 0
67 |             start = time.time()
68 |             for epoch in range(self.config["epochs"]):
69 |                 print("----- Epoch {}/{} -----".format(epoch + 1, self.config["epochs"]))
70 | 
71 |                 for batch in self.data_obj.next_batch(self.t_in_ids, self.t_in_masks, self.t_seg_ids, self.t_lab_ids):
72 |                     loss, predictions = self.model.train(sess, batch)
73 | 
74 |                     acc, recall, prec, f_beta = get_multi_metrics(pred_y=predictions, true_y=batch["label_ids"],
75 |                                                                   labels=self.label_list)
76 |                     print("train: step: {}, loss: {}, acc: {}, recall: {}, precision: {}, f_beta: {}".format(
77 |                         current_step, loss, acc, recall, prec, f_beta))
78 | 
79 |                     current_step += 1
80 |                     if self.data_obj and current_step % self.config["checkpoint_every"] == 0:
81 | 
82 |                         eval_losses = []
83 |                         eval_accs = []
84 |                         # (auc is not tracked here: get_multi_metrics does not return one)
85 |                         eval_recalls = []
86 |                         eval_precs = []
87 |                         eval_f_betas = []
88 |                         for eval_batch in self.data_obj.next_batch(self.e_in_ids, self.e_in_masks,
89 |                                                                    self.e_seg_ids, self.e_lab_ids):
90 |                             eval_loss, eval_predictions = self.model.eval(sess, eval_batch)
91 | 
92 |                             eval_losses.append(eval_loss)
93 | 
94 |                             acc, recall, prec, f_beta = get_multi_metrics(pred_y=eval_predictions,
95 |                                                                           true_y=eval_batch["label_ids"],
96 |                                                                           labels=self.label_list)
97 |                             eval_accs.append(acc)
98 |                             eval_recalls.append(recall)
99 |                             eval_precs.append(prec)
100 |                             eval_f_betas.append(f_beta)
101 |                         print("\n")
102 |                         print("eval: loss: {}, acc: {}, recall: {}, precision: {}, f_beta: {}".format(
103 |                             mean(eval_losses), mean(eval_accs), mean(eval_recalls),
104 |                             mean(eval_precs), mean(eval_f_betas)))
105 |                         print("\n")
106 | 
107 |                         if self.config["ckpt_model_path"]:
108 |                             save_path = self.config["ckpt_model_path"]
109 |                             if not os.path.exists(save_path):
110 |                                 os.makedirs(save_path)
111 |                             model_save_path = os.path.join(save_path, self.config["model_name"])
112 |                             self.model.saver.save(sess, model_save_path, global_step=current_step)
113 | 
114 |             end = time.time()
115 |             print("total train time: ", end - start)
116 | 
117 | 
118 | if __name__ == "__main__":
119 |     # parse the command-line arguments
120 |     parser = argparse.ArgumentParser()
121 |     parser.add_argument("--config_path", help="config path of model")
122 |     args = parser.parse_args()
123 |     trainer = Trainer(args)
124 |     trainer.train()
125 | 
126 | 
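The trainers in this repo all size the optimizer schedule identically: total steps derived from dataset size, batch size, and epochs, then warmup steps as a fixed fraction of that. A worked example with made-up numbers:

    num_examples = 10000  # e.g. len(t_lab_ids)
    batch_size = 32
    epochs = 5
    warmup_rate = 0.1

    num_train_steps = int(num_examples / batch_size * epochs)  # int(1562.5) -> 1562
    num_warmup_steps = int(num_train_steps * warmup_rate)      # int(156.2)  -> 156
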
--------------------------------------------------------------------------------
/albert_task/sentence_pair_task/trainer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 | import sys
5 | import time
6 | 
7 | sys.path.append(os.path.dirname(os.getcwd()))
8 | import tensorflow as tf
9 | from albert import modeling
10 | from model import AlbertSentencePair
11 | from data_helper import TrainData
12 | from metrics import mean, get_multi_metrics
13 | 
14 | 
15 | class Trainer(object):
16 |     def __init__(self, args):
17 |         self.args = args
18 |         with open(args.config_path, "r") as fr:
19 |             self.config = json.load(fr)
20 |         self.__bert_checkpoint_path = os.path.join(self.config["bert_model_path"], "albert_model.ckpt")
21 | 
22 |         # load the datasets
23 |         self.data_obj = self.load_data()
24 |         self.t_in_ids, self.t_in_masks, self.t_seg_ids, self.t_lab_ids, lab_to_idx = self.data_obj.gen_data(
25 |             self.config["train_data"])
26 | 
27 |         self.e_in_ids, self.e_in_masks, self.e_seg_ids, self.e_lab_ids, lab_to_idx = self.data_obj.gen_data(
28 |             self.config["eval_data"], is_training=False)
29 |         print("train data size: {}".format(len(self.t_lab_ids)))
30 |         print("eval data size: {}".format(len(self.e_lab_ids)))
31 |         self.label_list = [value for key, value in lab_to_idx.items()]
32 |         print("label numbers: ", len(self.label_list))
33 | 
34 |         num_train_steps = int(
35 |             len(self.t_lab_ids) / self.config["batch_size"] * self.config["epochs"])
36 |         num_warmup_steps = int(num_train_steps * self.config["warmup_rate"])
37 |         # initialize the model object
38 |         self.model = self.create_model(num_train_steps, num_warmup_steps)
39 | 
40 |     def load_data(self):
41 |         """
42 |         create the data object
43 |         :return:
44 |         """
45 |         # create the training-data object, which generates the training data
46 |         data_obj = TrainData(self.config)
47 |         return data_obj
48 | 
49 |     def create_model(self, num_train_step, num_warmup_step):
50 |         """
51 |         select the model according to the config file and initialize it
52 |         :return:
53 |         """
54 |         model = AlbertSentencePair(config=self.config, num_train_step=num_train_step, num_warmup_step=num_warmup_step)
55 |         return model
56 | 
57 |     def train(self):
58 |         with tf.Session() as sess:
59 |             tvars = tf.trainable_variables()
60 |             (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
61 |                 tvars, self.__bert_checkpoint_path)
62 |             print("init bert model params")
63 |             tf.train.init_from_checkpoint(self.__bert_checkpoint_path, assignment_map)
64 |             print("init bert model params done")
65 |             sess.run(tf.variables_initializer(tf.global_variables()))
66 | 
67 |             current_step = 0
68 |             start = time.time()
69 |             for epoch in range(self.config["epochs"]):
70 |                 print("----- Epoch {}/{} -----".format(epoch + 1, self.config["epochs"]))
71 | 
72 |                 for batch in self.data_obj.next_batch(self.t_in_ids, self.t_in_masks, self.t_seg_ids, self.t_lab_ids):
73 |                     loss, predictions = self.model.train(sess, batch)
74 | 
75 |                     acc, recall, prec, f_beta = get_multi_metrics(pred_y=predictions, true_y=batch["label_ids"],
76 |                                                                   labels=self.label_list)
77 |                     print("train: step: {}, loss: {}, acc: {}, recall: {}, precision: {}, f_beta: {}".format(
78 |                         current_step, loss, acc, recall, prec, f_beta))
79 | 
80 |                     current_step += 1
81 |                     if self.data_obj and current_step % self.config["checkpoint_every"] == 0:
82 | 
83 |                         eval_losses = []
84 |                         eval_accs = []
85 |                         # (auc is not tracked here: get_multi_metrics does not return one)
86 |                         eval_recalls = []
87 |                         eval_precs = []
88 |                         eval_f_betas = []
89 |                         for eval_batch in self.data_obj.next_batch(self.e_in_ids, self.e_in_masks,
90 |                                                                    self.e_seg_ids, self.e_lab_ids):
91 |                             eval_loss, eval_predictions = self.model.eval(sess, eval_batch)
92 | 
93 |                             eval_losses.append(eval_loss)
94 | 
95 |                             acc, recall, prec, f_beta = get_multi_metrics(pred_y=eval_predictions,
96 |                                                                           true_y=eval_batch["label_ids"],
97 |                                                                           labels=self.label_list)
98 |                             eval_accs.append(acc)
99 |                             eval_recalls.append(recall)
100 |                             eval_precs.append(prec)
101 |                             eval_f_betas.append(f_beta)
102 |                         print("\n")
103 |                         print("eval: loss: {}, acc: {}, recall: {}, precision: {}, f_beta: {}".format(
104 |                             mean(eval_losses), mean(eval_accs), mean(eval_recalls),
105 |                             mean(eval_precs), mean(eval_f_betas)))
106 |                         print("\n")
107 | 
108 |                         if self.config["ckpt_model_path"]:
109 |                             save_path = self.config["ckpt_model_path"]
110 |                             if not os.path.exists(save_path):
111 |                                 os.makedirs(save_path)
112 |                             model_save_path = os.path.join(save_path, self.config["model_name"])
113 |                             self.model.saver.save(sess, model_save_path, global_step=current_step)
114 |             end = time.time()
115 |             print("total train time: ", end - start)
116 | 
117 | 
118 | if __name__ == "__main__":
119 |     # parse the command-line arguments
120 |     parser = argparse.ArgumentParser()
121 |     parser.add_argument("--config_path", help="config path of model")
122 |     args = parser.parse_args()
123 |     trainer = Trainer(args)
124 |     trainer.train()
125 | 
--------------------------------------------------------------------------------
/bert_task/sentence_pair_task/trainer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 | import time
5 | import sys
6 | 
7 | sys.path.append(os.path.dirname(os.getcwd()))
8 | import tensorflow as tf
9 | from bert import modeling
10 | from model import BertSentencePair
11 | from data_helper import TrainData
12 | from metrics import mean, get_multi_metrics
13 | 
14 | 
15 | class Trainer(object):
16 |     def __init__(self, args):
17 |         self.args = args
18 |         with open(args.config_path, "r") as fr:
19 |             self.config = json.load(fr)
20 |         self.__bert_checkpoint_path = os.path.join(self.config["bert_model_path"], "bert_model.ckpt")
21 | 
22 |         # load the datasets
23 |         self.data_obj = self.load_data()
24 |         self.t_in_ids, self.t_in_masks, self.t_seg_ids, self.t_lab_ids, lab_to_idx = self.data_obj.gen_data(
25 |             self.config["train_data"])
26 | 
27 |         self.e_in_ids, self.e_in_masks, self.e_seg_ids, self.e_lab_ids, lab_to_idx = self.data_obj.gen_data(
28 |             self.config["eval_data"], is_training=False)
29 |         print("train data size: {}".format(len(self.t_lab_ids)))
30 |         print("eval data size: {}".format(len(self.e_lab_ids)))
31 |         self.label_list = [value for key, value in lab_to_idx.items()]
32 |         print("label numbers: ", len(self.label_list))
33 | 
34 |         num_train_steps = int(
35 |             len(self.t_lab_ids) / self.config["batch_size"] * self.config["epochs"])
36 |         num_warmup_steps = int(num_train_steps * self.config["warmup_rate"])
37 |         # initialize the model object
38 |         self.model = self.create_model(num_train_steps, num_warmup_steps)
39 | 
40 |     def load_data(self):
41 |         """
42 |         create the data object
43 |         :return:
44 |         """
45 |         # create the training-data object, which generates the training data
46 |         data_obj = TrainData(self.config)
47 |         return data_obj
48 | 
49 |     def create_model(self, num_train_step, num_warmup_step):
50 |         """
51 |         select the model according to the config file and initialize it
52 |         :return:
53 |         """
54 |         model = BertSentencePair(config=self.config, num_train_step=num_train_step, num_warmup_step=num_warmup_step)
55 |         return model
56 | 
57 |     def train(self):
58 |         with tf.Session() as sess:
59 |             tvars = tf.trainable_variables()
60 |             (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
61 |                 tvars, self.__bert_checkpoint_path)
62 |             print("init bert model params")
63 |             tf.train.init_from_checkpoint(self.__bert_checkpoint_path, assignment_map)
64 |             print("init bert model params done")
65 |             sess.run(tf.variables_initializer(tf.global_variables()))
66 | 
67 |             current_step = 0
68 |             start = time.time()
69 |             for epoch in range(self.config["epochs"]):
70 |                 print("----- Epoch {}/{} -----".format(epoch + 1, self.config["epochs"]))
71 | 
72 |                 for batch in self.data_obj.next_batch(self.t_in_ids, self.t_in_masks, self.t_seg_ids, self.t_lab_ids):
73 |                     loss, predictions = self.model.train(sess, batch)
74 | 
75 |                     acc, recall, prec, f_beta = get_multi_metrics(pred_y=predictions, true_y=batch["label_ids"],
76 |                                                                   labels=self.label_list)
77 |                     print("train: step: {}, loss: {}, acc: {}, recall: {}, precision: {}, f_beta: {}".format(
78 |                         current_step, loss, acc, recall, prec, f_beta))
79 | 
80 |                     current_step += 1
81 |                     if self.data_obj and current_step % self.config["checkpoint_every"] == 0:
82 | 
83 |                         eval_losses = []
84 |                         eval_accs = []
85 |                         # (auc is not tracked here: get_multi_metrics does not return one)
86 |                         eval_recalls = []
87 |                         eval_precs = []
88 |                         eval_f_betas = []
89 |                         for eval_batch in self.data_obj.next_batch(self.e_in_ids, self.e_in_masks,
90 |                                                                    self.e_seg_ids, self.e_lab_ids):
91 |                             eval_loss, eval_predictions = self.model.eval(sess, eval_batch)
92 | 
93 |                             eval_losses.append(eval_loss)
94 | 
95 |                             acc, recall, prec, f_beta = get_multi_metrics(pred_y=eval_predictions,
96 |                                                                           true_y=eval_batch["label_ids"],
97 |                                                                           labels=self.label_list)
98 |                             eval_accs.append(acc)
99 |                             eval_recalls.append(recall)
100 |                             eval_precs.append(prec)
101 |                             eval_f_betas.append(f_beta)
102 |                         print("\n")
103 |                         print("eval: loss: {}, acc: {}, recall: {}, precision: {}, f_beta: {}".format(
104 |                             mean(eval_losses), mean(eval_accs), mean(eval_recalls),
105 |                             mean(eval_precs), mean(eval_f_betas)))
106 |                         print("\n")
107 | 
108 |                         if self.config["ckpt_model_path"]:
109 |                             save_path = self.config["ckpt_model_path"]
110 |                             if not os.path.exists(save_path):
111 |                                 os.makedirs(save_path)
112 |                             model_save_path = os.path.join(save_path, self.config["model_name"])
113 |                             self.model.saver.save(sess, model_save_path, global_step=current_step)
114 | 
115 |             end = time.time()
116 |             print("total train time: ", end - start)
117 | 
118 | 
119 | if __name__ == "__main__":
120 |     # parse the command-line arguments
121 |     parser = argparse.ArgumentParser()
122 |     parser.add_argument("--config_path", help="config path of model")
123 |     args = parser.parse_args()
124 |     trainer = Trainer(args)
125 |     trainer.train()
126 | 
--------------------------------------------------------------------------------
/albert_task/classifier_task/trainer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 | import time
5 | import sys
6 | 
7 | sys.path.append(os.path.dirname(os.getcwd()))
8 | 
9 | import tensorflow as tf
10 | from albert import modeling
11 | from model import AlbertClassifier
12 | from data_helper import TrainData
13 | from metrics import mean, get_multi_metrics
14 | 
15 | 
16 | class Trainer(object):
17 |     def __init__(self, args):
18 |         self.args = args
19 |         with open(args.config_path, "r") as fr:
20 |             self.config = json.load(fr)
21 |         self.__bert_checkpoint_path = os.path.join(self.config["bert_model_path"], "albert_model.ckpt")
22 | 
23 |         # load the datasets
24 |         self.data_obj = self.load_data()
25 |         self.t_in_ids, self.t_in_masks, self.t_seg_ids, self.t_lab_ids, lab_to_idx = self.data_obj.gen_data(
26 |             self.config["train_data"])
27 | 
28 |         self.e_in_ids, self.e_in_masks, self.e_seg_ids, self.e_lab_ids, lab_to_idx = self.data_obj.gen_data(
29 |             self.config["eval_data"], is_training=False)
30 |         print("train data size: {}".format(len(self.t_lab_ids)))
31 |         print("eval data size: {}".format(len(self.e_lab_ids)))
32 |         self.label_list = [value for key, value in lab_to_idx.items()]
33 |         print("label numbers: ", len(self.label_list))
34 | 
35 |         num_train_steps = int(
36 |             len(self.t_lab_ids) / self.config["batch_size"] * self.config["epochs"])
37 |         num_warmup_steps = int(num_train_steps * self.config["warmup_rate"])
38 |         # initialize the model object
39 |         self.model = self.create_model(num_train_steps, num_warmup_steps)
40 | 
41 |     def load_data(self):
42 |         """
43 |         create the data object
44 |         :return:
45 |         """
46 |         # create the training-data object, which generates the training data
47 |         data_obj = TrainData(self.config)
48 |         return data_obj
49 | 
50 |     def create_model(self, num_train_step, num_warmup_step):
51 |         """
52 |         select the model according to the config file and initialize it
53 |         :return:
54 |         """
55 |         model = AlbertClassifier(config=self.config, num_train_step=num_train_step, num_warmup_step=num_warmup_step)
56 |         return model
57 | 
58 |     def train(self):
59 |         with tf.Session() as sess:
60 |             tvars = tf.trainable_variables()
61 |             (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
62 |                 tvars, self.__bert_checkpoint_path)
63 |             print("init bert model params")
64 |             tf.train.init_from_checkpoint(self.__bert_checkpoint_path, assignment_map)
65 |             print("init bert model params done")
66 |             sess.run(tf.variables_initializer(tf.global_variables()))
67 | 
68 |             current_step = 0
69 |             start = time.time()
70 |             for epoch in range(self.config["epochs"]):
71 |                 print("----- Epoch {}/{} -----".format(epoch + 1, self.config["epochs"]))
72 | 
73 |                 for batch in self.data_obj.next_batch(self.t_in_ids, self.t_in_masks, self.t_seg_ids, self.t_lab_ids):
74 |                     loss, predictions = self.model.train(sess, batch)
75 | 
76 |                     acc, recall, prec, f_beta = get_multi_metrics(pred_y=predictions, true_y=batch["label_ids"],
77 |                                                                   labels=self.label_list)
78 |                     print("train: step: {}, loss: {}, acc: {}, recall: {}, precision: {}, f_beta: {}".format(
79 |                         current_step, loss, acc, recall, prec, f_beta))
80 | 
81 |                     current_step += 1
82 |                     if self.data_obj and current_step % self.config["checkpoint_every"] == 0:
83 | 
84 |                         eval_losses = []
85 |                         eval_accs = []
86 |                         # (auc is not tracked here: get_multi_metrics does not return one)
87 |                         eval_recalls = []
88 |                         eval_precs = []
89 |                         eval_f_betas = []
90 |                         for eval_batch in self.data_obj.next_batch(self.e_in_ids, self.e_in_masks,
91 |                                                                    self.e_seg_ids, self.e_lab_ids):
92 |                             eval_loss, eval_predictions = self.model.eval(sess, eval_batch)
93 | 
94 |                             eval_losses.append(eval_loss)
95 | 
96 |                             acc, recall, prec, f_beta = get_multi_metrics(pred_y=eval_predictions,
97 |                                                                           true_y=eval_batch["label_ids"],
98 |                                                                           labels=self.label_list)
99 |                             eval_accs.append(acc)
100 |                             eval_recalls.append(recall)
101 |                             eval_precs.append(prec)
102 |                             eval_f_betas.append(f_beta)
103 |                         print("\n")
104 |                         print("eval: loss: {}, acc: {}, recall: {}, precision: {}, f_beta: {}".format(
105 |                             mean(eval_losses), mean(eval_accs), mean(eval_recalls),
106 |                             mean(eval_precs), mean(eval_f_betas)))
107 |                         print("\n")
108 | 
109 |                         if self.config["ckpt_model_path"]:
110 |                             save_path = self.config["ckpt_model_path"]
111 |                             if not os.path.exists(save_path):
112 |                                 os.makedirs(save_path)
113 |                             model_save_path = os.path.join(save_path, self.config["model_name"])
114 |                             self.model.saver.save(sess, model_save_path, global_step=current_step)
115 | 
116 |             end = time.time()
117 |             print("total train time: ", end - start)
118 | 
119 | 
120 | if __name__ == "__main__":
121 |     # parse the command-line arguments
122 |     parser = argparse.ArgumentParser()
123 |     parser.add_argument("--config_path", help="config path of model")
124 |     args = parser.parse_args()
125 |     trainer = Trainer(args)
126 |     trainer.train()
127 | 
128 | 
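data_helper.py is omitted from this dump, so the exact batch layout is not visible; from the batch["label_ids"] lookups in the trainers above and the feed dicts in the model file below, next_batch presumably yields dicts like the hypothetical single-example batch sketched here:

    # hypothetical batch for the classifier/sentence-pair trainers; all values illustrative
    batch = {
        "input_ids":   [[101, 2769, 3221, 102, 0, 0]],  # WordPiece ids, padded to max_length
        "input_masks": [[1, 1, 1, 1, 0, 0]],            # 1 = real token, 0 = padding
        "segment_ids": [[0, 0, 0, 0, 0, 0]],            # 0 = sentence A, 1 = sentence B
        "label_ids":   [2],                             # one label index per example
    }
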
--------------------------------------------------------------------------------
/bert_task/machine_reading_task/model.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | 
4 | sys.path.append(os.path.dirname(os.getcwd()))
5 | import tensorflow as tf
6 | 
7 | from bert import modeling
8 | from bert import optimization
9 | 
10 | 
11 | class BertMachineReading(object):
12 |     def __init__(self, config, is_training=True, num_train_step=None, num_warmup_step=None):
13 |         self.__bert_config_path = os.path.join(config["bert_model_path"], "bert_config.json")
14 | 
15 |         self.__is_training = is_training
16 |         self.__num_train_step = num_train_step
17 |         self.__num_warmup_step = num_warmup_step
18 | 
19 |         self.__max_length = config["max_length"]
20 |         self.__learning_rate = config["learning_rate"]
21 | 
22 |         self.input_ids = tf.placeholder(dtype=tf.int32, shape=[None, self.__max_length], name='input_ids')
23 |         self.input_masks = tf.placeholder(dtype=tf.int32, shape=[None, self.__max_length], name='input_mask')
24 |         self.segment_ids = tf.placeholder(dtype=tf.int32, shape=[None, self.__max_length], name='segment_ids')
25 |         self.start_position = tf.placeholder(dtype=tf.int32, shape=[None], name="start_position")
26 |         self.end_position = tf.placeholder(dtype=tf.int32, shape=[None], name="end_position")
27 | 
28 |         self.built_model()
29 |         self.init_saver()
30 | 
31 |     def built_model(self):
32 |         bert_config = modeling.BertConfig.from_json_file(self.__bert_config_path)
33 | 
34 |         model = modeling.BertModel(config=bert_config,
35 |                                    is_training=self.__is_training,
36 |                                    input_ids=self.input_ids,
37 |                                    input_mask=self.input_masks,
38 |                                    token_type_ids=self.segment_ids,
39 |                                    use_one_hot_embeddings=False)
40 | 
41 |         final_hidden = model.get_sequence_output()
42 | 
43 |         final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3)
44 |         seq_length = final_hidden_shape[1]
45 |         hidden_size = final_hidden_shape[2]
46 | 
47 |         with tf.name_scope("output"):
48 |             output_weights = tf.get_variable(
49 |                 "output_weights", [2, hidden_size],
50 |                 initializer=tf.truncated_normal_initializer(stddev=0.02))
51 | 
52 |             output_bias = tf.get_variable(
53 |                 "output_bias", [2], initializer=tf.zeros_initializer())
54 | 
55 |             final_hidden_matrix = tf.reshape(final_hidden,
56 |                                              [-1, hidden_size])
57 |             logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True)
58 |             logits = tf.nn.bias_add(logits, output_bias)
59 | 
60 |             logits = tf.reshape(logits, [-1, seq_length, 2])
61 |             logits = tf.transpose(logits, [2, 0, 1])
62 | 
63 |             unstacked_logits = tf.unstack(logits, axis=0)
64 | 
65 |             # [batch_size, seq_length]
66 |             start_logits, end_logits = (unstacked_logits[0], unstacked_logits[1])
67 | 
68 |         self.start_logits = start_logits
69 |         self.end_logits = end_logits
70 | 
71 |         if self.__is_training:
72 |             with tf.name_scope("loss"):
73 |                 start_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=start_logits,
74 |                                                                               labels=self.start_position)
75 |                 end_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=end_logits,
76 |                                                                             labels=self.end_position)
77 | 
78 |                 losses = tf.concat([start_losses, end_losses], axis=0)
79 |                 self.loss = tf.reduce_mean(losses, name="loss")
80 | 
81 |             with tf.name_scope('train_op'):
82 |                 self.train_op = optimization.create_optimizer(
83 |                     self.loss, self.__learning_rate, self.__num_train_step, self.__num_warmup_step, use_tpu=False)
84 | 
85 |     def init_saver(self):
86 |         self.saver = tf.train.Saver(tf.global_variables())
87 | 
88 |     def train(self, sess, batch):
89 |         """
90 |         train the model
91 |         :param sess: tf session object
92 |         :param batch: a batch of data
93 |         :return: loss and the start/end logits
94 |         """
95 | 
96 |         feed_dict = {self.input_ids: batch["input_ids"],
97 |                      self.input_masks: batch["input_masks"],
98 |                      self.segment_ids: batch["segment_ids"],
99 |                      self.start_position: batch["start_position"],
100 |                      self.end_position: batch["end_position"]}
101 | 
102 |         # run one training step
103 |         _, loss, start_logits, end_logits = sess.run([self.train_op, self.loss, self.start_logits, self.end_logits],
104 |                                                      feed_dict=feed_dict)
105 |         return loss, start_logits, end_logits
106 | 
107 |     def eval(self, sess, batch):
108 |         """
109 |         evaluate the model
110 |         :param sess: tf session object
111 |         :param batch: a batch of data
112 |         :return: the start/end logits
113 |         """
114 |         feed_dict = {self.input_ids: batch["input_ids"],
115 |                      self.input_masks: batch["input_masks"],
116 |                      self.segment_ids: batch["segment_ids"],
117 |                      self.start_position: batch["start_position"],
118 |                      self.end_position: batch["end_position"]}
119 | 
120 |         start_logits, end_logits = sess.run([self.start_logits, self.end_logits], feed_dict=feed_dict)
121 |         return start_logits, end_logits
122 | 
123 |     def infer(self, sess, batch):
124 |         """
125 |         predict on new data
126 |         :param sess: tf session object
127 |         :param batch: a batch of data
128 |         :return: the start/end logits
129 |         """
130 |         feed_dict = {self.input_ids: batch["input_ids"],
131 |                      self.input_masks: batch["input_masks"],
132 |                      self.segment_ids: batch["segment_ids"]}
133 | 
134 |         start_logits, end_logits = sess.run([self.start_logits, self.end_logits], feed_dict=feed_dict)
135 | 
136 |         return start_logits, end_logits
137 | 
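The saver/infer pair above suggests a straightforward restore-and-predict flow. A minimal sketch (the checkpoint directory is hypothetical, config is the same dict the trainer loads, and the batch is assumed to come from data_helper):

    import tensorflow as tf

    # build the graph without the loss/train_op branch
    model = BertMachineReading(config=config, is_training=False)

    with tf.Session() as sess:
        # restore the newest checkpoint written by Trainer.train
        ckpt = tf.train.latest_checkpoint("ckpt_model/cmrc2018")  # hypothetical path
        model.saver.restore(sess, ckpt)

        start_logits, end_logits = model.infer(sess, batch)  # batch supplied by data_helper
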
"output_bias", [2], initializer=tf.zeros_initializer()) 54 | 55 | final_hidden_matrix = tf.reshape(final_hidden, 56 | [-1, hidden_size]) 57 | logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) 58 | logits = tf.nn.bias_add(logits, output_bias) 59 | 60 | logits = tf.reshape(logits, [-1, seq_length, 2]) 61 | logits = tf.transpose(logits, [2, 0, 1]) 62 | 63 | unstacked_logits = tf.unstack(logits, axis=0) 64 | 65 | # [batch_size, seq_length] 66 | start_logits, end_logits = (unstacked_logits[0], unstacked_logits[1]) 67 | 68 | self.start_logits = start_logits 69 | self.end_logits = end_logits 70 | 71 | if self.__is_training: 72 | with tf.name_scope("loss"): 73 | start_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=start_logits, 74 | labels=self.start_position) 75 | end_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=end_logits, 76 | labels=self.end_position) 77 | 78 | losses = tf.concat([start_losses, end_losses], axis=0) 79 | self.loss = tf.reduce_mean(losses, name="loss") 80 | 81 | with tf.name_scope('train_op'): 82 | self.train_op = optimization.create_optimizer( 83 | self.loss, self.__learning_rate, self.__num_train_step, self.__num_warmup_step, use_tpu=False) 84 | 85 | def init_saver(self): 86 | self.saver = tf.train.Saver(tf.global_variables()) 87 | 88 | def train(self, sess, batch): 89 | """ 90 | 训练模型 91 | :param sess: tf的会话对象 92 | :param batch: batch数据 93 | :return: 损失和预测结果 94 | """ 95 | 96 | feed_dict = {self.input_ids: batch["input_ids"], 97 | self.input_masks: batch["input_masks"], 98 | self.segment_ids: batch["segment_ids"], 99 | self.start_position: batch["start_position"], 100 | self.end_position: batch["end_position"]} 101 | 102 | # 训练模型 103 | _, loss, start_logits, end_logits = sess.run([self.train_op, self.loss, self.start_logits, self.end_logits], 104 | feed_dict=feed_dict) 105 | return loss, start_logits, end_logits 106 | 107 | def eval(self, sess, batch): 108 | """ 109 | 验证模型 110 | :param sess: tf中的会话对象 111 | :param batch: batch数据 112 | :return: 损失和预测结果 113 | """ 114 | feed_dict = {self.input_ids: batch["input_ids"], 115 | self.input_masks: batch["input_masks"], 116 | self.segment_ids: batch["segment_ids"], 117 | self.start_position: batch["start_position"], 118 | self.end_position: batch["end_position"]} 119 | 120 | start_logits, end_logits = sess.run([self.start_logits, self.end_logits], feed_dict=feed_dict) 121 | return start_logits, end_logits 122 | 123 | def infer(self, sess, batch): 124 | """ 125 | 预测新数据 126 | :param sess: tf中的会话对象 127 | :param batch: batch数据 128 | :return: 预测结果 129 | """ 130 | feed_dict = {self.input_ids: batch["input_ids"], 131 | self.input_masks: batch["input_masks"], 132 | self.segment_ids: batch["segment_ids"]} 133 | 134 | start_logits, end_logits = sess.run([self.start_logits, self.end_logits], feed_dict=feed_dict) 135 | 136 | return start_logits, end_logits 137 | -------------------------------------------------------------------------------- /bert_task/ner_task/trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import time 5 | import sys 6 | 7 | sys.path.append(os.path.dirname(os.getcwd())) 8 | import tensorflow as tf 9 | from bert import modeling 10 | from model import BertNer 11 | from data_helper import TrainData 12 | from metrics import mean, gen_metrics 13 | 14 | 15 | class Trainer(object): 16 | def __init__(self, args): 17 | self.args = args 18 | with open(args.config_path, "r") as fr: 19 | 
self.config = json.load(fr) 20 | self.__bert_checkpoint_path = os.path.join(self.config["bert_model_path"], "bert_model.ckpt") 21 | 22 | # 加载数据集 23 | self.data_obj = self.load_data() 24 | self.t_in_ids, self.t_in_masks, self.t_seg_ids, self.t_lab_ids, self.t_seq_len, self.lab_to_idx = \ 25 | self.data_obj.gen_data(self.config["train_data"]) 26 | 27 | self.e_in_ids, self.e_in_masks, self.e_seg_ids, self.e_lab_ids, self.e_seq_len, self.lab_to_idx = \ 28 | self.data_obj.gen_data(self.config["eval_data"], is_training=False) 29 | 30 | print("train data size: {}".format(len(self.t_lab_ids))) 31 | print("eval data size: {}".format(len(self.e_lab_ids))) 32 | 33 | num_train_steps = int( 34 | len(self.t_lab_ids) / self.config["batch_size"] * self.config["epochs"]) 35 | num_warmup_steps = int(num_train_steps * self.config["warmup_rate"]) 36 | # 初始化模型对象 37 | self.model = self.create_model(num_train_steps, num_warmup_steps) 38 | 39 | def load_data(self): 40 | """ 41 | 创建数据对象 42 | :return: 43 | """ 44 | # 生成训练集对象并生成训练数据 45 | data_obj = TrainData(self.config) 46 | return data_obj 47 | 48 | def create_model(self, num_train_step, num_warmup_step): 49 | """ 50 | 根据config文件选择对应的模型,并初始化 51 | :return: 52 | """ 53 | model = BertNer(config=self.config, num_train_step=num_train_step, num_warmup_step=num_warmup_step) 54 | return model 55 | 56 | def train(self): 57 | with tf.Session() as sess: 58 | tvars = tf.trainable_variables() 59 | (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint( 60 | tvars, self.__bert_checkpoint_path) 61 | print("init bert model params") 62 | tf.train.init_from_checkpoint(self.__bert_checkpoint_path, assignment_map) 63 | print("init bert model params done") 64 | sess.run(tf.variables_initializer(tf.global_variables())) 65 | 66 | current_step = 0 67 | start = time.time() 68 | for epoch in range(self.config["epochs"]): 69 | print("----- Epoch {}/{} -----".format(epoch + 1, self.config["epochs"])) 70 | 71 | for batch in self.data_obj.next_batch(self.t_in_ids, 72 | self.t_in_masks, 73 | self.t_seg_ids, 74 | self.t_lab_ids, 75 | self.t_seq_len): 76 | 77 | loss, true_y, predictions = self.model.train(sess, batch, self.config["keep_prob"]) 78 | 79 | f1, precision, recall = gen_metrics(pred_y=predictions, true_y=true_y, 80 | label_to_index=self.lab_to_idx) 81 | print("train: step: {}, loss: {}, recall: {}, precision: {}, f1: {}".format( 82 | current_step, loss, recall, precision, f1)) 83 | 84 | current_step += 1 85 | if self.data_obj and current_step % self.config["checkpoint_every"] == 0: 86 | 87 | eval_losses = [] 88 | eval_recalls = [] 89 | eval_precisions = [] 90 | eval_f1s = [] 91 | for eval_batch in self.data_obj.next_batch(self.e_in_ids, 92 | self.e_in_masks, 93 | self.e_seg_ids, 94 | self.e_lab_ids, 95 | self.e_seq_len): 96 | eval_loss, eval_true_y, eval_predictions = self.model.eval(sess, eval_batch) 97 | 98 | eval_losses.append(eval_loss) 99 | 100 | f1, precision, recall = gen_metrics(pred_y=eval_predictions, 101 | true_y=eval_true_y, 102 | label_to_index=self.lab_to_idx) 103 | eval_recalls.append(recall) 104 | eval_precisions.append(precision) 105 | eval_f1s.append(f1) 106 | print("\n") 107 | print("eval: loss: {}, recall: {}, precision: {}, f1: {}".format( 108 | mean(eval_losses), mean(eval_recalls), 109 | mean(eval_precisions), mean(eval_f1s))) 110 | print("\n") 111 | 112 | if self.config["ckpt_model_path"]: 113 | save_path = self.config["ckpt_model_path"] 114 | if not os.path.exists(save_path): 115 | os.makedirs(save_path) 116 | model_save_path 
= os.path.join(save_path, self.config["model_name"]) 117 | self.model.saver.save(sess, model_save_path, global_step=current_step) 118 | 119 | end = time.time() 120 | print("total train time: ", end - start) 121 | 122 | 123 | if __name__ == "__main__": 124 | # 读取用户在命令行输入的信息 125 | parser = argparse.ArgumentParser() 126 | parser.add_argument("--config_path", help="config path of model") 127 | args = parser.parse_args() 128 | trainer = Trainer(args) 129 | trainer.train() 130 | --------------------------------------------------------------------------------