├── __init__.py
├── albert_task
│   ├── __init__.py
│   ├── albert
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   ├── modeling.cpython-36.pyc
│   │   │   ├── bert_utils.cpython-36.pyc
│   │   │   ├── optimization.cpython-36.pyc
│   │   │   ├── tokenization.cpython-36.pyc
│   │   │   └── optimization_finetuning.cpython-36.pyc
│   │   ├── create_pretrain_data.sh
│   │   ├── run.sh
│   │   ├── args.py
│   │   ├── test_changes.py
│   │   └── bert_utils.py
│   ├── ner_task
│   │   ├── __init__.py
│   │   ├── run.sh
│   │   ├── __pycache__
│   │   │   ├── model.cpython-36.pyc
│   │   │   ├── metrics.cpython-36.pyc
│   │   │   ├── bilstm_crf.cpython-36.pyc
│   │   │   └── data_helper.cpython-36.pyc
│   │   ├── test.py
│   │   ├── config
│   │   │   └── msraner_config.json
│   │   ├── README.md
│   │   ├── metrics.py
│   │   ├── predict.py
│   │   └── model.py
│   ├── ltr_pair_task
│   │   ├── __init__.py
│   │   ├── run.sh
│   │   ├── __pycache__
│   │   │   ├── model.cpython-36.pyc
│   │   │   ├── metrics.cpython-36.pyc
│   │   │   └── data_helper.cpython-36.pyc
│   │   ├── config.json
│   │   ├── metrics.py
│   │   ├── README.md
│   │   └── trainer.py
│   ├── classifier_task
│   │   ├── __init__.py
│   │   ├── run.sh
│   │   ├── __pycache__
│   │   │   ├── model.cpython-36.pyc
│   │   │   ├── metrics.cpython-36.pyc
│   │   │   └── data_helper.cpython-36.pyc
│   │   ├── test.py
│   │   ├── config
│   │   │   ├── inews_config.json
│   │   │   ├── tnews_config.json
│   │   │   └── thucnews_config.json
│   │   ├── README.md
│   │   ├── predict.py
│   │   ├── model.py
│   │   ├── metrics.py
│   │   └── trainer.py
│   ├── sentence_pair_task
│   │   ├── __init__.py
│   │   ├── run.sh
│   │   ├── __pycache__
│   │   │   ├── model.cpython-36.pyc
│   │   │   ├── metrics.cpython-36.pyc
│   │   │   └── data_helper.cpython-36.pyc
│   │   ├── test.py
│   │   ├── config
│   │   │   ├── bq_config.json
│   │   │   ├── xnli_config.json
│   │   │   └── lcqmc_config.json
│   │   ├── README.md
│   │   ├── metrics.py
│   │   ├── predict.py
│   │   ├── model.py
│   │   └── trainer.py
│   ├── machine_reading_task
│   │   ├── __init__.py
│   │   ├── config
│   │   │   └── cmrc_config.json
│   │   ├── test.py
│   │   └── trainer.py
│   └── ltr_point_task
│       ├── run.sh
│       ├── config.json
│       ├── README.md
│       └── metrics.py
├── bert_task
│   ├── __init__.py
│   ├── ner_task
│   │   ├── __init__.py
│   │   ├── run.sh
│   │   ├── __pycache__
│   │   │   ├── model.cpython-36.pyc
│   │   │   ├── metrics.cpython-36.pyc
│   │   │   ├── bilstm_crf.cpython-36.pyc
│   │   │   └── data_helper.cpython-36.pyc
│   │   ├── config
│   │   │   └── msraner_config.json
│   │   ├── README.md
│   │   ├── metrics.py
│   │   ├── predict.py
│   │   ├── model.py
│   │   └── trainer.py
│   ├── classifier_task
│   │   ├── __init__.py
│   │   ├── run.sh
│   │   ├── __pycache__
│   │   │   ├── model.cpython-36.pyc
│   │   │   ├── metrics.cpython-36.pyc
│   │   │   └── data_helper.cpython-36.pyc
│   │   ├── config
│   │   │   ├── inews_config.json
│   │   │   ├── tnews_config.json
│   │   │   └── thucnews_config.json
│   │   ├── README.md
│   │   ├── predict.py
│   │   ├── model.py
│   │   ├── metrics.py
│   │   └── trainer.py
│   ├── ltr_pair_task
│   │   ├── __init__.py
│   │   ├── run.sh
│   │   ├── __pycache__
│   │   │   ├── model.cpython-36.pyc
│   │   │   ├── metrics.cpython-36.pyc
│   │   │   └── data_helper.cpython-36.pyc
│   │   ├── config.json
│   │   ├── metrics.py
│   │   ├── README.md
│   │   └── trainer.py
│   ├── ltr_point_task
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── metrics.cpython-36.pyc
│   │   │   ├── model.cpython-36.pyc
│   │   │   └── data_helper.cpython-36.pyc
│   │   ├── config.json
│   │   ├── README.md
│   │   └── metrics.py
│   ├── machine_reading_task
│   │   ├── __init__.py
│   │   ├── run.sh
│   │   ├── __pycache__
│   │   │   ├── model.cpython-36.pyc
│   │   │   ├── metrics.cpython-36.pyc
│   │   │   └── data_helper.cpython-36.pyc
│   │   ├── config
│   │   │   └── cmrc_config.json
│   │   ├── README.md
│   │   ├── trainer.py
│   │   └── model.py
│   ├── sentence_pair_task
│   │   ├── __init__.py
│   │   ├── run.sh
│   │   ├── __pycache__
│   │   │   ├── model.cpython-36.pyc
│   │   │   ├── metrics.cpython-36.pyc
│   │   │   └── data_helper.cpython-36.pyc
│   │   ├── config
│   │   │   ├── bq_config.json
│   │   │   ├── lcqmc_config.json
│   │   │   └── xnli_config.json
│   │   ├── README.md
│   │   ├── metrics.py
│   │   ├── predict.py
│   │   ├── model.py
│   │   └── trainer.py
│   └── bert
│       ├── requirements.txt
│       ├── __init__.py
│       ├── CONTRIBUTING.md
│       ├── optimization_test.py
│       ├── .gitignore
│       ├── sample_text.txt
│       └── tokenization_test.py
└── README.md
/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/albert_task/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/bert_task/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/albert_task/albert/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/albert_task/ner_task/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/bert_task/ner_task/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/albert_task/ltr_pair_task/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/bert_task/classifier_task/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/bert_task/ltr_pair_task/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/bert_task/ltr_point_task/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/albert_task/classifier_task/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/albert_task/sentence_pair_task/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/bert_task/machine_reading_task/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/bert_task/sentence_pair_task/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/albert_task/machine_reading_task/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/albert_task/ltr_point_task/run.sh:
--------------------------------------------------------------------------------
1 | python trainer.py --config_path=config.json
--------------------------------------------------------------------------------
/bert_task/ltr_pair_task/run.sh:
--------------------------------------------------------------------------------
1 | python trainer.py --config_path=config.json
2 |
--------------------------------------------------------------------------------
/albert_task/ltr_pair_task/run.sh:
--------------------------------------------------------------------------------
1 | python trainer.py --config_path=config.json
2 |
--------------------------------------------------------------------------------
/bert_task/ner_task/run.sh:
--------------------------------------------------------------------------------
1 | python trainer.py --config_path=config/msraner_config.json
--------------------------------------------------------------------------------
/albert_task/ner_task/run.sh:
--------------------------------------------------------------------------------
1 | python trainer.py --config_path=config/msraner_config.json
--------------------------------------------------------------------------------
/bert_task/classifier_task/run.sh:
--------------------------------------------------------------------------------
1 | python trainer.py --config_path=config/tnews_config.json
--------------------------------------------------------------------------------
/albert_task/classifier_task/run.sh:
--------------------------------------------------------------------------------
1 | python trainer.py --config_path=config/tnews_config.json
--------------------------------------------------------------------------------
/bert_task/machine_reading_task/run.sh:
--------------------------------------------------------------------------------
1 | python trainer.py --config_path=config/cmrc_config.json
--------------------------------------------------------------------------------
/bert_task/sentence_pair_task/run.sh:
--------------------------------------------------------------------------------
1 | python trainer.py --config_path=config/bq_config.json
2 |
--------------------------------------------------------------------------------
/albert_task/sentence_pair_task/run.sh:
--------------------------------------------------------------------------------
1 | python trainer.py --config_path=config/bq_config.json
2 |
--------------------------------------------------------------------------------
/bert_task/bert/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow >= 1.11.0 # CPU Version of TensorFlow.
2 | # tensorflow-gpu >= 1.11.0 # GPU version of TensorFlow.
3 |
--------------------------------------------------------------------------------
/bert_task/ner_task/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/bert_task/ner_task/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/albert_task/albert/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/albert_task/albert/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/albert_task/albert/__pycache__/modeling.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/albert_task/albert/__pycache__/modeling.cpython-36.pyc
--------------------------------------------------------------------------------
/albert_task/ner_task/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/albert_task/ner_task/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/bert_task/ner_task/__pycache__/metrics.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/bert_task/ner_task/__pycache__/metrics.cpython-36.pyc
--------------------------------------------------------------------------------
/albert_task/albert/__pycache__/bert_utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/albert_task/albert/__pycache__/bert_utils.cpython-36.pyc
--------------------------------------------------------------------------------
/albert_task/ner_task/__pycache__/metrics.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/albert_task/ner_task/__pycache__/metrics.cpython-36.pyc
--------------------------------------------------------------------------------
/bert_task/ltr_pair_task/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/bert_task/ltr_pair_task/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/bert_task/ner_task/__pycache__/bilstm_crf.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/bert_task/ner_task/__pycache__/bilstm_crf.cpython-36.pyc
--------------------------------------------------------------------------------
/albert_task/albert/__pycache__/optimization.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/albert_task/albert/__pycache__/optimization.cpython-36.pyc
--------------------------------------------------------------------------------
/albert_task/albert/__pycache__/tokenization.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/albert_task/albert/__pycache__/tokenization.cpython-36.pyc
--------------------------------------------------------------------------------
/albert_task/ltr_pair_task/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/albert_task/ltr_pair_task/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/albert_task/ner_task/__pycache__/bilstm_crf.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/albert_task/ner_task/__pycache__/bilstm_crf.cpython-36.pyc
--------------------------------------------------------------------------------
/albert_task/ner_task/__pycache__/data_helper.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/albert_task/ner_task/__pycache__/data_helper.cpython-36.pyc
--------------------------------------------------------------------------------
/bert_task/classifier_task/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/bert_task/classifier_task/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/bert_task/ltr_pair_task/__pycache__/metrics.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/bert_task/ltr_pair_task/__pycache__/metrics.cpython-36.pyc
--------------------------------------------------------------------------------
/bert_task/ltr_point_task/__pycache__/metrics.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/bert_task/ltr_point_task/__pycache__/metrics.cpython-36.pyc
--------------------------------------------------------------------------------
/bert_task/ltr_point_task/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/bert_task/ltr_point_task/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/bert_task/ner_task/__pycache__/data_helper.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/bert_task/ner_task/__pycache__/data_helper.cpython-36.pyc
--------------------------------------------------------------------------------
/albert_task/classifier_task/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/albert_task/classifier_task/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/albert_task/ltr_pair_task/__pycache__/metrics.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/albert_task/ltr_pair_task/__pycache__/metrics.cpython-36.pyc
--------------------------------------------------------------------------------
/bert_task/classifier_task/__pycache__/metrics.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/bert_task/classifier_task/__pycache__/metrics.cpython-36.pyc
--------------------------------------------------------------------------------
/bert_task/sentence_pair_task/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/bert_task/sentence_pair_task/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/albert_task/classifier_task/__pycache__/metrics.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/albert_task/classifier_task/__pycache__/metrics.cpython-36.pyc
--------------------------------------------------------------------------------
/albert_task/ltr_pair_task/__pycache__/data_helper.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/albert_task/ltr_pair_task/__pycache__/data_helper.cpython-36.pyc
--------------------------------------------------------------------------------
/albert_task/sentence_pair_task/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/albert_task/sentence_pair_task/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/bert_task/classifier_task/__pycache__/data_helper.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/bert_task/classifier_task/__pycache__/data_helper.cpython-36.pyc
--------------------------------------------------------------------------------
/bert_task/ltr_pair_task/__pycache__/data_helper.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/bert_task/ltr_pair_task/__pycache__/data_helper.cpython-36.pyc
--------------------------------------------------------------------------------
/bert_task/ltr_point_task/__pycache__/data_helper.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/bert_task/ltr_point_task/__pycache__/data_helper.cpython-36.pyc
--------------------------------------------------------------------------------
/bert_task/machine_reading_task/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/bert_task/machine_reading_task/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/bert_task/sentence_pair_task/__pycache__/metrics.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/bert_task/sentence_pair_task/__pycache__/metrics.cpython-36.pyc
--------------------------------------------------------------------------------
/albert_task/classifier_task/__pycache__/data_helper.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/albert_task/classifier_task/__pycache__/data_helper.cpython-36.pyc
--------------------------------------------------------------------------------
/albert_task/sentence_pair_task/__pycache__/metrics.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/albert_task/sentence_pair_task/__pycache__/metrics.cpython-36.pyc
--------------------------------------------------------------------------------
/bert_task/machine_reading_task/__pycache__/metrics.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/bert_task/machine_reading_task/__pycache__/metrics.cpython-36.pyc
--------------------------------------------------------------------------------
/albert_task/albert/__pycache__/optimization_finetuning.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/albert_task/albert/__pycache__/optimization_finetuning.cpython-36.pyc
--------------------------------------------------------------------------------
/albert_task/sentence_pair_task/__pycache__/data_helper.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/albert_task/sentence_pair_task/__pycache__/data_helper.cpython-36.pyc
--------------------------------------------------------------------------------
/bert_task/machine_reading_task/__pycache__/data_helper.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/bert_task/machine_reading_task/__pycache__/data_helper.cpython-36.pyc
--------------------------------------------------------------------------------
/bert_task/sentence_pair_task/__pycache__/data_helper.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangxinyang227/bert-for-task/HEAD/bert_task/sentence_pair_task/__pycache__/data_helper.cpython-36.pyc
--------------------------------------------------------------------------------
/albert_task/classifier_task/test.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | from predict import Predictor
4 |
5 |
6 | with open("config/tnews_config.json", "r") as fr:
7 | config = json.load(fr)
8 |
9 |
10 | predictor = Predictor(config)
11 | text = "歼20座舱盖上的两条“花纹”是什么?"
12 | res = predictor.predict(text)
13 | print(res)
--------------------------------------------------------------------------------
/albert_task/sentence_pair_task/test.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | from predict import Predictor
4 |
5 |
6 | with open("config/bq_config.json", "r") as fr:
7 | config = json.load(fr)
8 |
9 |
10 | predictor = Predictor(config)
11 |
12 | text_a = "为什么我无法看到额度"
13 | text_b = "为什么开通了却没有额度"
14 | res = predictor.predict(text_a, text_b)
15 | print(res)
--------------------------------------------------------------------------------
/albert_task/albert/create_pretrain_data.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | BERT_BASE_DIR=./albert_config
4 | python3 create_pretraining_data.py --do_whole_word_mask=True --input_file=data/news_zh_1.txt \
5 | --output_file=data/tf_news_2016_zh_raw_news2016zh_1.tfrecord --vocab_file=$BERT_BASE_DIR/vocab.txt --do_lower_case=True \
6 | --max_seq_length=512 --max_predictions_per_seq=51 --masked_lm_prob=0.10
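7 | 
8 | # Note: max_predictions_per_seq follows BERT's convention of roughly
9 | # max_seq_length * masked_lm_prob, i.e. 512 * 0.10 = 51.2, rounded down to 51.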
--------------------------------------------------------------------------------
/albert_task/ner_task/test.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | from predict import Predictor
4 |
5 | with open("config/msraner_config.json", "r") as fr:
6 | config = json.load(fr)
7 |
8 | text = "中 共 中 央 致 中 国 致 公 党 十 一 大 的 贺 词"
9 | text = text.split(" ")
10 | predictor = Predictor(config)
11 | chunks = predictor.predict(text)
12 |
13 | for chunk in chunks:
14 | entity_name, start, end = chunk
15 | entity = "".join(text[start - 1: end])
16 | print(entity_name, entity)
17 |
--------------------------------------------------------------------------------
/albert_task/ltr_point_task/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "model_name": "ltr_point",
3 | "epochs": 5,
4 | "checkpoint_every": 2000,
5 | "eval_every": 2000,
6 | "learning_rate": 3e-5,
7 | "sequence_length": 64,
8 | "batch_size": 16,
9 | "neg_threshold": 0.4,
10 | "warmup_rate": 0.1,
11 | "output_path": "output",
12 | "bert_model_path": "../albert_model/albert_tiny",
13 | "train_data": "data/lcqmc/train.tsv",
14 | "eval_data": "data/lcqmc/dev.tsv",
15 | "ckpt_model_path": "ckpt_model/lcqmc"
16 | }
--------------------------------------------------------------------------------
/bert_task/ltr_point_task/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "model_name": "ltr_point",
3 | "epochs": 5,
4 | "checkpoint_every": 10000,
5 | "eval_every": 10000,
6 | "learning_rate": 2e-5,
7 | "sequence_length": 64,
8 | "batch_size": 16,
9 | "neg_threshold": 0.4,
10 | "warmup_rate": 0.1,
11 | "output_path": "output",
12 | "bert_model_path": "../bert_model/chinese_L-12_H-768_A-12",
13 | "train_data": "data/dssm/train.tsv",
14 | "eval_data": "data/dssm/dev.tsv",
15 | "ckpt_model_path": "ckpt_model/"
16 | }
--------------------------------------------------------------------------------
/albert_task/sentence_pair_task/config/bq_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "model_name": "sentence_pair",
3 | "epochs": 5,
4 | "checkpoint_every": 1000,
5 | "eval_every": 1000,
6 | "learning_rate": 5e-5,
7 | "sequence_length": 128,
8 | "batch_size": 32,
9 | "num_classes": 1,
10 | "warmup_rate": 0.1,
11 | "output_path": "output/bq",
12 | "bert_model_path": "../albert_model/albert_tiny",
13 | "train_data": "data/bq/dev.txt",
14 | "eval_data": "data/bq/dev.txt",
15 | "ckpt_model_path": "ckpt_model/bq"
16 | }
--------------------------------------------------------------------------------
/albert_task/ltr_pair_task/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "model_name": "ltr_pair",
3 | "epochs": 5,
4 | "checkpoint_every": 500,
5 | "eval_every": 500,
6 | "learning_rate": 2e-5,
7 | "sequence_length": 32,
8 | "batch_size": 8,
9 | "num_samples": 2,
10 | "train_n_tasks": 1000,
11 | "eval_n_tasks": 500,
12 | "margin": 0.5,
13 | "warmup_rate": 0.1,
14 | "output_path": "output",
15 | "bert_model_path": "../albert_model/albert_tiny",
16 | "data": "data/data.json",
17 | "ckpt_model_path": "ckpt_model/"
18 | }
--------------------------------------------------------------------------------
/bert_task/ltr_point_task/README.md:
--------------------------------------------------------------------------------
1 | #### Config file guide
2 | 
3 | 
4 | * model_name: name of the model
5 | * epochs: number of training epochs
6 | * checkpoint_every: save a checkpoint every this many steps
7 | * eval_every: run evaluation every this many steps
8 | * learning_rate: learning rate; 2e-5, 5e-5 or 1e-4 is recommended
9 | * sequence_length: sequence length; keep it at or below 128 on a single GPU
10 | * batch_size: keep it at or below 32 on a single GPU
11 | * neg_threshold: threshold for negative samples in the contrastive loss
12 | * warmup_rate: warmup proportion during training; 0.05 or 0.1 is recommended
13 | * output_path: output directory, used to store files such as label_to_index
14 | * bert_model_path: directory of the pre-trained model
15 | * train_data: path to the training data
16 | * eval_data: path to the evaluation data
17 | * ckpt_model_path: directory where checkpoint model files are saved
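18 | 
19 | Loading the config: a minimal sketch of how run.sh's `--config_path` flag can be consumed. The flag name comes from run.sh; the argparse wiring below is an assumption, not the repo's exact trainer.py code:
20 | 
21 | ```python
22 | import argparse
23 | import json
24 | 
25 | parser = argparse.ArgumentParser()
26 | parser.add_argument("--config_path", type=str, required=True,
27 |                     help="path to a task config such as config.json")
28 | args = parser.parse_args()
29 | 
30 | with open(args.config_path, "r") as fr:
31 |     config = json.load(fr)  # plain dict: config["learning_rate"], config["batch_size"], ...
32 | ```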
--------------------------------------------------------------------------------
/bert_task/sentence_pair_task/config/bq_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "model_name": "sentence_pair",
3 | "epochs": 5,
4 | "checkpoint_every": 1000,
5 | "eval_every": 1000,
6 | "learning_rate": 5e-5,
7 | "sequence_length": 128,
8 | "batch_size": 32,
9 | "num_classes": 1,
10 | "warmup_rate": 0.1,
11 | "output_path": "output/bq",
12 | "bert_model_path": "../bert_model/chinese_L-12_H-768_A-12",
13 | "train_data": "data/bq/dev.txt",
14 | "eval_data": "data/bq/dev.txt",
15 | "ckpt_model_path": "ckpt_model/bq"
16 | }
--------------------------------------------------------------------------------
/albert_task/classifier_task/config/inews_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "model_name": "classifier",
3 | "epochs": 10,
4 | "checkpoint_every": 1000,
5 | "eval_every": 1000,
6 | "learning_rate": 5e-5,
7 | "sequence_length": 128,
8 | "batch_size": 32,
9 | "num_classes": 3,
10 | "warmup_rate": 0.1,
11 | "output_path": "output/inews",
12 | "bert_model_path": "../albert_model/albert_tiny",
13 | "train_data": "data/inews/dev.txt",
14 | "eval_data": "data/inews/dev.txt",
15 | "ckpt_model_path": "ckpt_model/inews"
16 | }
--------------------------------------------------------------------------------
/albert_task/classifier_task/config/tnews_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "model_name": "classifier",
3 | "epochs": 10,
4 | "checkpoint_every": 1000,
5 | "eval_every": 1000,
6 | "learning_rate": 5e-5,
7 | "sequence_length": 128,
8 | "batch_size": 32,
9 | "num_classes": 15,
10 | "warmup_rate": 0.1,
11 | "output_path": "output/tnews",
12 | "bert_model_path": "../albert_model/albert_tiny",
13 | "train_data": "data/tnews/dev.txt",
14 | "eval_data": "data/tnews/dev.txt",
15 | "ckpt_model_path": "ckpt_model/tnews"
16 | }
--------------------------------------------------------------------------------
/albert_task/ltr_point_task/README.md:
--------------------------------------------------------------------------------
1 | #### Config file guide
2 | 
3 | 
4 | * model_name: name of the model
5 | * epochs: number of training epochs
6 | * checkpoint_every: save a checkpoint every this many steps
7 | * eval_every: run evaluation every this many steps
8 | * learning_rate: learning rate; 2e-5, 5e-5 or 1e-4 is recommended
9 | * sequence_length: sequence length; keep it at or below 128 on a single GPU
10 | * batch_size: keep it at or below 32 on a single GPU
11 | * neg_threshold: threshold for negative samples in the contrastive loss
12 | * warmup_rate: warmup proportion during training; 0.05 or 0.1 is recommended
13 | * output_path: output directory, used to store files such as label_to_index
14 | * bert_model_path: directory of the pre-trained model
15 | * train_data: path to the training data
16 | * eval_data: path to the evaluation data
17 | * ckpt_model_path: directory where checkpoint model files are saved
--------------------------------------------------------------------------------
/albert_task/sentence_pair_task/config/xnli_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "model_name": "sentence_pair",
3 | "epochs": 5,
4 | "checkpoint_every": 10000,
5 | "eval_every": 10000,
6 | "learning_rate": 2e-5,
7 | "sequence_length": 128,
8 | "batch_size": 32,
9 | "num_classes": 3,
10 | "warmup_rate": 0.1,
11 | "output_path": "output/xnli",
12 | "bert_model_path": "../albert_model/albert_tiny",
13 | "train_data": "data/xnli/dev.txt",
14 | "eval_data": "data/xnli/dev.txt",
15 | "ckpt_model_path": "ckpt_model/xnli"
16 | }
--------------------------------------------------------------------------------
/albert_task/sentence_pair_task/config/lcqmc_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "model_name": "sentence_pair",
3 | "epochs": 5,
4 | "checkpoint_every": 1000,
5 | "eval_every": 1000,
6 | "learning_rate": 5e-5,
7 | "sequence_length": 128,
8 | "batch_size": 32,
9 | "num_classes": 1,
10 | "warmup_rate": 0.1,
11 | "output_path": "output/lcqmc",
12 | "bert_model_path": "../albert_model/albert_tiny",
13 | "train_data": "data/lcqmc/dev.txt",
14 | "eval_data": "data/lcqmc/dev.txt",
15 | "ckpt_model_path": "ckpt_model/lcqmc"
16 | }
--------------------------------------------------------------------------------
/bert_task/ltr_pair_task/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "model_name": "ltr_pair",
3 | "epochs": 5,
4 | "checkpoint_every": 500,
5 | "eval_every": 500,
6 | "learning_rate": 2e-5,
7 | "sequence_length": 32,
8 | "batch_size": 8,
9 | "num_samples": 2,
10 | "train_n_tasks": 100000,
11 | "eval_n_tasks": 500,
12 | "margin": 0.5,
13 | "warmup_rate": 0.1,
14 | "output_path": "output",
15 | "bert_model_path": "../bert_model/chinese_L-12_H-768_A-12",
16 | "data": "data/data.json",
17 | "ckpt_model_path": "ckpt_model/"
18 | }
--------------------------------------------------------------------------------
/bert_task/classifier_task/config/inews_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "model_name": "classifier",
3 | "epochs": 10,
4 | "checkpoint_every": 1000,
5 | "eval_every": 1000,
6 | "learning_rate": 5e-5,
7 | "sequence_length": 128,
8 | "batch_size": 32,
9 | "num_classes": 3,
10 | "warmup_rate": 0.1,
11 | "output_path": "output/inews",
12 | "bert_model_path": "../bert_model/chinese_L-12_H-768_A-12",
13 | "train_data": "data/inews/train.txt",
14 | "eval_data": "data/inews/dev.txt",
15 | "ckpt_model_path": "ckpt_model/inews"
16 | }
--------------------------------------------------------------------------------
/bert_task/classifier_task/config/tnews_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "model_name": "classifier",
3 | "epochs": 10,
4 | "checkpoint_every": 1000,
5 | "eval_every": 1000,
6 | "learning_rate": 5e-5,
7 | "sequence_length": 128,
8 | "batch_size": 32,
9 | "num_classes": 15,
10 | "warmup_rate": 0.1,
11 | "output_path": "output/tnews",
12 | "bert_model_path": "../bert_model/chinese_L-12_H-768_A-12",
13 | "train_data": "data/tnews/train.txt",
14 | "eval_data": "data/tnews/dev.txt",
15 | "ckpt_model_path": "ckpt_model/tnews"
16 | }
--------------------------------------------------------------------------------
/bert_task/sentence_pair_task/config/lcqmc_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "model_name": "sentence_pair",
3 | "epochs": 5,
4 | "checkpoint_every": 1000,
5 | "eval_every": 1000,
6 | "learning_rate": 5e-5,
7 | "sequence_length": 128,
8 | "batch_size": 32,
9 | "num_classes": 1,
10 | "warmup_rate": 0.1,
11 | "output_path": "output/lcqmc",
12 | "bert_model_path": "../bert_model/chinese_L-12_H-768_A-12",
13 | "train_data": "data/lcqmc/dev.txt",
14 | "eval_data": "data/lcqmc/dev.txt",
15 | "ckpt_model_path": "ckpt_model/lcqmc"
16 | }
--------------------------------------------------------------------------------
/bert_task/sentence_pair_task/config/xnli_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "model_name": "sentence_pair",
3 | "epochs": 5,
4 | "checkpoint_every": 10000,
5 | "eval_every": 10000,
6 | "learning_rate": 2e-5,
7 | "sequence_length": 128,
8 | "batch_size": 32,
9 | "num_classes": 3,
10 | "warmup_rate": 0.1,
11 | "output_path": "output/xnli",
12 | "bert_model_path": "../bert_model/chinese_L-12_H-768_A-12",
13 | "train_data": "data/xnli/dev.txt",
14 | "eval_data": "data/xnli/dev.txt",
15 | "ckpt_model_path": "ckpt_model/xnli"
16 | }
--------------------------------------------------------------------------------
/albert_task/classifier_task/config/thucnews_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "model_name": "classifier",
3 | "epochs": 10,
4 | "checkpoint_every": 1000,
5 | "eval_every": 1000,
6 | "learning_rate": 5e-5,
7 | "sequence_length": 128,
8 | "batch_size": 32,
9 | "num_classes": 14,
10 | "warmup_rate": 0.1,
11 | "output_path": "output/thucnews",
12 | "bert_model_path": "../albert_model/albert_tiny",
13 | "train_data": "data/thucnews/dev.txt",
14 | "eval_data": "data/thucnews/dev.txt",
15 | "ckpt_model_path": "ckpt_model/thucnews"
16 | }
--------------------------------------------------------------------------------
/bert_task/classifier_task/config/thucnews_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "model_name": "classifier",
3 | "epochs": 10,
4 | "checkpoint_every": 1000,
5 | "eval_every": 1000,
6 | "learning_rate": 5e-5,
7 | "sequence_length": 128,
8 | "batch_size": 32,
9 | "num_classes": 14,
10 | "warmup_rate": 0.1,
11 | "output_path": "output/thucnews",
12 | "bert_model_path": "../bert_model/chinese_L-12_H-768_A-12",
13 | "train_data": "data/thucnews/train.txt",
14 | "eval_data": "data/thucnews/dev.txt",
15 | "ckpt_model_path": "ckpt_model/thucnews"
16 | }
--------------------------------------------------------------------------------
/albert_task/sentence_pair_task/README.md:
--------------------------------------------------------------------------------
1 | #### Config file guide
2 | 
3 | ##### Using bq_config.json as an example
4 | 
5 | * model_name: name of the model
6 | * epochs: number of training epochs
7 | * checkpoint_every: save a checkpoint every this many steps
8 | * eval_every: run evaluation every this many steps
9 | * learning_rate: learning rate; 2e-5, 5e-5 or 1e-4 is recommended
10 | * sequence_length: sequence length; keep it at or below 128 on a single GPU
11 | * batch_size: keep it at or below 32 on a single GPU
12 | * num_classes: number of classes; set to 1 for binary classification (see the loss sketch below)
13 | * warmup_rate: warmup proportion during training; 0.05 or 0.1 is recommended
14 | * output_path: output directory, used to store files such as label_to_index
15 | * bert_model_path: directory of the pre-trained model
16 | * train_data: path to the training data
17 | * eval_data: path to the evaluation data
18 | * ckpt_model_path: directory where checkpoint model files are saved
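19 | 
20 | Why num_classes is 1 for binary tasks: a single logit pairs naturally with a sigmoid cross-entropy, while multi-way tasks such as xnli (3 classes) use a softmax. A minimal sketch of that branch, assuming this split; it is illustrative, not the repo's actual model.py:
21 | 
22 | ```python
23 | import tensorflow as tf
24 | 
25 | def classification_loss(logits, labels, num_classes):
26 |     """Binary tasks (num_classes == 1) use one sigmoid logit; otherwise softmax."""
27 |     if num_classes == 1:
28 |         labels = tf.cast(tf.reshape(labels, [-1, 1]), tf.float32)
29 |         return tf.reduce_mean(
30 |             tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits))
31 |     return tf.reduce_mean(
32 |         tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))
33 | ```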
--------------------------------------------------------------------------------
/bert_task/sentence_pair_task/README.md:
--------------------------------------------------------------------------------
1 | #### Config file guide
2 | 
3 | ##### Using bq_config.json as an example
4 | 
5 | * model_name: name of the model
6 | * epochs: number of training epochs
7 | * checkpoint_every: save a checkpoint every this many steps
8 | * eval_every: run evaluation every this many steps
9 | * learning_rate: learning rate; 2e-5, 5e-5 or 1e-4 is recommended
10 | * sequence_length: sequence length; keep it at or below 128 on a single GPU
11 | * batch_size: keep it at or below 32 on a single GPU
12 | * num_classes: number of classes; set to 1 for binary classification
13 | * warmup_rate: warmup proportion during training; 0.05 or 0.1 is recommended
14 | * output_path: output directory, used to store files such as label_to_index
15 | * bert_model_path: directory of the pre-trained model
16 | * train_data: path to the training data
17 | * eval_data: path to the evaluation data
18 | * ckpt_model_path: directory where checkpoint model files are saved
--------------------------------------------------------------------------------
/albert_task/classifier_task/README.md:
--------------------------------------------------------------------------------
1 | #### Config file guide
2 | 
3 | ##### Using inews_config.json as an example
4 | 
5 | * model_name: name of the model
6 | * epochs: number of training epochs
7 | * checkpoint_every: save a checkpoint every this many steps
8 | * eval_every: run evaluation every this many steps
9 | * learning_rate: learning rate; 2e-5, 5e-5 or 1e-4 is recommended
10 | * sequence_length: sequence length; keep it at or below 128 on a single GPU
11 | * batch_size: keep it at or below 32 on a single GPU
12 | * num_classes: number of text-classification classes; set to 1 for binary classification
13 | * warmup_rate: warmup proportion during training; 0.05 or 0.1 is recommended
14 | * output_path: output directory, used to store files such as label_to_index
15 | * bert_model_path: directory of the pre-trained model
16 | * train_data: path to the training data
17 | * eval_data: path to the evaluation data
18 | * ckpt_model_path: directory where checkpoint model files are saved
--------------------------------------------------------------------------------
/bert_task/classifier_task/README.md:
--------------------------------------------------------------------------------
1 | #### Config file guide
2 | 
3 | ##### Using inews_config.json as an example
4 | 
5 | * model_name: name of the model
6 | * epochs: number of training epochs
7 | * checkpoint_every: save a checkpoint every this many steps
8 | * eval_every: run evaluation every this many steps
9 | * learning_rate: learning rate; 2e-5, 5e-5 or 1e-4 is recommended
10 | * sequence_length: sequence length; keep it at or below 128 on a single GPU
11 | * batch_size: keep it at or below 32 on a single GPU
12 | * num_classes: number of text-classification classes; set to 1 for binary classification
13 | * warmup_rate: warmup proportion during training; 0.05 or 0.1 is recommended (see the schedule sketch below)
14 | * output_path: output directory, used to store files such as label_to_index
15 | * bert_model_path: directory of the pre-trained model
16 | * train_data: path to the training data
17 | * eval_data: path to the evaluation data
18 | * ckpt_model_path: directory where checkpoint model files are saved
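19 | 
20 | How warmup_rate turns into steps: BERT-style training ramps the learning rate linearly over the first warmup_rate fraction of all training steps and then decays it. A minimal sketch under that assumption; the repo presumably delegates this to bert's optimization.py:
21 | 
22 | ```python
23 | def learning_rate_at(step, num_train_steps, learning_rate=5e-5, warmup_rate=0.1):
24 |     """Linear warmup to learning_rate, then linear decay to zero."""
25 |     num_warmup_steps = int(num_train_steps * warmup_rate)
26 |     if step < num_warmup_steps:
27 |         return learning_rate * step / num_warmup_steps
28 |     return learning_rate * (num_train_steps - step) / (num_train_steps - num_warmup_steps)
29 | 
30 | # 10,000 total steps -> 1,000 warmup steps; halfway through warmup the rate is half of 5e-5
31 | print(learning_rate_at(500, 10000))  # 2.5e-05
32 | ```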
--------------------------------------------------------------------------------
/albert_task/ltr_pair_task/metrics.py:
--------------------------------------------------------------------------------
1 | """
2 | 定义各类性能指标
3 | """
4 | from sklearn.metrics import roc_auc_score
5 |
6 |
7 | def mean(item: list) -> float:
8 | """
9 | 计算列表中元素的平均值
10 | :param item: 列表对象
11 | :return:
12 | """
13 | res = sum(item) / len(item) if len(item) > 0 else 0
14 | return res
15 |
16 |
17 | def accuracy(pred_ys):
18 | """
19 |
20 | :param pred_ys:
21 | :return:
22 | """
23 | correct = 0
24 | for pred_y in pred_ys:
25 | if pred_y == 0:
26 | correct += 1
27 |
28 | return round(correct / len(pred_ys), 4)
--------------------------------------------------------------------------------
/albert_task/ner_task/config/msraner_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "model_name": "ner",
3 | "epochs": 5,
4 | "checkpoint_every": 1000,
5 | "eval_every": 1000,
6 | "learning_rate": 2e-5,
7 | "sequence_length": 128,
8 | "ner_layers": [128],
9 | "ner_hidden_sizes": [128],
10 | "batch_size": 16,
11 | "num_classes": 7,
12 | "keep_prob": 0.9,
13 | "warmup_rate": 0.1,
14 | "output_path": "output",
15 | "bert_model_path": "../albert_model/albert_tiny",
16 | "train_data": "data/msraner/test.txt",
17 | "eval_data": "data/msraner/test.txt",
18 | "ckpt_model_path": "ckpt_model/"
19 | }
--------------------------------------------------------------------------------
/bert_task/ltr_pair_task/metrics.py:
--------------------------------------------------------------------------------
1 | """
2 | 定义各类性能指标
3 | """
4 | from sklearn.metrics import roc_auc_score
5 |
6 |
7 | def mean(item: list) -> float:
8 | """
9 | 计算列表中元素的平均值
10 | :param item: 列表对象
11 | :return:
12 | """
13 | res = sum(item) / len(item) if len(item) > 0 else 0
14 | return res
15 |
16 |
17 | def accuracy(pred_ys):
18 | """
19 |
20 | :param pred_ys:
21 | :return:
22 | """
23 | correct = 0
24 | for pred_y in pred_ys:
25 | if pred_y == 0:
26 | correct += 1
27 |
28 | return round(correct / len(pred_ys), 4)
--------------------------------------------------------------------------------
/albert_task/ltr_pair_task/README.md:
--------------------------------------------------------------------------------
1 | #### Config file guide
2 | 
3 | 
4 | * model_name: name of the model
5 | * epochs: number of training epochs
6 | * checkpoint_every: save a checkpoint every this many steps
7 | * eval_every: run evaluation every this many steps
8 | * learning_rate: learning rate; 2e-5, 5e-5 or 1e-4 is recommended
9 | * sequence_length: sequence length; keep it at or below 128 on a single GPU
10 | * batch_size: keep it at or below 32 on a single GPU
11 | * num_samples: number of candidate samples drawn per query when building a ranking task
12 | * train_n_tasks: number of training tasks sampled per epoch
13 | * eval_n_tasks: number of tasks sampled for each evaluation
14 | * margin: margin in the triplet loss; values between 0.5 and 0.7 are recommended (see the numeric sketch below)
15 | * warmup_rate: warmup proportion during training; 0.05 or 0.1 is recommended
16 | * output_path: output directory, used to store files such as label_to_index
17 | * bert_model_path: directory of the pre-trained model
18 | * data: path to the data
19 | * ckpt_model_path: directory where checkpoint model files are saved
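20 | 
21 | What margin does in the pair-wise objective: the loss penalizes any case where the positive candidate does not beat a negative by at least margin. A minimal numeric sketch; the hinge-on-similarity-gap form is an assumption about model.py, not a verbatim excerpt:
22 | 
23 | ```python
24 | def triplet_loss(pos_sim, neg_sim, margin=0.5):
25 |     """Hinge on the similarity gap: zero once pos_sim >= neg_sim + margin."""
26 |     return max(0.0, margin - (pos_sim - neg_sim))
27 | 
28 | print(triplet_loss(0.9, 0.2))  # 0.0 -> the gap of 0.7 already exceeds the margin
29 | print(triplet_loss(0.6, 0.4))  # 0.3 -> the gap of 0.2 falls short, so a penalty remains
30 | ```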
--------------------------------------------------------------------------------
/bert_task/ltr_pair_task/README.md:
--------------------------------------------------------------------------------
1 | #### Config file guide
2 | 
3 | 
4 | * model_name: name of the model
5 | * epochs: number of training epochs
6 | * checkpoint_every: save a checkpoint every this many steps
7 | * eval_every: run evaluation every this many steps
8 | * learning_rate: learning rate; 2e-5, 5e-5 or 1e-4 is recommended
9 | * sequence_length: sequence length; keep it at or below 128 on a single GPU
10 | * batch_size: keep it at or below 32 on a single GPU
11 | * num_samples: number of candidate samples drawn per query when building a ranking task
12 | * train_n_tasks: number of training tasks sampled per epoch
13 | * eval_n_tasks: number of tasks sampled for each evaluation
14 | * margin: margin in the triplet loss; values between 0.5 and 0.7 are recommended
15 | * warmup_rate: warmup proportion during training; 0.05 or 0.1 is recommended
16 | * output_path: output directory, used to store files such as label_to_index
17 | * bert_model_path: directory of the pre-trained model
18 | * data: path to the data
19 | * ckpt_model_path: directory where checkpoint model files are saved
--------------------------------------------------------------------------------
/bert_task/ner_task/config/msraner_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "model_name": "ner",
3 | "epochs": 5,
4 | "checkpoint_every": 1000,
5 | "eval_every": 1000,
6 | "learning_rate": 2e-5,
7 | "sequence_length": 128,
8 | "ner_layers": [128],
9 | "ner_hidden_sizes": [128],
10 | "batch_size": 16,
11 | "num_classes": 7,
12 | "keep_prob": 0.9,
13 | "warmup_rate": 0.1,
14 | "output_path": "output",
15 | "bert_model_path": "../bert_model/chinese_L-12_H-768_A-12",
16 | "train_data": "data/msraner/test.txt",
17 | "eval_data": "data/msraner/test.txt",
18 | "ckpt_model_path": "ckpt_model/"
19 | }
--------------------------------------------------------------------------------
/bert_task/bert/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 |
--------------------------------------------------------------------------------
/albert_task/ner_task/README.md:
--------------------------------------------------------------------------------
1 | #### Config file guide
2 | 
3 | ##### albert + bilstm + crf
4 | ##### Using msraner_config.json as an example
5 | 
6 | * model_name: name of the model
7 | * epochs: number of training epochs
8 | * checkpoint_every: save a checkpoint every this many steps
9 | * eval_every: run evaluation every this many steps
10 | * learning_rate: learning rate; 2e-5, 5e-5 or 1e-4 is recommended
11 | * sequence_length: sequence length; keep it at or below 128 on a single GPU
12 | * batch_size: keep it at or below 32 on a single GPU
13 | * ner_layers: hidden sizes of the LSTM layers
14 | * ner_hidden_sizes: hidden sizes of the fully-connected layers in the bilstm-ner head
15 | * keep_prob: dropout keep probability in those fully-connected layers, equal to 1 - dropout rate
16 | * num_classes: number of NER tag classes
17 | * warmup_rate: warmup proportion during training; 0.05 or 0.1 is recommended
18 | * output_path: output directory, used to store files such as label_to_index
19 | * bert_model_path: directory of the pre-trained model
20 | * train_data: path to the training data
21 | * eval_data: path to the evaluation data
22 | * ckpt_model_path: directory where checkpoint model files are saved
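23 | 
24 | How BIO tags become the (entity_name, start, end) chunks that test.py unpacks: a minimal decoder sketch. The 1-based, end-inclusive positions are inferred from test.py's `text[start - 1: end]` slicing; the repo's own decoding presumably lives in metrics.py, so treat this as illustrative:
25 | 
26 | ```python
27 | def extract_chunks(tags):
28 |     """Decode a BIO tag sequence into (entity_type, start, end) chunks,
29 |     with 1-based, end-inclusive positions."""
30 |     chunks, start, etype = [], None, None
31 |     for i, tag in enumerate(tags, 1):
32 |         if tag.startswith("B-"):
33 |             if start is not None:
34 |                 chunks.append((etype, start, i - 1))
35 |             start, etype = i, tag[2:]
36 |         elif tag.startswith("I-") and start is not None and tag[2:] == etype:
37 |             continue  # the current entity keeps growing
38 |         else:  # an "o" tag (or inconsistent I- tag) closes any open entity
39 |             if start is not None:
40 |                 chunks.append((etype, start, i - 1))
41 |             start, etype = None, None
42 |     if start is not None:
43 |         chunks.append((etype, start, len(tags)))
44 |     return chunks
45 | 
46 | print(extract_chunks(["o", "o", "B-ns", "I-ns", "o"]))  # [('ns', 3, 4)]
47 | ```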
--------------------------------------------------------------------------------
/bert_task/ner_task/README.md:
--------------------------------------------------------------------------------
1 | #### Config file guide
2 | 
3 | ##### bert + bilstm + crf
4 | ##### Using msraner_config.json as an example
5 | 
6 | * model_name: name of the model
7 | * epochs: number of training epochs
8 | * checkpoint_every: save a checkpoint every this many steps
9 | * eval_every: run evaluation every this many steps
10 | * learning_rate: learning rate; 2e-5, 5e-5 or 1e-4 is recommended
11 | * sequence_length: sequence length; keep it at or below 128 on a single GPU
12 | * batch_size: keep it at or below 32 on a single GPU
13 | * ner_layers: hidden sizes of the LSTM layers
14 | * ner_hidden_sizes: hidden sizes of the fully-connected layers in the bilstm-ner head
15 | * keep_prob: dropout keep probability in those fully-connected layers, equal to 1 - dropout rate
16 | * num_classes: number of NER tag classes
17 | * warmup_rate: warmup proportion during training; 0.05 or 0.1 is recommended
18 | * output_path: output directory, used to store files such as label_to_index
19 | * bert_model_path: directory of the pre-trained model
20 | * train_data: path to the training data
21 | * eval_data: path to the evaluation data
22 | * ckpt_model_path: directory where checkpoint model files are saved
--------------------------------------------------------------------------------
/albert_task/machine_reading_task/config/cmrc_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "model_name": "machine_reading",
3 | "epochs": 10,
4 | "checkpoint_every": 1000,
5 | "eval_every": 1000,
6 | "learning_rate": 5e-5,
7 | "max_length": 512,
8 | "doc_stride": 128,
9 | "query_length": 64,
10 | "max_answer_length": 30,
11 | "n_best_size": 20,
12 | "batch_size": 32,
13 | "warmup_rate": 0.1,
14 | "output_path": "output/cmrc2018",
15 | "output_predictions_path": "output/cmrc2018/predictions.json",
16 | "output_nbest_path": "output/cmrc2018/nbset.json",
17 | "bert_model_path": "../albert_model/albert_tiny",
18 | "train_data": "data/cmrc2018/cmrc2018_train.json",
19 | "eval_data": "data/cmrc2018/cmrc2018_dev.json",
20 | "ckpt_model_path": "ckpt_model/cmrc2018"
21 | }
--------------------------------------------------------------------------------
/bert_task/machine_reading_task/config/cmrc_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "model_name": "machine_reading",
3 | "epochs": 10,
4 | "checkpoint_every": 1000,
5 | "eval_every": 1000,
6 | "learning_rate": 2e-5,
7 | "max_length": 512,
8 | "doc_stride": 128,
9 | "query_length": 64,
10 | "max_answer_length": 30,
11 | "n_best_size": 20,
12 | "batch_size": 8,
13 | "warmup_rate": 0.1,
14 | "output_path": "output/cmrc2018",
15 | "output_predictions_path": "output/cmrc2018/predictions.json",
16 | "output_nbest_path": "output/cmrc2018/nbset.json",
17 | "bert_model_path": "../bert_model/chinese_L-12_H-768_A-12",
18 | "train_data": "data/cmrc2018/cmrc2018_train.json",
19 | "eval_data": "data/cmrc2018/cmrc2018_dev.json",
20 | "ckpt_model_path": "ckpt_model/cmrc2018"
21 | }
--------------------------------------------------------------------------------
/bert_task/machine_reading_task/README.md:
--------------------------------------------------------------------------------
1 | #### Config file guide
2 | 
3 | ##### Using cmrc_config.json as an example
4 | 
5 | * model_name: name of the model
6 | * epochs: number of training epochs
7 | * checkpoint_every: save a checkpoint every this many steps
8 | * eval_every: run evaluation every this many steps
9 | * learning_rate: learning rate; 2e-5, 5e-5 or 1e-4 is recommended
10 | * max_length: maximum input length to the model; 512 is recommended
11 | * doc_stride: long contexts are split into several docs with a sliding window; this is the stride of that window, 128 is recommended (see the sketch below)
12 | * query_length: maximum length of the input question
13 | * max_answer_length: maximum length of a generated answer
14 | * n_best_size: number of top-scoring candidates to keep
15 | * batch_size: keep it at or below 32 on a single GPU
16 | * num_classes: number of classes (not present in cmrc_config.json)
17 | * warmup_rate: warmup proportion during training; 0.05 or 0.1 is recommended
18 | * output_path: output directory, used to store files such as label_to_index
19 | * output_predictions_path: where the best dev-set predictions are written during training
20 | * output_nbest_path: where the n best dev-set predictions are written during training
21 | * bert_model_path: directory of the pre-trained model
22 | * train_data: path to the training data
23 | * eval_data: path to the evaluation data
24 | * ckpt_model_path: directory where checkpoint model files are saved
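25 | 
26 | How doc_stride windows a long context: spans covering the room left after the query are taken every doc_stride tokens, mirroring BERT's run_squad.py. An illustrative sketch; the -3 accounts for the [CLS] and two [SEP] tokens:
27 | 
28 | ```python
29 | def doc_spans(doc_len, max_length=512, query_length=64, doc_stride=128):
30 |     """Yield (start, length) windows over a tokenized context."""
31 |     max_tokens_for_doc = max_length - query_length - 3  # 445 with the cmrc config
32 |     start = 0
33 |     while True:
34 |         length = min(doc_len - start, max_tokens_for_doc)
35 |         yield start, length
36 |         if start + length >= doc_len:
37 |             break
38 |         start += min(length, doc_stride)
39 | 
40 | # A 1,000-token context yields six overlapping windows:
41 | print(list(doc_spans(1000)))
42 | # [(0, 445), (128, 445), (256, 445), (384, 445), (512, 445), (640, 360)]
43 | ```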
--------------------------------------------------------------------------------
/albert_task/albert/run.sh:
--------------------------------------------------------------------------------
1 | python run_classifier.py --task_name=lcqmc --do_train=true --do_eval=true \
2 |     --data_dir=../task_data/lcqmc --vocab_file=pre_trained_model/albert_tiny/vocab.txt \
3 |     --bert_config_file=pre_trained_model/albert_tiny/albert_config_tiny.json \
4 |     --max_seq_length=128 --train_batch_size=64 --learning_rate=1e-4 --num_train_epochs=5 \
5 |     --output_dir=output/lcqmc --init_checkpoint=pre_trained_model/albert_tiny/albert_model.ckpt
6 | 
7 | 
8 | python run_classifier.py --task_name=tnews --do_train=true --do_eval=true \
9 |     --data_dir=../task_data/tnews --vocab_file=pre_trained_model/albert_large/vocab.txt \
10 |     --bert_config_file=pre_trained_model/albert_large/albert_config_large.json \
11 |     --max_seq_length=128 --train_batch_size=8 --learning_rate=2e-5 --num_train_epochs=5 \
12 |     --output_dir=output/tnews --init_checkpoint=pre_trained_model/albert_large/albert_model.ckpt
13 | 
--------------------------------------------------------------------------------
/albert_task/albert/args.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tensorflow as tf
3 | 
4 | tf.logging.set_verbosity(tf.logging.INFO)
5 | 
6 | file_path = os.path.dirname(__file__)
7 | 
8 | 
9 | # model directory
10 | model_dir = os.path.join(file_path, 'albert_lcqmc_checkpoints/')
11 | 
12 | # config file
13 | config_name = os.path.join(file_path, 'albert_config/albert_config.json')
14 | # checkpoint file name
15 | ckpt_name = os.path.join(model_dir, 'model.ckpt')
16 | # output directory
17 | output_dir = os.path.join(file_path, 'albert_lcqmc_checkpoints/')
18 | # vocab file path
19 | vocab_file = os.path.join(file_path, 'albert_config/vocab.txt')
20 | # data directory
21 | data_dir = os.path.join(file_path, 'data/')
22 | 
23 | num_train_epochs = 10
24 | batch_size = 128
25 | learning_rate = 0.00005
26 | 
27 | # fraction of GPU memory to use
28 | gpu_memory_fraction = 0.8
29 | 
30 | # by default, take the output of the second-to-last layer as the sentence vector
31 | layer_indexes = [-2]
32 | 
33 | # maximum sequence length; consider lowering this value for single texts
34 | max_seq_len = 128
35 | 
36 | # graph file name
37 | graph_file = os.path.join(file_path, 'albert_lcqmc_checkpoints/graph')
--------------------------------------------------------------------------------
/albert_task/machine_reading_task/test.py:
--------------------------------------------------------------------------------
1 | import json
2 | from predict import Predictor
3 |
4 |
5 | with open("config/cmrc_config.json", "r") as fr:
6 | config = json.load(fr)
7 |
8 |
9 | predictor = Predictor(config)
10 | query = "锣鼓经运用的程式是什么?"
11 | context = "锣鼓经是大陆传统器乐及戏曲里面常用的打击乐记谱方法,以中文字的声音模拟敲击乐的声音,纪录打击乐的各种不同的" \
12 | "演奏方法。常用的节奏型称为「锣鼓点」。而锣鼓是戏曲节奏的支柱,除了加强演员身段动作的节奏感,也作为音乐的引子" \
13 | "和尾声,提示音乐的板式和速度,以及作为唱腔和念白的伴奏,令诗句的韵律更加抑扬顿锉,段落分明。锣鼓的运用有约定" \
14 | "俗成的程式,依照角色行当的身份、性格、情绪以及环境,配合相应的锣鼓点。锣鼓亦可以模仿大自然的音响效果," \
15 | "如雷电、波浪等等。戏曲锣鼓所运用的敲击乐器主要分为鼓、锣、钹和板四类型:鼓类包括有单皮鼓(板鼓)、大鼓、" \
16 | "大堂鼓(唐鼓)、小堂鼓、怀鼓、花盆鼓等;锣类有大锣、小锣(手锣)、钲锣、筛锣、马锣、镗锣、云锣;钹类有铙钹、" \
17 | "大钹、小钹、水钹、齐钹、镲钹、铰子、碰钟等;打拍子用的檀板、木鱼、梆子等。因为京剧的锣鼓通常由四位乐师负责," \
18 | "又称为四大件,领奏的师傅称为:「鼓佬」,其职责有如西方乐队的指挥,负责控制速度以及利用各种手势提示乐师演奏不" \
19 | "同的锣鼓点。粤剧吸收了部份京剧的锣鼓,但以木鱼和沙的代替了京剧的板和鼓,作为打拍子的主要乐器。以下是京剧、" \
20 | "昆剧和粤剧锣鼓中乐器对应的口诀用字:"
21 |
22 | answer = predictor.predict(query, context)
23 | print(answer)
--------------------------------------------------------------------------------
/bert_task/bert/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to Contribute
2 |
3 | BERT needs to maintain permanent compatibility with the pre-trained model files,
4 | so we do not plan to make any major changes to this library (other than what was
5 | promised in the README). However, we can accept small patches related to
6 | re-factoring and documentation. To submit contributions, there are just a few
7 | small guidelines you need to follow.
8 |
9 | ## Contributor License Agreement
10 |
11 | Contributions to this project must be accompanied by a Contributor License
12 | Agreement. You (or your employer) retain the copyright to your contribution;
13 | this simply gives us permission to use and redistribute your contributions as
14 | part of the project. Head over to <https://cla.developers.google.com/> to see
15 | your current agreements on file or to sign a new one.
16 |
17 | You generally only need to submit a CLA once, so if you've already submitted one
18 | (even if it was for a different project), you probably don't need to do it
19 | again.
20 |
21 | ## Code reviews
22 |
23 | All submissions, including submissions by project members, require review. We
24 | use GitHub pull requests for this purpose. Consult
25 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
26 | information on using pull requests.
27 |
28 | ## Community Guidelines
29 |
30 | This project follows
31 | [Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
32 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ### BERT and ALBERT on downstream tasks
2 | #### This project provides easy-to-use training and prediction modes that can be deployed directly, and it extends easily to any downstream task
3 | 
4 | #### The contents of the albert_task and bert_task folders are essentially identical
5 | * albert_task/albert is the ALBERT source code
6 | * albert_task/albert_model holds four ALBERT models: albert_tiny, albert_base, albert_large, albert_xlarge
7 | * bert_task/bert is the BERT source code
8 | * bert_task/bert_model holds the Chinese BERT model
9 | * Download the ALBERT pre-trained models and place them under albert_task, and the BERT pre-trained model under bert_task
10 | * The pre-trained model path can be configured in each xxx_config.json file
11 | 
12 | #### Five task families are currently provided: classifier, sentence pair, ner, learning to rank (pair wise) and machine reading. The benchmark datasets come from ChineseGLUE
13 | * classifier covers tnews, inews and thucnews
14 | * sentence pair covers bq, lcqmc and xnli
15 | * ner covers msraner
16 | * learning to rank (pair wise) uses the **question equivalence discrimination competition based on adversarial attack** hosted on biendata
17 | * machine reading covers cmrc2018
18 | 
19 | #### Every task follows the same layout
20 | * config: configuration files for each concrete task, covering training parameters, data paths and model storage paths
21 | * data_helper.py: data preprocessing
22 | * metrics.py: performance metrics
23 | * model.py: the model definition; makes it easy to combine BERT with downstream network layers
24 | * trainer.py: model training
25 | * predict.py: prediction code; just instantiate the Predictor class and call its predict method
26 | 
27 | #### Training data formats
28 | ##### Text classification
29 | * title \ content \ label: some datasets contain a title while others only have a body; title, body and label are separated by the `\` symbol (presumably a tab, `\t`, as in the NER format below).
30 | ##### Sentence pairs
31 | * sentence A\sentence B\label: the two sentences and the label are likewise separated by the `\` symbol.
32 | ##### NER
33 | ###### We use BIO tagging; BIOS, BIEO or BIEOS would also work. The input tokens and their tags are separated by `\t`.
34 | * 慕 名 前 来 品 尝 玉 峰 茶 , 领 略 茶 文 化 的 人 越 来 越 多 。\o o o o o o B-ns I-ns o o o o o o o o o o o o o o
35 | ##### Machine reading
36 | * context: the passage for extractive reading comprehension
37 | * question: the question
38 | * answer: the answer, a span extracted from the context
39 | * start_position: start position of the answer
40 | * end_position: end position of the answer
41 | ##### learning_to_rank
42 | * point wise: randomly sample positive and negative pairs to form a pair-classification problem; the format matches the sentence-pair data (see the reader sketch below).
43 | * pair wise: given a query, draw one positive example similar to the query and several negatives dissimilar to it.
44 | #### 训练模型
45 | * 执行每个任务下的sh脚本即可,sh run.sh。只需要更改配置文件就可以训练不同的模型
46 |
47 | #### 预测
48 | * 执行albert_task中每个任务下的test.py文件就可以预测,bert_task同albert_task。
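49 |
50 | A minimal prediction sketch (the config path and input text are placeholders; run it from inside a task folder such as bert_task/classifier_task so that the imports in predict.py resolve):
51 | ```python
52 | import json
53 | from predict import Predictor
54 |
55 | with open("config/tnews_config.json", "r") as fr:
56 |     config = json.load(fr)
57 |
58 | predictor = Predictor(config)
59 | print(predictor.predict("这是一条新闻文本"))
60 | ```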
--------------------------------------------------------------------------------
/bert_task/bert/optimization_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import
16 | from __future__ import division
17 | from __future__ import print_function
18 |
19 | import optimization
20 | import tensorflow as tf
21 |
22 |
23 | class OptimizationTest(tf.test.TestCase):
24 |
25 | def test_adam(self):
26 | with self.test_session() as sess:
27 | w = tf.get_variable(
28 | "w",
29 | shape=[3],
30 | initializer=tf.constant_initializer([0.1, -0.2, -0.1]))
31 | x = tf.constant([0.4, 0.2, -0.5])
32 | loss = tf.reduce_mean(tf.square(x - w))
33 | tvars = tf.trainable_variables()
34 | grads = tf.gradients(loss, tvars)
35 | global_step = tf.train.get_or_create_global_step()
36 | optimizer = optimization.AdamWeightDecayOptimizer(learning_rate=0.2)
37 | train_op = optimizer.apply_gradients(zip(grads, tvars), global_step)
38 | init_op = tf.group(tf.global_variables_initializer(),
39 | tf.local_variables_initializer())
40 | sess.run(init_op)
41 | for _ in range(100):
42 | sess.run(train_op)
43 | w_np = sess.run(w)
44 | self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2)
45 |
46 |
47 | if __name__ == "__main__":
48 | tf.test.main()
49 |
--------------------------------------------------------------------------------
/bert_task/bert/.gitignore:
--------------------------------------------------------------------------------
1 | # Initially taken from Github's Python gitignore file
2 |
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | .hypothesis/
51 | .pytest_cache/
52 |
53 | # Translations
54 | *.mo
55 | *.pot
56 |
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | db.sqlite3
61 |
62 | # Flask stuff:
63 | instance/
64 | .webassets-cache
65 |
66 | # Scrapy stuff:
67 | .scrapy
68 |
69 | # Sphinx documentation
70 | docs/_build/
71 |
72 | # PyBuilder
73 | target/
74 |
75 | # Jupyter Notebook
76 | .ipynb_checkpoints
77 |
78 | # IPython
79 | profile_default/
80 | ipython_config.py
81 |
82 | # pyenv
83 | .python-version
84 |
85 | # celery beat schedule file
86 | celerybeat-schedule
87 |
88 | # SageMath parsed files
89 | *.sage.py
90 |
91 | # Environments
92 | .env
93 | .venv
94 | env/
95 | venv/
96 | ENV/
97 | env.bak/
98 | venv.bak/
99 |
100 | # Spyder project settings
101 | .spyderproject
102 | .spyproject
103 |
104 | # Rope project settings
105 | .ropeproject
106 |
107 | # mkdocs documentation
108 | /site
109 |
110 | # mypy
111 | .mypy_cache/
112 | .dmypy.json
113 | dmypy.json
114 |
115 | # Pyre type checker
116 | .pyre/
117 |
--------------------------------------------------------------------------------
/albert_task/albert/test_changes.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import tensorflow as tf
3 | from modeling import embedding_lookup_factorized,transformer_model
4 | import os
5 |
6 | """
7 | Test the main changes of ALBERT relative to BERT: factorized embedding parameterization,
8 | cross-layer parameter sharing, and sentence-order coherence (SOP)
9 | """
10 | batch_size = 2048
11 | sequence_length = 512
12 | vocab_size = 30000
13 | hidden_size = 1024
14 | num_attention_heads = int(hidden_size / 64)
15 |
16 | def get_total_parameters():
17 | """
18 | get total parameters of a graph
19 | :return:
20 | """
21 | total_parameters = 0
22 | for variable in tf.trainable_variables():
23 | # shape is an array of tf.Dimension
24 | shape = variable.get_shape()
25 | # print(shape)
26 | # print(len(shape))
27 | variable_parameters = 1
28 | for dim in shape:
29 | # print(dim)
30 | variable_parameters *= dim.value
31 | # print(variable_parameters)
32 | total_parameters += variable_parameters
33 | return total_parameters
34 |
35 | def test_factorized_embedding():
36 | """
37 | test of Factorized embedding parameterization
38 | :return:
39 | """
40 | input_ids=tf.zeros((batch_size, sequence_length),dtype=tf.int32)
41 | output, embedding_table, embedding_table_2=embedding_lookup_factorized(input_ids,vocab_size,hidden_size)
42 | print("output:",output)
43 |
44 | def test_share_parameters():
45 | """
46 | test of share parameters across all layers: how many parameter after share parameter across layers of transformer.
47 | :return:
48 | """
49 | def total_parameters_transformer(share_parameter_across_layers):
50 | input_tensor=tf.zeros((batch_size, sequence_length, hidden_size),dtype=tf.float32)
51 | print("transformer_model. input:",input_tensor)
52 | transformer_result=transformer_model(input_tensor,hidden_size=hidden_size,num_attention_heads=num_attention_heads,share_parameter_across_layers=share_parameter_across_layers)
53 | print("transformer_result:",transformer_result)
54 | total_parameters=get_total_parameters()
55 |         print('total_parameters(share={}):'.format(share_parameter_across_layers), total_parameters)
56 |
57 | share_parameter_across_layers=False
58 | total_parameters_transformer(share_parameter_across_layers) # total parameters, not share: 125,976,576 = 125 million
59 |
60 | tf.reset_default_graph() # Clears the default graph stack and resets the global default graph
61 | share_parameter_across_layers=True
62 | total_parameters_transformer(share_parameter_across_layers) # total parameters, share: 10,498,048 = 10.5 million
63 |
64 | def test_sentence_order_prediction():
65 | """
66 | sentence order prediction.
67 |
68 |     check method of create_instances_from_document_albert from create_pretraining_data.py
69 |
70 | :return:
71 | """
72 |     # add execute permission to the script
73 | os.system("chmod +x create_pretrain_data.sh")
74 |
75 | os.system("./create_pretrain_data.sh")
76 |
77 |
78 | # 1.test of Factorized embedding parameterization
79 | #test_factorized_embedding()
80 |
81 | # 2. test of share parameters across all layers: how many parameter after share parameter across layers of transformer.
82 | # before share parameter: 125,976,576; after share parameter: 10,498,048
83 | #test_share_parameters()
84 |
85 | # 3. test of sentence order prediction(SOP)
86 | test_sentence_order_prediction()
87 |
88 |
--------------------------------------------------------------------------------
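
A quick sanity check of the parameter counts quoted in test_changes.py (plain arithmetic; assumes the transformer_model defaults of 12 layers and intermediate size 3072 at hidden size 1024, with biases and two layer norms per layer):

```python
hidden, intermediate, layers = 1024, 3072, 12

attention = 4 * (hidden * hidden + hidden)  # Q, K, V and output projections, with biases
ffn = (hidden * intermediate + intermediate) + (intermediate * hidden + hidden)
layer_norms = 2 * 2 * hidden  # two layer norms per layer, each with gamma and beta

per_layer = attention + ffn + layer_norms
print(per_layer)           # 10,498,048 -> the "share parameters" total
print(per_layer * layers)  # 125,976,576 -> the "not share" total
```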
/bert_task/ner_task/metrics.py:
--------------------------------------------------------------------------------
1 |
2 | """
3 | Performance metric functions
4 | """
5 |
6 |
7 | def mean(item):
8 |     return sum(item) / len(item)
9 |
10 |
11 | def get_chunk_type(index, index_to_label):
12 |     """
13 |     Split an entity tag and return the tag name, its position class (B/I), and the entity type
14 |     """
15 |     label_name = index_to_label[index]
16 |     label_class, label_type = label_name.split("-")
17 |
18 |     return label_name, label_class, label_type
19 |
20 |
21 | def get_chunk(sequence, label_to_index):
22 |     """
23 |     Given a tag sequence, combine entities and their positions into a list of chunks
24 |     """
25 |     unentry = [label_to_index["o"]]
26 |     index_to_label = {index: label for label, index in label_to_index.items()}
27 |     chunks = []
28 |     chunk_type, chunk_start = None, None
29 |     for index, label in enumerate(sequence):
30 |         if label in unentry:
31 |             # non-entity token
32 |             if chunk_type is None:
33 |                 # chunk_type is None: the previous token was also non-entity, keep going
34 |                 continue
35 |             else:
36 |                 # chunk_type is not None: the previous tokens form an entity and the current
37 |                 # token does not, so close the chunk. Covers the common pattern O,B-PER,I-PER,O
38 |                 chunk = (chunk_type, chunk_start, index-1)
39 |                 chunks.append(chunk)
40 |                 chunk_type, chunk_start = None, None
41 |
42 |         if label not in unentry:
43 |             # entity token; here label is the index representation of the tag
44 |             label_name, label_chunk_class, label_chunk_type = get_chunk_type(label, index_to_label)
45 |             if chunk_type is None:
46 |                 # chunk_type is None: the previous token was non-entity, start a new chunk
47 |                 chunk_type, chunk_start = label_chunk_type, index
48 |             elif label_chunk_type == chunk_type:
49 |                 # same entity type as the previous token
50 |                 if index == (len(sequence) - 1):
51 |                     # last token of the sequence: close the chunk directly
52 |                     chunk = (chunk_type, chunk_start, index)
53 |                     chunks.append(chunk)
54 |
55 |                 # two entities of the same type directly adjacent to each other
56 |                 elif label_chunk_class == "B":
57 |                     chunk = (chunk_type, chunk_start, index - 1)
58 |                     chunks.append(chunk)
59 |                     chunk_type, chunk_start = label_chunk_type, index
60 |                 else:
61 |                     # not the last token: keep going
62 |                     continue
63 |             elif label_chunk_type != chunk_type:
64 |                 # different entity type from the previous token: close the previous chunk
65 |                 # and start the next one, e.g. B-PER,I-PER,B-LOC,I-LOC
66 |                 chunk = (chunk_type, chunk_start, index-1)
67 |                 chunks.append(chunk)
68 |                 chunk_type, chunk_start = label_chunk_type, index
69 |
70 |     # flush a chunk that is still open at the end of the sequence (e.g. an entity of a
71 |     # new type starting at the last token), without duplicating the case handled above
72 |     if chunk_type is not None and (not chunks or chunks[-1] != (chunk_type, chunk_start, len(sequence) - 1)):
73 |         chunks.append((chunk_type, chunk_start, len(sequence) - 1))
74 |
75 |     return chunks
76 |
77 |
78 | def gen_metrics(true_y, pred_y, label_to_index):
79 |     """
80 |     Compute f1, recall, and precision
81 |     precision = correctly recognized entities / recognized entities
82 |     recall = correctly recognized entities / true entities in the sample
83 |     """
84 |     correct_preds = 0  # number of correctly recognized entities
85 |     all_preds = 0  # number of recognized entities
86 |     all_trues = 0  # number of true entities in the sample
87 |
88 |     true_chunks = get_chunk(true_y.tolist(), label_to_index)
89 |     pred_chunks = get_chunk(pred_y.tolist(), label_to_index)
90 |     correct_preds += len(set(true_chunks) & set(pred_chunks))
91 |     all_preds += len(pred_chunks)
92 |     all_trues += len(true_chunks)
93 |
94 |     precision = correct_preds / all_preds if correct_preds > 0 else 0
95 |     recall = correct_preds / all_trues if correct_preds > 0 else 0
96 |     f1 = 2 * precision * recall / (precision + recall) if correct_preds > 0 else 0
97 |
98 |     return round(f1, 4), round(precision, 4), round(recall, 4)
--------------------------------------------------------------------------------
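
A toy check of the chunking logic above (run from inside one of the ner_task folders so that the import resolves; the label map is a made-up example):

```python
import numpy as np
from metrics import get_chunk, gen_metrics

label_to_index = {"o": 0, "B-PER": 1, "I-PER": 2, "B-LOC": 3, "I-LOC": 4}

# tags: O B-PER I-PER O B-LOC I-LOC
true_y = np.array([0, 1, 2, 0, 3, 4])
pred_y = np.array([0, 1, 2, 0, 3, 0])

print(get_chunk(true_y.tolist(), label_to_index))   # [('PER', 1, 2), ('LOC', 4, 5)]
print(gen_metrics(true_y, pred_y, label_to_index))  # (0.5, 0.5, 0.5)
```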
/albert_task/ner_task/metrics.py:
--------------------------------------------------------------------------------
1 |
2 | """
3 | Performance metric functions
4 | """
5 |
6 |
7 | def mean(item):
8 |     return sum(item) / len(item)
9 |
10 |
11 | def get_chunk_type(index, index_to_label):
12 |     """
13 |     Split an entity tag and return the tag name, its position class (B/I), and the entity type
14 |     """
15 |     label_name = index_to_label[index]
16 |     label_class, label_type = label_name.split("-")
17 |
18 |     return label_name, label_class, label_type
19 |
20 |
21 | def get_chunk(sequence, label_to_index):
22 |     """
23 |     Given a tag sequence, combine entities and their positions into a list of chunks
24 |     """
25 |     unentry = [label_to_index["o"]]
26 |     index_to_label = {index: label for label, index in label_to_index.items()}
27 |     chunks = []
28 |     chunk_type, chunk_start = None, None
29 |     for index, label in enumerate(sequence):
30 |         if label in unentry:
31 |             # non-entity token
32 |             if chunk_type is None:
33 |                 # chunk_type is None: the previous token was also non-entity, keep going
34 |                 continue
35 |             else:
36 |                 # chunk_type is not None: the previous tokens form an entity and the current
37 |                 # token does not, so close the chunk. Covers the common pattern O,B-PER,I-PER,O
38 |                 chunk = (chunk_type, chunk_start, index-1)
39 |                 chunks.append(chunk)
40 |                 chunk_type, chunk_start = None, None
41 |
42 |         if label not in unentry:
43 |             # entity token; here label is the index representation of the tag
44 |             label_name, label_chunk_class, label_chunk_type = get_chunk_type(label, index_to_label)
45 |             if chunk_type is None:
46 |                 # chunk_type is None: the previous token was non-entity, start a new chunk
47 |                 chunk_type, chunk_start = label_chunk_type, index
48 |             elif label_chunk_type == chunk_type:
49 |                 # same entity type as the previous token
50 |                 if index == (len(sequence) - 1):
51 |                     # last token of the sequence: close the chunk directly
52 |                     chunk = (chunk_type, chunk_start, index)
53 |                     chunks.append(chunk)
54 |
55 |                 # two entities of the same type directly adjacent to each other
56 |                 elif label_chunk_class == "B":
57 |                     chunk = (chunk_type, chunk_start, index - 1)
58 |                     chunks.append(chunk)
59 |                     chunk_type, chunk_start = label_chunk_type, index
60 |                 else:
61 |                     # not the last token: keep going
62 |                     continue
63 |             elif label_chunk_type != chunk_type:
64 |                 # different entity type from the previous token: close the previous chunk
65 |                 # and start the next one, e.g. B-PER,I-PER,B-LOC,I-LOC
66 |                 chunk = (chunk_type, chunk_start, index-1)
67 |                 chunks.append(chunk)
68 |                 chunk_type, chunk_start = label_chunk_type, index
69 |
70 |     # flush a chunk that is still open at the end of the sequence (e.g. an entity of a
71 |     # new type starting at the last token), without duplicating the case handled above
72 |     if chunk_type is not None and (not chunks or chunks[-1] != (chunk_type, chunk_start, len(sequence) - 1)):
73 |         chunks.append((chunk_type, chunk_start, len(sequence) - 1))
74 |
75 |     return chunks
76 |
77 |
78 | def gen_metrics(true_y, pred_y, label_to_index):
79 |     """
80 |     Compute f1, recall, and precision
81 |     precision = correctly recognized entities / recognized entities
82 |     recall = correctly recognized entities / true entities in the sample
83 |     """
84 |     correct_preds = 0  # number of correctly recognized entities
85 |     all_preds = 0  # number of recognized entities
86 |     all_trues = 0  # number of true entities in the sample
87 |
88 |     true_chunks = get_chunk(true_y.tolist(), label_to_index)
89 |     pred_chunks = get_chunk(pred_y.tolist(), label_to_index)
90 |     correct_preds += len(set(true_chunks) & set(pred_chunks))
91 |     all_preds += len(pred_chunks)
92 |     all_trues += len(true_chunks)
93 |
94 |     precision = correct_preds / all_preds if correct_preds > 0 else 0
95 |     recall = correct_preds / all_trues if correct_preds > 0 else 0
96 |     f1 = 2 * precision * recall / (precision + recall) if correct_preds > 0 else 0
97 |
98 |     return round(f1, 4), round(precision, 4), round(recall, 4)
--------------------------------------------------------------------------------
/bert_task/classifier_task/predict.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import sys
4 | sys.path.append(os.path.abspath(os.path.dirname(os.getcwd())))
5 |
6 | import tensorflow as tf
7 | from model import BertClassifier
8 | from bert import tokenization
9 |
10 |
11 | class Predictor(object):
12 | def __init__(self, config):
13 | self.model = None
14 | self.config = config
15 |
16 | self.output_path = config["output_path"]
17 | self.vocab_path = os.path.join(config["bert_model_path"], "vocab.txt")
18 | self.label_to_index = self.load_vocab()
19 | self.index_to_label = {value: key for key, value in self.label_to_index.items()}
20 | self.word_vectors = None
21 | self.sequence_length = self.config["sequence_length"]
22 |
23 |         # create the model
24 |         self.create_model()
25 |         # load the computation graph
26 |         self.load_graph()
27 |
28 | def load_vocab(self):
29 |         # load the label-to-index mapping
30 |
31 | with open(os.path.join(self.output_path, "label_to_index.json"), "r") as f:
32 | label_to_index = json.load(f)
33 |
34 | return label_to_index
35 |
36 | def padding(self, input_id, input_mask, segment_id):
37 | """
38 |         Pad or truncate the sequences to sequence_length
39 | :param input_id:
40 | :param input_mask:
41 | :param segment_id:
42 | :return:
43 | """
44 |
45 | if len(input_id) < self.sequence_length:
46 | pad_input_id = input_id + [0] * (self.sequence_length - len(input_id))
47 | pad_input_mask = input_mask + [0] * (self.sequence_length - len(input_mask))
48 | pad_segment_id = segment_id + [0] * (self.sequence_length - len(segment_id))
49 | else:
50 | pad_input_id = input_id[:self.sequence_length]
51 | pad_input_mask = input_mask[:self.sequence_length]
52 | pad_segment_id = segment_id[:self.sequence_length]
53 |
54 | return pad_input_id, pad_input_mask, pad_segment_id
55 |
56 | def sentence_to_idx(self, text):
57 | """
58 |         Convert the sentence into an index representation
59 | :return:
60 | """
61 | tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_path, do_lower_case=True)
62 |
63 | text = tokenization.convert_to_unicode(text)
64 | tokens = tokenizer.tokenize(text)
65 | tokens = ["[CLS]"] + tokens + ["[SEP]"]
66 | input_id = tokenizer.convert_tokens_to_ids(tokens)
67 | input_mask = [1] * len(input_id)
68 | segment_id = [0] * len(input_id)
69 |
70 | input_id, input_mask, segment_id = self.padding(input_id, input_mask, segment_id)
71 |
72 | return [input_id], [input_mask], [segment_id]
73 |
74 | def load_graph(self):
75 | """
76 |         Load the computation graph
77 | :return:
78 | """
79 | self.sess = tf.Session()
80 | ckpt = tf.train.get_checkpoint_state(self.config["ckpt_model_path"])
81 | if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
82 | print('Reloading model parameters..')
83 | self.model.saver.restore(self.sess, ckpt.model_checkpoint_path)
84 | else:
85 | raise ValueError('No such file:[{}]'.format(self.config["ckpt_model_path"]))
86 |
87 | def create_model(self):
88 | """
89 |         Select the corresponding model according to the config file and initialize it
90 | :return:
91 | """
92 | self.model = BertClassifier(config=self.config, is_training=False)
93 |
94 | def predict(self, text):
95 | """
96 |         Given a sentence, predict its classification result
97 | :param text:
98 | :return:
99 | """
100 | input_ids, input_masks, segment_ids = self.sentence_to_idx(text)
101 |
102 | prediction = self.model.infer(self.sess,
103 | dict(input_ids=input_ids,
104 | input_masks=input_masks,
105 | segment_ids=segment_ids)).tolist()[0]
106 | label = self.index_to_label[prediction]
107 | return label
108 |
109 |
110 |
--------------------------------------------------------------------------------
/albert_task/classifier_task/predict.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import sys
4 | sys.path.append(os.path.abspath(os.path.dirname(os.getcwd())))
5 |
6 | import tensorflow as tf
7 | from model import AlbertClassifier
8 | from albert import tokenization
9 |
10 |
11 | class Predictor(object):
12 | def __init__(self, config):
13 | self.model = None
14 | self.config = config
15 |
16 | self.output_path = config["output_path"]
17 | self.vocab_path = os.path.join(config["bert_model_path"], "vocab.txt")
18 | self.label_to_index = self.load_vocab()
19 | self.index_to_label = {value: key for key, value in self.label_to_index.items()}
20 | self.word_vectors = None
21 | self.sequence_length = self.config["sequence_length"]
22 |
23 |         # create the model
24 |         self.create_model()
25 |         # load the computation graph
26 |         self.load_graph()
27 |
28 | def load_vocab(self):
29 |         # load the label-to-index mapping
30 |
31 | with open(os.path.join(self.output_path, "label_to_index.json"), "r") as f:
32 | label_to_index = json.load(f)
33 |
34 | return label_to_index
35 |
36 | def padding(self, input_id, input_mask, segment_id):
37 | """
38 |         Pad or truncate the sequences to sequence_length
39 | :param input_id:
40 | :param input_mask:
41 | :param segment_id:
42 | :return:
43 | """
44 |
45 | if len(input_id) < self.sequence_length:
46 | pad_input_id = input_id + [0] * (self.sequence_length - len(input_id))
47 | pad_input_mask = input_mask + [0] * (self.sequence_length - len(input_mask))
48 | pad_segment_id = segment_id + [0] * (self.sequence_length - len(segment_id))
49 | else:
50 | pad_input_id = input_id[:self.sequence_length]
51 | pad_input_mask = input_mask[:self.sequence_length]
52 | pad_segment_id = segment_id[:self.sequence_length]
53 |
54 | return pad_input_id, pad_input_mask, pad_segment_id
55 |
56 | def sentence_to_idx(self, text):
57 | """
58 |         Convert the sentence into an index representation
59 | :return:
60 | """
61 | tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_path, do_lower_case=True)
62 |
63 | text = tokenization.convert_to_unicode(text)
64 | tokens = tokenizer.tokenize(text)
65 | tokens = ["[CLS]"] + tokens + ["[SEP]"]
66 | input_id = tokenizer.convert_tokens_to_ids(tokens)
67 | input_mask = [1] * len(input_id)
68 | segment_id = [0] * len(input_id)
69 |
70 | input_id, input_mask, segment_id = self.padding(input_id, input_mask, segment_id)
71 |
72 | return [input_id], [input_mask], [segment_id]
73 |
74 | def load_graph(self):
75 | """
76 |         Load the computation graph
77 | :return:
78 | """
79 | self.sess = tf.Session()
80 | ckpt = tf.train.get_checkpoint_state(self.config["ckpt_model_path"])
81 | if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
82 | print('Reloading model parameters..')
83 | self.model.saver.restore(self.sess, ckpt.model_checkpoint_path)
84 | else:
85 | raise ValueError('No such file:[{}]'.format(self.config["ckpt_model_path"]))
86 |
87 | def create_model(self):
88 | """
89 |         Select the corresponding model according to the config file and initialize it
90 | :return:
91 | """
92 | self.model = AlbertClassifier(config=self.config, is_training=False)
93 |
94 | def predict(self, text):
95 | """
96 |         Given a sentence, predict its classification result
97 | :param text:
98 | :return:
99 | """
100 | input_ids, input_masks, segment_ids = self.sentence_to_idx(text)
101 |
102 | prediction = self.model.infer(self.sess,
103 | dict(input_ids=input_ids,
104 | input_masks=input_masks,
105 | segment_ids=segment_ids)).tolist()[0]
106 | label = self.index_to_label[prediction]
107 | return label
108 |
109 |
110 |
--------------------------------------------------------------------------------
/bert_task/ner_task/predict.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import sys
4 | sys.path.append(os.path.abspath(os.path.dirname(os.getcwd())))
5 |
6 | import tensorflow as tf
7 | from model import BertNer
8 | from bert import tokenization
9 | from metrics import get_chunk
10 |
11 |
12 | class Predictor(object):
13 | def __init__(self, config):
14 | self.model = None
15 | self.config = config
16 |
17 | self.output_path = config["output_path"]
18 | self.vocab_path = os.path.join(config["bert_model_path"], "vocab.txt")
19 | self.label_to_index = self.load_vocab()
20 | self.word_vectors = None
21 | self.sequence_length = self.config["sequence_length"]
22 |
23 |         # create the model
24 |         self.create_model()
25 |         # load the computation graph
26 |         self.load_graph()
27 |
28 | def load_vocab(self):
29 |         # load the label-to-index mapping
30 |
31 | with open(os.path.join(self.output_path, "label_to_index.json"), "r") as f:
32 | label_to_index = json.load(f)
33 |
34 | return label_to_index
35 |
36 | def padding(self, input_id, input_mask, segment_id):
37 | """
38 |         Pad or truncate the sequences to sequence_length
39 | :param input_id:
40 | :param input_mask:
41 | :param segment_id:
42 | :return:
43 | """
44 |
45 | if len(input_id) < self.sequence_length:
46 | pad_input_id = input_id + [0] * (self.sequence_length - len(input_id))
47 | pad_input_mask = input_mask + [0] * (self.sequence_length - len(input_mask))
48 | pad_segment_id = segment_id + [0] * (self.sequence_length - len(segment_id))
49 | sequence_len = len(input_id)
50 | else:
51 | pad_input_id = input_id[:self.sequence_length]
52 | pad_input_mask = input_mask[:self.sequence_length]
53 | pad_segment_id = segment_id[:self.sequence_length]
54 | sequence_len = self.sequence_length
55 |
56 | return pad_input_id, pad_input_mask, pad_segment_id, sequence_len
57 |
58 | def sentence_to_idx(self, text):
59 | """
60 |         Convert the sentence into an index representation
61 | :return:
62 | """
63 | tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_path, do_lower_case=True)
64 |
65 | tokens = []
66 | for token in text:
67 | token = tokenizer.tokenize(token)
68 | tokens.extend(token)
69 |
70 | tokens = ["[CLS]"] + tokens + ["[SEP]"]
71 | input_id = tokenizer.convert_tokens_to_ids(tokens)
72 |
73 | input_mask = [1] * len(input_id)
74 | segment_id = [0] * len(input_id)
75 |
76 | input_id, input_mask, segment_id, sequence_len = self.padding(input_id, input_mask, segment_id)
77 |
78 | return [input_id], [input_mask], [segment_id], [sequence_len]
79 |
80 | def load_graph(self):
81 | """
82 |         Load the computation graph
83 | :return:
84 | """
85 | self.sess = tf.Session()
86 | ckpt = tf.train.get_checkpoint_state(self.config["ckpt_model_path"])
87 | if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
88 | print('Reloading model parameters..')
89 | self.model.saver.restore(self.sess, ckpt.model_checkpoint_path)
90 | else:
91 | raise ValueError('No such file:[{}]'.format(self.config["ckpt_model_path"]))
92 |
93 | def create_model(self):
94 | """
95 |         Select the corresponding model according to the config file and initialize it
96 | :return:
97 | """
98 | self.model = BertNer(config=self.config, is_training=False)
99 |
100 | def predict(self, text):
101 | """
102 |         Given a sentence, predict the entity chunks it contains
103 | :param text:
104 | :return:
105 | """
106 | input_ids, input_masks, segment_ids, sequence_len = self.sentence_to_idx(text)
107 |
108 | prediction = self.model.infer(self.sess,
109 | dict(input_ids=input_ids,
110 | input_masks=input_masks,
111 | segment_ids=segment_ids,
112 | sequence_len=sequence_len)).tolist()
113 | print(prediction)
114 | chunks = get_chunk(prediction, self.label_to_index)
115 | return chunks
116 |
117 |
118 |
--------------------------------------------------------------------------------
/albert_task/ner_task/predict.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import sys
4 | sys.path.append(os.path.abspath(os.path.dirname(os.getcwd())))
5 |
6 | import tensorflow as tf
7 | from model import ALBertNer
8 | from albert import tokenization
9 | from metrics import get_chunk
10 |
11 |
12 | class Predictor(object):
13 | def __init__(self, config):
14 | self.model = None
15 | self.config = config
16 |
17 | self.output_path = config["output_path"]
18 | self.vocab_path = os.path.join(config["bert_model_path"], "vocab.txt")
19 | self.label_to_index = self.load_vocab()
20 | self.word_vectors = None
21 | self.sequence_length = self.config["sequence_length"]
22 |
23 |         # create the model
24 |         self.create_model()
25 |         # load the computation graph
26 |         self.load_graph()
27 |
28 | def load_vocab(self):
29 |         # load the label-to-index mapping
30 |
31 | with open(os.path.join(self.output_path, "label_to_index.json"), "r") as f:
32 | label_to_index = json.load(f)
33 |
34 | return label_to_index
35 |
36 | def padding(self, input_id, input_mask, segment_id):
37 | """
38 |         Pad or truncate the sequences to sequence_length
39 | :param input_id:
40 | :param input_mask:
41 | :param segment_id:
42 | :return:
43 | """
44 |
45 | if len(input_id) < self.sequence_length:
46 | pad_input_id = input_id + [0] * (self.sequence_length - len(input_id))
47 | pad_input_mask = input_mask + [0] * (self.sequence_length - len(input_mask))
48 | pad_segment_id = segment_id + [0] * (self.sequence_length - len(segment_id))
49 | sequence_len = len(input_id)
50 | else:
51 | pad_input_id = input_id[:self.sequence_length]
52 | pad_input_mask = input_mask[:self.sequence_length]
53 | pad_segment_id = segment_id[:self.sequence_length]
54 | sequence_len = self.sequence_length
55 |
56 | return pad_input_id, pad_input_mask, pad_segment_id, sequence_len
57 |
58 | def sentence_to_idx(self, text):
59 | """
60 |         Convert the sentence into an index representation
61 | :return:
62 | """
63 | tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_path, do_lower_case=True)
64 |
65 | tokens = []
66 | for token in text:
67 | token = tokenizer.tokenize(token)
68 | tokens.extend(token)
69 |
70 | tokens = ["[CLS]"] + tokens + ["[SEP]"]
71 | input_id = tokenizer.convert_tokens_to_ids(tokens)
72 |
73 | input_mask = [1] * len(input_id)
74 | segment_id = [0] * len(input_id)
75 |
76 | input_id, input_mask, segment_id, sequence_len = self.padding(input_id, input_mask, segment_id)
77 |
78 | return [input_id], [input_mask], [segment_id], [sequence_len]
79 |
80 | def load_graph(self):
81 | """
82 |         Load the computation graph
83 | :return:
84 | """
85 | self.sess = tf.Session()
86 | ckpt = tf.train.get_checkpoint_state(self.config["ckpt_model_path"])
87 | if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
88 | print('Reloading model parameters..')
89 | self.model.saver.restore(self.sess, ckpt.model_checkpoint_path)
90 | else:
91 | raise ValueError('No such file:[{}]'.format(self.config["ckpt_model_path"]))
92 |
93 | def create_model(self):
94 | """
95 |         Select the corresponding model according to the config file and initialize it
96 | :return:
97 | """
98 | self.model = ALBertNer(config=self.config, is_training=False)
99 |
100 | def predict(self, text):
101 | """
102 |         Given a sentence, predict the entity chunks it contains
103 | :param text:
104 | :return:
105 | """
106 | input_ids, input_masks, segment_ids, sequence_len = self.sentence_to_idx(text)
107 |
108 | prediction = self.model.infer(self.sess,
109 | dict(input_ids=input_ids,
110 | input_masks=input_masks,
111 | segment_ids=segment_ids,
112 | sequence_len=sequence_len)).tolist()
113 | print(prediction)
114 | chunks = get_chunk(prediction, self.label_to_index)
115 | return chunks
116 |
117 |
118 |
--------------------------------------------------------------------------------
/bert_task/bert/sample_text.txt:
--------------------------------------------------------------------------------
1 | This text is included to make sure Unicode is handled properly: 力加勝北区ᴵᴺᵀᵃছজটডণত
2 | Text should be one-sentence-per-line, with empty lines between documents.
3 | This sample text is public domain and was randomly selected from Project Gutenberg.
4 |
5 | The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors.
6 | Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity.
7 | Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them.
8 | "Cass" Beard had risen early that morning, but not with a view to discovery.
9 | A leak in his cabin roof,--quite consistent with his careless, improvident habits,--had roused him at 4 A. M., with a flooded "bunk" and wet blankets.
10 | The chips from his wood pile refused to kindle a fire to dry his bed-clothes, and he had recourse to a more provident neighbor's to supply the deficiency.
11 | This was nearly opposite.
12 | Mr. Cassius crossed the highway, and stopped suddenly.
13 | Something glittered in the nearest red pool before him.
14 | Gold, surely!
15 | But, wonderful to relate, not an irregular, shapeless fragment of crude ore, fresh from Nature's crucible, but a bit of jeweler's handicraft in the form of a plain gold ring.
16 | Looking at it more attentively, he saw that it bore the inscription, "May to Cass."
17 | Like most of his fellow gold-seekers, Cass was superstitious.
18 |
19 | The fountain of classic wisdom, Hypatia herself.
20 | As the ancient sage--the name is unimportant to a monk--pumped water nightly that he might study by day, so I, the guardian of cloaks and parasols, at the sacred doors of her lecture-room, imbibe celestial knowledge.
21 | From my youth I felt in me a soul above the matter-entangled herd.
22 | She revealed to me the glorious fact, that I am a spark of Divinity itself.
23 | A fallen star, I am, sir!' continued he, pensively, stroking his lean stomach--'a fallen star!--fallen, if the dignity of philosophy will allow of the simile, among the hogs of the lower world--indeed, even into the hog-bucket itself. Well, after all, I will show you the way to the Archbishop's.
24 | There is a philosophic pleasure in opening one's treasures to the modest young.
25 | Perhaps you will assist me by carrying this basket of fruit?' And the little man jumped up, put his basket on Philammon's head, and trotted off up a neighbouring street.
26 | Philammon followed, half contemptuous, half wondering at what this philosophy might be, which could feed the self-conceit of anything so abject as his ragged little apish guide;
27 | but the novel roar and whirl of the street, the perpetual stream of busy faces, the line of curricles, palanquins, laden asses, camels, elephants, which met and passed him, and squeezed him up steps and into doorways, as they threaded their way through the great Moon-gate into the ample street beyond, drove everything from his mind but wondering curiosity, and a vague, helpless dread of that great living wilderness, more terrible than any dead wilderness of sand which he had left behind.
28 | Already he longed for the repose, the silence of the Laura--for faces which knew him and smiled upon him; but it was too late to turn back now.
29 | His guide held on for more than a mile up the great main street, crossed in the centre of the city, at right angles, by one equally magnificent, at each end of which, miles away, appeared, dim and distant over the heads of the living stream of passengers, the yellow sand-hills of the desert;
30 | while at the end of the vista in front of them gleamed the blue harbour, through a network of countless masts.
31 | At last they reached the quay at the opposite end of the street;
32 | and there burst on Philammon's astonished eyes a vast semicircle of blue sea, ringed with palaces and towers.
33 | He stopped involuntarily; and his little guide stopped also, and looked askance at the young monk, to watch the effect which that grand panorama should produce on him.
34 |
--------------------------------------------------------------------------------
/bert_task/classifier_task/model.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | sys.path.append(os.path.dirname(os.getcwd()))
4 | import tensorflow as tf
5 |
6 | from bert import modeling
7 | from bert import optimization
8 |
9 |
10 | class BertClassifier(object):
11 | def __init__(self, config, is_training=True, num_train_step=None, num_warmup_step=None):
12 | self.__bert_config_path = os.path.join(config["bert_model_path"], "bert_config.json")
13 | self.__num_classes = config["num_classes"]
14 | self.__learning_rate = config["learning_rate"]
15 | self.__is_training = is_training
16 | self.__num_train_step = num_train_step
17 | self.__num_warmup_step = num_warmup_step
18 |
19 | self.input_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_ids')
20 | self.input_masks = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_mask')
21 | self.segment_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name='segment_ids')
22 | self.label_ids = tf.placeholder(dtype=tf.int32, shape=[None], name="label_ids")
23 |
24 | self.built_model()
25 | self.init_saver()
26 |
27 | def built_model(self):
28 | bert_config = modeling.BertConfig.from_json_file(self.__bert_config_path)
29 |
30 | model = modeling.BertModel(config=bert_config,
31 | is_training=self.__is_training,
32 | input_ids=self.input_ids,
33 | input_mask=self.input_masks,
34 | token_type_ids=self.segment_ids,
35 | use_one_hot_embeddings=False)
36 | output_layer = model.get_pooled_output()
37 |
38 | hidden_size = output_layer.shape[-1].value
39 | if self.__is_training:
40 | # I.e., 0.1 dropout
41 | output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
42 |
43 | with tf.name_scope("output"):
44 | output_weights = tf.get_variable(
45 | "output_weights", [self.__num_classes, hidden_size],
46 | initializer=tf.truncated_normal_initializer(stddev=0.02))
47 |
48 | output_bias = tf.get_variable(
49 | "output_bias", [self.__num_classes], initializer=tf.zeros_initializer())
50 |
51 | logits = tf.matmul(output_layer, output_weights, transpose_b=True)
52 | logits = tf.nn.bias_add(logits, output_bias)
53 | self.predictions = tf.argmax(logits, axis=-1, name="predictions")
54 |
55 | if self.__is_training:
56 |
57 | with tf.name_scope("loss"):
58 | losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=self.label_ids)
59 | self.loss = tf.reduce_mean(losses, name="loss")
60 |
61 | with tf.name_scope('train_op'):
62 | self.train_op = optimization.create_optimizer(
63 | self.loss, self.__learning_rate, self.__num_train_step, self.__num_warmup_step, use_tpu=False)
64 |
65 | def init_saver(self):
66 | self.saver = tf.train.Saver(tf.global_variables())
67 |
68 | def train(self, sess, batch):
69 | """
70 |         Train the model
71 |         :param sess: tf session object
72 |         :param batch: batch data
73 |         :return: loss and predictions
74 | """
75 |
76 | feed_dict = {self.input_ids: batch["input_ids"],
77 | self.input_masks: batch["input_masks"],
78 | self.segment_ids: batch["segment_ids"],
79 | self.label_ids: batch["label_ids"]}
80 |
81 |         # run one training step
82 | _, loss, predictions = sess.run([self.train_op, self.loss, self.predictions], feed_dict=feed_dict)
83 | return loss, predictions
84 |
85 | def eval(self, sess, batch):
86 | """
87 |         Evaluate the model
88 |         :param sess: tf session object
89 |         :param batch: batch data
90 |         :return: loss and predictions
91 | """
92 | feed_dict = {self.input_ids: batch["input_ids"],
93 | self.input_masks: batch["input_masks"],
94 | self.segment_ids: batch["segment_ids"],
95 | self.label_ids: batch["label_ids"]}
96 |
97 | loss, predictions = sess.run([self.loss, self.predictions], feed_dict=feed_dict)
98 | return loss, predictions
99 |
100 | def infer(self, sess, batch):
101 | """
102 |         Predict on new data
103 |         :param sess: tf session object
104 |         :param batch: batch data
105 |         :return: predictions
106 | """
107 | feed_dict = {self.input_ids: batch["input_ids"],
108 | self.input_masks: batch["input_masks"],
109 | self.segment_ids: batch["segment_ids"]}
110 |
111 | predict = sess.run(self.predictions, feed_dict=feed_dict)
112 |
113 | return predict
114 |
--------------------------------------------------------------------------------
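
The classification head above is a single affine layer over BERT's pooled [CLS] vector. A shape sketch in numpy (illustrative sizes only; B = batch, H = hidden size, C = num_classes):

```python
import numpy as np

B, H, C = 8, 768, 15
pooled = np.random.randn(B, H).astype("float32")    # stands in for model.get_pooled_output()
W = 0.02 * np.random.randn(C, H).astype("float32")  # output_weights
b = np.zeros(C, dtype="float32")                    # output_bias

logits = pooled @ W.T + b               # mirrors tf.matmul(..., transpose_b=True) + bias
predictions = logits.argmax(axis=-1)    # mirrors tf.argmax(logits, axis=-1)
print(logits.shape, predictions.shape)  # (8, 15) (8,)
```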
/bert_task/ltr_pair_task/trainer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import random
4 | import argparse
5 | import sys
6 |
7 | sys.path.append(os.path.dirname(os.getcwd()))
8 | import tensorflow as tf
9 | from bert import modeling
10 | from model import BertPairLTR
11 | from data_helper import TrainData
12 | from metrics import mean, accuracy
13 |
14 |
15 | class Trainer(object):
16 | def __init__(self, args):
17 | self.args = args
18 | with open(args.config_path, "r") as fr:
19 | self.config = json.load(fr)
20 | self.__bert_checkpoint_path = os.path.join(self.config["bert_model_path"], "bert_model.ckpt")
21 |
22 |         # load the dataset
23 | self.data_obj = self.load_data()
24 | self.queries = self.data_obj.gen_data(self.config["data"])
25 |
26 | print("train data size: {}".format(len(self.queries)))
27 |
28 | num_train_steps = int(self.config["train_n_tasks"] / self.config["batch_size"] * self.config["epochs"])
29 | num_warmup_steps = int(num_train_steps * self.config["warmup_rate"])
30 |         # initialize the model object
31 | self.model = self.create_model(num_train_steps, num_warmup_steps)
32 |
33 | def load_data(self):
34 | """
35 |         Create the data object
36 |         :return:
37 |         """
38 |         # create the training-data object used to generate training data
39 | data_obj = TrainData(self.config)
40 | return data_obj
41 |
42 | def create_model(self, num_train_step, num_warmup_step):
43 | """
44 |         Select the corresponding model according to the config file and initialize it
45 | :return:
46 | """
47 | model = BertPairLTR(config=self.config, num_train_step=num_train_step, num_warmup_step=num_warmup_step)
48 | return model
49 |
50 | def train(self):
51 | with tf.Session() as sess:
52 | tvars = tf.trainable_variables()
53 | (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
54 | tvars, self.__bert_checkpoint_path)
55 | print("init bert model params")
56 | tf.train.init_from_checkpoint(self.__bert_checkpoint_path, assignment_map)
57 | print("init bert model params done")
58 | sess.run(tf.variables_initializer(tf.global_variables()))
59 |
60 | current_step = 0
61 |
62 | for epoch in range(self.config["epochs"]):
63 | print("----- Epoch {}/{} -----".format(epoch + 1, self.config["epochs"]))
64 | t_in_ids_a, t_in_masks_a, t_seg_ids_a, t_in_ids_b, t_in_masks_b, t_seg_ids_b = \
65 | self.data_obj.gen_task_samples(self.queries, self.config["train_n_tasks"])
66 |
67 | for batch in self.data_obj.next_batch(t_in_ids_a, t_in_masks_a, t_seg_ids_a,
68 | t_in_ids_b, t_in_masks_b, t_seg_ids_b):
69 | loss, predictions = self.model.train(sess, batch)
70 | acc = accuracy(predictions)
71 | print("train: step: {}, loss: {}, acc: {}".format(current_step, loss, acc))
72 |
73 | current_step += 1
74 | if self.data_obj and current_step % self.config["checkpoint_every"] == 0:
75 | e_in_ids_a, e_in_masks_a, e_seg_ids_a, e_in_ids_b, e_in_masks_b, e_seg_ids_b = \
76 | self.data_obj.gen_task_samples(self.queries, self.config["eval_n_tasks"])
77 | eval_losses = []
78 | eval_accs = []
79 |
80 | for eval_batch in self.data_obj.next_batch(e_in_ids_a, e_in_masks_a, e_seg_ids_a,
81 | e_in_ids_b, e_in_masks_b, e_seg_ids_b):
82 | eval_loss, eval_predictions = self.model.eval(sess, eval_batch)
83 |
84 | eval_losses.append(eval_loss)
85 |
86 | acc = accuracy(eval_predictions)
87 | eval_accs.append(acc)
88 |
89 | print("\n")
90 | print("eval: loss: {}, acc: {}".format(mean(eval_losses), mean(eval_accs)))
91 | print("\n")
92 |
93 | if self.config["ckpt_model_path"]:
94 | save_path = self.config["ckpt_model_path"]
95 | if not os.path.exists(save_path):
96 | os.makedirs(save_path)
97 | model_save_path = os.path.join(save_path, self.config["model_name"])
98 | self.model.saver.save(sess, model_save_path, global_step=current_step)
99 |
100 |
101 | if __name__ == "__main__":
102 |     # parse the command-line arguments
103 | parser = argparse.ArgumentParser()
104 | parser.add_argument("--config_path", help="config path of model")
105 | args = parser.parse_args()
106 | trainer = Trainer(args)
107 | trainer.train()
108 |
--------------------------------------------------------------------------------
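
The trainer above is launched with a config path, e.g. python trainer.py --config_path=config.json. The warmup schedule is derived from the config exactly as in __init__; with made-up numbers:

```python
config = {"train_n_tasks": 10000, "batch_size": 32, "epochs": 3, "warmup_rate": 0.1}

num_train_steps = int(config["train_n_tasks"] / config["batch_size"] * config["epochs"])
num_warmup_steps = int(num_train_steps * config["warmup_rate"])
print(num_train_steps, num_warmup_steps)  # 937 93
```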
/albert_task/ltr_pair_task/trainer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import random
4 | import argparse
5 | import sys
6 |
7 | sys.path.append(os.path.dirname(os.getcwd()))
8 | import tensorflow as tf
9 | from albert import modeling
10 | from model import ALBertPairLTR
11 | from data_helper import TrainData
12 | from metrics import mean, accuracy
13 |
14 |
15 | class Trainer(object):
16 | def __init__(self, args):
17 | self.args = args
18 | with open(args.config_path, "r") as fr:
19 | self.config = json.load(fr)
20 | self.__bert_checkpoint_path = os.path.join(self.config["bert_model_path"], "albert_model.ckpt")
21 |
22 |         # load the dataset
23 | self.data_obj = self.load_data()
24 | self.queries = self.data_obj.gen_data(self.config["data"])
25 |
26 | print("train data size: {}".format(len(self.queries)))
27 |
28 | num_train_steps = int(self.config["train_n_tasks"] / self.config["batch_size"] * self.config["epochs"])
29 | num_warmup_steps = int(num_train_steps * self.config["warmup_rate"])
30 |         # initialize the model object
31 | self.model = self.create_model(num_train_steps, num_warmup_steps)
32 |
33 | def load_data(self):
34 | """
35 |         Create the data object
36 |         :return:
37 |         """
38 |         # create the training-data object used to generate training data
39 | data_obj = TrainData(self.config)
40 | return data_obj
41 |
42 | def create_model(self, num_train_step, num_warmup_step):
43 | """
44 |         Select the corresponding model according to the config file and initialize it
45 | :return:
46 | """
47 | model = ALBertPairLTR(config=self.config, num_train_step=num_train_step, num_warmup_step=num_warmup_step)
48 | return model
49 |
50 | def train(self):
51 | with tf.Session() as sess:
52 | tvars = tf.trainable_variables()
53 | (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
54 | tvars, self.__bert_checkpoint_path)
55 | print("init bert model params")
56 | tf.train.init_from_checkpoint(self.__bert_checkpoint_path, assignment_map)
57 | print("init bert model params done")
58 | sess.run(tf.variables_initializer(tf.global_variables()))
59 |
60 | current_step = 0
61 |
62 | for epoch in range(self.config["epochs"]):
63 | print("----- Epoch {}/{} -----".format(epoch + 1, self.config["epochs"]))
64 | t_in_ids_a, t_in_masks_a, t_seg_ids_a, t_in_ids_b, t_in_masks_b, t_seg_ids_b = \
65 | self.data_obj.gen_task_samples(self.queries, self.config["train_n_tasks"])
66 |
67 | for batch in self.data_obj.next_batch(t_in_ids_a, t_in_masks_a, t_seg_ids_a,
68 | t_in_ids_b, t_in_masks_b, t_seg_ids_b):
69 | loss, predictions = self.model.train(sess, batch)
70 | acc = accuracy(predictions)
71 | print("train: step: {}, loss: {}, acc: {}".format(current_step, loss, acc))
72 |
73 | current_step += 1
74 | if self.data_obj and current_step % self.config["checkpoint_every"] == 0:
75 | e_in_ids_a, e_in_masks_a, e_seg_ids_a, e_in_ids_b, e_in_masks_b, e_seg_ids_b = \
76 | self.data_obj.gen_task_samples(self.queries, self.config["eval_n_tasks"])
77 | eval_losses = []
78 | eval_accs = []
79 |
80 | for eval_batch in self.data_obj.next_batch(e_in_ids_a, e_in_masks_a, e_seg_ids_a,
81 | e_in_ids_b, e_in_masks_b, e_seg_ids_b):
82 | eval_loss, eval_predictions = self.model.eval(sess, eval_batch)
83 |
84 | eval_losses.append(eval_loss)
85 |
86 | acc = accuracy(eval_predictions)
87 | eval_accs.append(acc)
88 |
89 | print("\n")
90 | print("eval: loss: {}, acc: {}".format(mean(eval_losses), mean(eval_accs)))
91 | print("\n")
92 |
93 | if self.config["ckpt_model_path"]:
94 | save_path = self.config["ckpt_model_path"]
95 | if not os.path.exists(save_path):
96 | os.makedirs(save_path)
97 | model_save_path = os.path.join(save_path, self.config["model_name"])
98 | self.model.saver.save(sess, model_save_path, global_step=current_step)
99 |
100 |
101 | if __name__ == "__main__":
102 |     # parse the command-line arguments
103 | parser = argparse.ArgumentParser()
104 | parser.add_argument("--config_path", help="config path of model")
105 | args = parser.parse_args()
106 | trainer = Trainer(args)
107 | trainer.train()
108 |
--------------------------------------------------------------------------------
/albert_task/ltr_point_task/metrics.py:
--------------------------------------------------------------------------------
1 | """
2 | Define various performance metrics
3 | """
4 | from sklearn.metrics import roc_auc_score
5 |
6 |
7 | def mean(item: list) -> float:
8 | """
9 |     Compute the mean of the elements in a list
10 |     :param item: list object
11 | :return:
12 | """
13 | res = sum(item) / len(item) if len(item) > 0 else 0
14 | return res
15 |
16 |
17 | def accuracy(pred_y, true_y):
18 | """
19 |     Compute accuracy for binary and multi-class classification
20 |     :param pred_y: predictions
21 |     :param true_y: ground truth
22 | :return:
23 | """
24 | if isinstance(pred_y[0], list):
25 | pred_y = [item[0] for item in pred_y]
26 | corr = 0
27 | for i in range(len(pred_y)):
28 | if pred_y[i] == true_y[i]:
29 | corr += 1
30 | acc = corr / len(pred_y) if len(pred_y) > 0 else 0
31 | return acc
32 |
33 |
34 | def binary_auc(pred_y, true_y):
35 | """
36 |     AUC for binary classification
37 |     :param pred_y: predictions
38 |     :param true_y: ground truth
39 | :return:
40 | """
41 | auc = roc_auc_score(true_y, pred_y)
42 | return auc
43 |
44 |
45 | def binary_precision(pred_y, true_y, positive=1):
46 | """
47 |     Precision for binary classification
48 |     :param pred_y: predictions
49 |     :param true_y: ground truth
50 |     :param positive: index representing the positive class
51 | :return:
52 | """
53 | corr = 0
54 | pred_corr = 0
55 | for i in range(len(pred_y)):
56 | if pred_y[i] == positive:
57 | pred_corr += 1
58 | if pred_y[i] == true_y[i]:
59 | corr += 1
60 |
61 | prec = corr / pred_corr if pred_corr > 0 else 0
62 | return prec
63 |
64 |
65 | def binary_recall(pred_y, true_y, positive=1):
66 | """
67 |     Recall for binary classification
68 |     :param pred_y: predictions
69 |     :param true_y: ground truth
70 |     :param positive: index representing the positive class
71 | :return:
72 | """
73 | corr = 0
74 | true_corr = 0
75 | for i in range(len(pred_y)):
76 | if true_y[i] == positive:
77 | true_corr += 1
78 | if pred_y[i] == true_y[i]:
79 | corr += 1
80 |
81 | rec = corr / true_corr if true_corr > 0 else 0
82 | return rec
83 |
84 |
85 | def binary_f_beta(pred_y, true_y, beta=1.0, positive=1):
86 | """
87 |     F-beta for binary classification
88 |     :param pred_y: predictions
89 |     :param true_y: ground truth
90 |     :param beta: beta value
91 |     :param positive: index representing the positive class
92 | :return:
93 | """
94 | precision = binary_precision(pred_y, true_y, positive)
95 | recall = binary_recall(pred_y, true_y, positive)
96 | try:
97 | f_b = (1 + beta * beta) * precision * recall / (beta * beta * precision + recall)
98 |     except ZeroDivisionError:
99 | f_b = 0
100 | return f_b
101 |
102 |
103 | def multi_precision(pred_y, true_y, labels):
104 | """
105 |     Precision for multi-class classification
106 |     :param pred_y: predictions
107 |     :param true_y: ground truth
108 |     :param labels: list of labels
109 | :return:
110 | """
111 | if isinstance(pred_y[0], list):
112 | pred_y = [item[0] for item in pred_y]
113 |
114 | precisions = [binary_precision(pred_y, true_y, label) for label in labels]
115 | prec = mean(precisions)
116 | return prec
117 |
118 |
119 | def multi_recall(pred_y, true_y, labels):
120 | """
121 |     Recall for multi-class classification
122 |     :param pred_y: predictions
123 |     :param true_y: ground truth
124 |     :param labels: list of labels
125 | :return:
126 | """
127 | if isinstance(pred_y[0], list):
128 | pred_y = [item[0] for item in pred_y]
129 |
130 | recalls = [binary_recall(pred_y, true_y, label) for label in labels]
131 | rec = mean(recalls)
132 | return rec
133 |
134 |
135 | def multi_f_beta(pred_y, true_y, labels, beta=1.0):
136 | """
137 |     F-beta for multi-class classification
138 |     :param pred_y: predictions
139 |     :param true_y: ground truth
140 |     :param labels: list of labels
141 |     :param beta: beta value
142 | :return:
143 | """
144 | if isinstance(pred_y[0], list):
145 | pred_y = [item[0] for item in pred_y]
146 |
147 | f_betas = [binary_f_beta(pred_y, true_y, beta, label) for label in labels]
148 | f_beta = mean(f_betas)
149 | return f_beta
150 |
151 |
152 | def get_binary_metrics(pred_y, true_y, f_beta=1.0):
153 | """
154 |     Get the performance metrics for binary classification
155 | :param pred_y:
156 | :param true_y:
157 | :param f_beta:
158 | :return:
159 | """
160 | acc = accuracy(pred_y, true_y)
161 | auc = binary_auc(pred_y, true_y)
162 | recall = binary_recall(pred_y, true_y)
163 | precision = binary_precision(pred_y, true_y)
164 | f_beta = binary_f_beta(pred_y, true_y, f_beta)
165 | return acc, auc, recall, precision, f_beta
166 |
167 |
168 | def get_multi_metrics(pred_y, true_y, labels, f_beta=1.0):
169 | """
170 |     Get the performance metrics for multi-class classification
171 | :param pred_y:
172 | :param true_y:
173 | :param labels:
174 | :param f_beta:
175 | :return:
176 | """
177 | acc = accuracy(pred_y, true_y)
178 | recall = multi_recall(pred_y, true_y, labels)
179 | precision = multi_precision(pred_y, true_y, labels)
180 | f_beta = multi_f_beta(pred_y, true_y, labels, f_beta)
181 | return acc, recall, precision, f_beta
--------------------------------------------------------------------------------
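
These metric helpers operate on plain Python lists; a small illustration with made-up predictions:

```python
from metrics import get_binary_metrics, get_multi_metrics

# binary: returns (acc, auc, recall, precision, f_beta); hard labels make the AUC coarse
print(get_binary_metrics(pred_y=[1, 0, 1, 1], true_y=[1, 0, 0, 1]))

# multi-class: precision/recall/f_beta are macro-averaged over the given labels
print(get_multi_metrics(pred_y=[0, 1, 2, 1, 0], true_y=[0, 1, 1, 1, 0], labels=[0, 1, 2]))
```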
/bert_task/classifier_task/metrics.py:
--------------------------------------------------------------------------------
1 | """
2 | Define various performance metrics
3 | """
4 | from sklearn.metrics import roc_auc_score
5 |
6 |
7 | def mean(item: list) -> float:
8 | """
9 |     Compute the mean of the elements in a list
10 |     :param item: list object
11 | :return:
12 | """
13 | res = sum(item) / len(item) if len(item) > 0 else 0
14 | return res
15 |
16 |
17 | def accuracy(pred_y, true_y):
18 | """
19 |     Compute accuracy for binary and multi-class classification
20 |     :param pred_y: predictions
21 |     :param true_y: ground truth
22 | :return:
23 | """
24 | if isinstance(pred_y[0], list):
25 | pred_y = [item[0] for item in pred_y]
26 | corr = 0
27 | for i in range(len(pred_y)):
28 | if pred_y[i] == true_y[i]:
29 | corr += 1
30 | acc = corr / len(pred_y) if len(pred_y) > 0 else 0
31 | return acc
32 |
33 |
34 | def binary_auc(pred_y, true_y):
35 | """
36 |     AUC for binary classification
37 |     :param pred_y: predictions
38 |     :param true_y: ground truth
39 | :return:
40 | """
41 | auc = roc_auc_score(true_y, pred_y)
42 | return auc
43 |
44 |
45 | def binary_precision(pred_y, true_y, positive=1):
46 | """
47 |     Precision for binary classification
48 |     :param pred_y: predictions
49 |     :param true_y: ground truth
50 |     :param positive: index representing the positive class
51 | :return:
52 | """
53 | corr = 0
54 | pred_corr = 0
55 | for i in range(len(pred_y)):
56 | if pred_y[i] == positive:
57 | pred_corr += 1
58 | if pred_y[i] == true_y[i]:
59 | corr += 1
60 |
61 | prec = corr / pred_corr if pred_corr > 0 else 0
62 | return prec
63 |
64 |
65 | def binary_recall(pred_y, true_y, positive=1):
66 | """
67 |     Recall for binary classification
68 |     :param pred_y: predictions
69 |     :param true_y: ground truth
70 |     :param positive: index representing the positive class
71 | :return:
72 | """
73 | corr = 0
74 | true_corr = 0
75 | for i in range(len(pred_y)):
76 | if true_y[i] == positive:
77 | true_corr += 1
78 | if pred_y[i] == true_y[i]:
79 | corr += 1
80 |
81 | rec = corr / true_corr if true_corr > 0 else 0
82 | return rec
83 |
84 |
85 | def binary_f_beta(pred_y, true_y, beta=1.0, positive=1):
86 | """
87 |     F-beta for binary classification
88 |     :param pred_y: predictions
89 |     :param true_y: ground truth
90 |     :param beta: beta value
91 |     :param positive: index representing the positive class
92 | :return:
93 | """
94 | precision = binary_precision(pred_y, true_y, positive)
95 | recall = binary_recall(pred_y, true_y, positive)
96 | try:
97 | f_b = (1 + beta * beta) * precision * recall / (beta * beta * precision + recall)
98 |     except ZeroDivisionError:
99 | f_b = 0
100 | return f_b
101 |
102 |
103 | def multi_precision(pred_y, true_y, labels):
104 | """
105 |     Precision for multi-class classification
106 |     :param pred_y: predictions
107 |     :param true_y: ground truth
108 |     :param labels: list of labels
109 | :return:
110 | """
111 | if isinstance(pred_y[0], list):
112 | pred_y = [item[0] for item in pred_y]
113 |
114 | precisions = [binary_precision(pred_y, true_y, label) for label in labels]
115 | prec = mean(precisions)
116 | return prec
117 |
118 |
119 | def multi_recall(pred_y, true_y, labels):
120 | """
121 |     Recall for multi-class classification
122 |     :param pred_y: predictions
123 |     :param true_y: ground truth
124 |     :param labels: list of labels
125 | :return:
126 | """
127 | if isinstance(pred_y[0], list):
128 | pred_y = [item[0] for item in pred_y]
129 |
130 | recalls = [binary_recall(pred_y, true_y, label) for label in labels]
131 | rec = mean(recalls)
132 | return rec
133 |
134 |
135 | def multi_f_beta(pred_y, true_y, labels, beta=1.0):
136 | """
137 |     F-beta for multi-class classification
138 |     :param pred_y: predictions
139 |     :param true_y: ground truth
140 |     :param labels: list of labels
141 |     :param beta: beta value
142 | :return:
143 | """
144 | if isinstance(pred_y[0], list):
145 | pred_y = [item[0] for item in pred_y]
146 |
147 | f_betas = [binary_f_beta(pred_y, true_y, beta, label) for label in labels]
148 | f_beta = mean(f_betas)
149 | return f_beta
150 |
151 |
152 | def get_binary_metrics(pred_y, true_y, f_beta=1.0):
153 | """
154 |     Get the performance metrics for binary classification
155 | :param pred_y:
156 | :param true_y:
157 | :param f_beta:
158 | :return:
159 | """
160 | acc = accuracy(pred_y, true_y)
161 | auc = binary_auc(pred_y, true_y)
162 | recall = binary_recall(pred_y, true_y)
163 | precision = binary_precision(pred_y, true_y)
164 | f_beta = binary_f_beta(pred_y, true_y, f_beta)
165 | return acc, auc, recall, precision, f_beta
166 |
167 |
168 | def get_multi_metrics(pred_y, true_y, labels, f_beta=1.0):
169 | """
170 |     Get the performance metrics for multi-class classification
171 | :param pred_y:
172 | :param true_y:
173 | :param labels:
174 | :param f_beta:
175 | :return:
176 | """
177 | acc = accuracy(pred_y, true_y)
178 | recall = multi_recall(pred_y, true_y, labels)
179 | precision = multi_precision(pred_y, true_y, labels)
180 | f_beta = multi_f_beta(pred_y, true_y, labels, f_beta)
181 | return acc, recall, precision, f_beta
--------------------------------------------------------------------------------
/bert_task/ltr_point_task/metrics.py:
--------------------------------------------------------------------------------
1 | """
2 | Define various performance metrics
3 | """
4 | from sklearn.metrics import roc_auc_score
5 |
6 |
7 | def mean(item: list) -> float:
8 | """
9 |     Compute the mean of the elements in a list
10 |     :param item: list object
11 | :return:
12 | """
13 | res = sum(item) / len(item) if len(item) > 0 else 0
14 | return res
15 |
16 |
17 | def accuracy(pred_y, true_y):
18 | """
19 |     Compute accuracy for binary and multi-class classification
20 |     :param pred_y: predictions
21 |     :param true_y: ground truth
22 | :return:
23 | """
24 | if isinstance(pred_y[0], list):
25 | pred_y = [item[0] for item in pred_y]
26 | corr = 0
27 | for i in range(len(pred_y)):
28 | if pred_y[i] == true_y[i]:
29 | corr += 1
30 | acc = corr / len(pred_y) if len(pred_y) > 0 else 0
31 | return acc
32 |
33 |
34 | def binary_auc(pred_y, true_y):
35 | """
36 |     AUC for binary classification
37 |     :param pred_y: predictions
38 |     :param true_y: ground truth
39 | :return:
40 | """
41 | auc = roc_auc_score(true_y, pred_y)
42 | return auc
43 |
44 |
45 | def binary_precision(pred_y, true_y, positive=1):
46 | """
47 |     Precision for binary classification
48 |     :param pred_y: predictions
49 |     :param true_y: ground truth
50 |     :param positive: index representing the positive class
51 | :return:
52 | """
53 | corr = 0
54 | pred_corr = 0
55 | for i in range(len(pred_y)):
56 | if pred_y[i] == positive:
57 | pred_corr += 1
58 | if pred_y[i] == true_y[i]:
59 | corr += 1
60 |
61 | prec = corr / pred_corr if pred_corr > 0 else 0
62 | return prec
63 |
64 |
65 | def binary_recall(pred_y, true_y, positive=1):
66 | """
67 |     Recall for binary classification
68 |     :param pred_y: predictions
69 |     :param true_y: ground truth
70 |     :param positive: index representing the positive class
71 | :return:
72 | """
73 | corr = 0
74 | true_corr = 0
75 | for i in range(len(pred_y)):
76 | if true_y[i] == positive:
77 | true_corr += 1
78 | if pred_y[i] == true_y[i]:
79 | corr += 1
80 |
81 | rec = corr / true_corr if true_corr > 0 else 0
82 | return rec
83 |
84 |
85 | def binary_f_beta(pred_y, true_y, beta=1.0, positive=1):
86 | """
87 |     F-beta for binary classification
88 |     :param pred_y: predictions
89 |     :param true_y: ground truth
90 |     :param beta: beta value
91 |     :param positive: index representing the positive class
92 | :return:
93 | """
94 | precision = binary_precision(pred_y, true_y, positive)
95 | recall = binary_recall(pred_y, true_y, positive)
96 | try:
97 | f_b = (1 + beta * beta) * precision * recall / (beta * beta * precision + recall)
98 |     except ZeroDivisionError:
99 | f_b = 0
100 | return f_b
101 |
102 |
103 | def multi_precision(pred_y, true_y, labels):
104 | """
105 | Macro-averaged precision for multi-class classification.
106 | :param pred_y: predicted labels
107 | :param true_y: ground-truth labels
108 | :param labels: list of labels
109 | :return:
110 | """
111 | if isinstance(pred_y[0], list):
112 | pred_y = [item[0] for item in pred_y]
113 |
114 | precisions = [binary_precision(pred_y, true_y, label) for label in labels]
115 | prec = mean(precisions)
116 | return prec
117 |
118 |
119 | def multi_recall(pred_y, true_y, labels):
120 | """
121 | Macro-averaged recall for multi-class classification.
122 | :param pred_y: predicted labels
123 | :param true_y: ground-truth labels
124 | :param labels: list of labels
125 | :return:
126 | """
127 | if isinstance(pred_y[0], list):
128 | pred_y = [item[0] for item in pred_y]
129 |
130 | recalls = [binary_recall(pred_y, true_y, label) for label in labels]
131 | rec = mean(recalls)
132 | return rec
133 |
134 |
135 | def multi_f_beta(pred_y, true_y, labels, beta=1.0):
136 | """
137 | Macro-averaged F-beta score for multi-class classification.
138 | :param pred_y: predicted labels
139 | :param true_y: ground-truth labels
140 | :param labels: list of labels
141 | :param beta: beta value
142 | :return:
143 | """
144 | if isinstance(pred_y[0], list):
145 | pred_y = [item[0] for item in pred_y]
146 |
147 | f_betas = [binary_f_beta(pred_y, true_y, beta, label) for label in labels]
148 | f_beta = mean(f_betas)
149 | return f_beta
150 |
151 |
152 | def get_binary_metrics(pred_y, true_y, f_beta=1.0):
153 | """
154 | Compute binary classification performance metrics.
155 | :param pred_y:
156 | :param true_y:
157 | :param f_beta:
158 | :return:
159 | """
160 | acc = accuracy(pred_y, true_y)
161 | auc = binary_auc(pred_y, true_y)
162 | recall = binary_recall(pred_y, true_y)
163 | precision = binary_precision(pred_y, true_y)
164 | f_beta = binary_f_beta(pred_y, true_y, f_beta)
165 | return acc, auc, recall, precision, f_beta
166 |
167 |
168 | def get_multi_metrics(pred_y, true_y, labels, f_beta=1.0):
169 | """
170 | Compute multi-class classification performance metrics.
171 | :param pred_y:
172 | :param true_y:
173 | :param labels:
174 | :param f_beta:
175 | :return:
176 | """
177 | acc = accuracy(pred_y, true_y)
178 | recall = multi_recall(pred_y, true_y, labels)
179 | precision = multi_precision(pred_y, true_y, labels)
180 | f_beta = multi_f_beta(pred_y, true_y, labels, f_beta)
181 | return acc, recall, precision, f_beta
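182 |
183 |
184 | # Usage sketch (illustrative, with made-up toy labels) showing the expected
185 | # input format of the two aggregate helpers:
186 | if __name__ == "__main__":
187 | pred = [1, 0, 1, 1]
188 | true = [1, 0, 0, 1]
189 | print(get_binary_metrics(pred, true)) # acc, auc, recall, precision, f_beta
190 | print(get_multi_metrics(pred, true, labels=[0, 1])) # macro-averaged variants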
--------------------------------------------------------------------------------
/bert_task/sentence_pair_task/metrics.py:
--------------------------------------------------------------------------------
1 | """
2 | Define various performance metrics.
3 | """
4 | from sklearn.metrics import roc_auc_score
5 |
6 |
7 | def mean(item: list) -> float:
8 | """
9 | Compute the mean of the elements in a list.
10 | :param item: a list of numbers
11 | :return:
12 | """
13 | res = sum(item) / len(item) if len(item) > 0 else 0
14 | return res
15 |
16 |
17 | def accuracy(pred_y, true_y):
18 | """
19 | Compute accuracy for binary and multi-class classification.
20 | :param pred_y: predicted labels
21 | :param true_y: ground-truth labels
22 | :return:
23 | """
24 | if isinstance(pred_y[0], list):
25 | pred_y = [item[0] for item in pred_y]
26 | corr = 0
27 | for i in range(len(pred_y)):
28 | if pred_y[i] == true_y[i]:
29 | corr += 1
30 | acc = corr / len(pred_y) if len(pred_y) > 0 else 0
31 | return acc
32 |
33 |
34 | def binary_auc(pred_y, true_y):
35 | """
36 | AUC for binary classification.
37 | :param pred_y: predicted labels
38 | :param true_y: ground-truth labels
39 | :return:
40 | """
41 | auc = roc_auc_score(true_y, pred_y)
42 | return auc
43 |
44 |
45 | def binary_precision(pred_y, true_y, positive=1):
46 | """
47 | Precision for binary classification.
48 | :param pred_y: predicted labels
49 | :param true_y: ground-truth labels
50 | :param positive: index of the positive class
51 | :return:
52 | """
53 | corr = 0
54 | pred_corr = 0
55 | for i in range(len(pred_y)):
56 | if pred_y[i] == positive:
57 | pred_corr += 1
58 | if pred_y[i] == true_y[i]:
59 | corr += 1
60 |
61 | prec = corr / pred_corr if pred_corr > 0 else 0
62 | return prec
63 |
64 |
65 | def binary_recall(pred_y, true_y, positive=1):
66 | """
67 | Recall for binary classification.
68 | :param pred_y: predicted labels
69 | :param true_y: ground-truth labels
70 | :param positive: index of the positive class
71 | :return:
72 | """
73 | corr = 0
74 | true_corr = 0
75 | for i in range(len(pred_y)):
76 | if true_y[i] == positive:
77 | true_corr += 1
78 | if pred_y[i] == true_y[i]:
79 | corr += 1
80 |
81 | rec = corr / true_corr if true_corr > 0 else 0
82 | return rec
83 |
84 |
85 | def binary_f_beta(pred_y, true_y, beta=1.0, positive=1):
86 | """
87 | F-beta score for binary classification.
88 | :param pred_y: predicted labels
89 | :param true_y: ground-truth labels
90 | :param beta: beta value
91 | :param positive: index of the positive class
92 | :return:
93 | """
94 | precision = binary_precision(pred_y, true_y, positive)
95 | recall = binary_recall(pred_y, true_y, positive)
96 | try:
97 | f_b = (1 + beta * beta) * precision * recall / (beta * beta * precision + recall)
98 | except ZeroDivisionError: # precision and recall are both zero
99 | f_b = 0
100 | return f_b
101 |
102 |
103 | def multi_precision(pred_y, true_y, labels):
104 | """
105 | Macro-averaged precision for multi-class classification.
106 | :param pred_y: predicted labels
107 | :param true_y: ground-truth labels
108 | :param labels: list of labels
109 | :return:
110 | """
111 | if isinstance(pred_y[0], list):
112 | pred_y = [item[0] for item in pred_y]
113 |
114 | precisions = [binary_precision(pred_y, true_y, label) for label in labels]
115 | prec = mean(precisions)
116 | return prec
117 |
118 |
119 | def multi_recall(pred_y, true_y, labels):
120 | """
121 | Macro-averaged recall for multi-class classification.
122 | :param pred_y: predicted labels
123 | :param true_y: ground-truth labels
124 | :param labels: list of labels
125 | :return:
126 | """
127 | if isinstance(pred_y[0], list):
128 | pred_y = [item[0] for item in pred_y]
129 |
130 | recalls = [binary_recall(pred_y, true_y, label) for label in labels]
131 | rec = mean(recalls)
132 | return rec
133 |
134 |
135 | def multi_f_beta(pred_y, true_y, labels, beta=1.0):
136 | """
137 | Macro-averaged F-beta score for multi-class classification.
138 | :param pred_y: predicted labels
139 | :param true_y: ground-truth labels
140 | :param labels: list of labels
141 | :param beta: beta value
142 | :return:
143 | """
144 | if isinstance(pred_y[0], list):
145 | pred_y = [item[0] for item in pred_y]
146 |
147 | f_betas = [binary_f_beta(pred_y, true_y, beta, label) for label in labels]
148 | f_beta = mean(f_betas)
149 | return f_beta
150 |
151 |
152 | def get_binary_metrics(pred_y, true_y, f_beta=1.0):
153 | """
154 | Compute binary classification performance metrics.
155 | :param pred_y:
156 | :param true_y:
157 | :param f_beta:
158 | :return:
159 | """
160 | acc = accuracy(pred_y, true_y)
161 | auc = binary_auc(pred_y, true_y)
162 | recall = binary_recall(pred_y, true_y)
163 | precision = binary_precision(pred_y, true_y)
164 | f_beta = binary_f_beta(pred_y, true_y, f_beta)
165 | return acc, auc, recall, precision, f_beta
166 |
167 |
168 | def get_multi_metrics(pred_y, true_y, labels, f_beta=1.0):
169 | """
170 | Compute multi-class classification performance metrics.
171 | :param pred_y:
172 | :param true_y:
173 | :param labels:
174 | :param f_beta:
175 | :return:
176 | """
177 | acc = accuracy(pred_y, true_y)
178 | recall = multi_recall(pred_y, true_y, labels)
179 | precision = multi_precision(pred_y, true_y, labels)
180 | f_beta = multi_f_beta(pred_y, true_y, labels, f_beta)
181 | return acc, recall, precision, f_beta
--------------------------------------------------------------------------------
/albert_task/classifier_task/model.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | sys.path.append(os.path.dirname(os.getcwd()))
4 |
5 | import tensorflow as tf
6 |
7 | from albert import modeling
8 | from albert import optimization_finetuning as optimization
9 |
10 |
11 | class AlbertClassifier(object):
12 | def __init__(self, config, is_training=True, num_train_step=None, num_warmup_step=None):
13 | self.__bert_config_path = os.path.join(config["bert_model_path"], "albert_config.json")
14 | self.__num_classes = config["num_classes"]
15 | self.__learning_rate = config["learning_rate"]
16 | self.__is_training = is_training
17 | self.__num_train_step = num_train_step
18 | self.__num_warmup_step = num_warmup_step
19 |
20 | self.config = config
21 |
22 | self.input_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_ids')
23 | self.input_masks = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_mask')
24 | self.segment_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name='segment_ids')
25 | self.label_ids = tf.placeholder(dtype=tf.int32, shape=[None], name="label_ids")
26 |
27 | self.built_model()
28 | self.init_saver()
29 |
30 | def built_model(self):
31 | bert_config = modeling.BertConfig.from_json_file(self.__bert_config_path)
32 |
33 | model = modeling.BertModel(config=bert_config,
34 | is_training=self.__is_training,
35 | input_ids=self.input_ids,
36 | input_mask=self.input_masks,
37 | token_type_ids=self.segment_ids,
38 | use_one_hot_embeddings=False)
39 | output_layer = model.get_pooled_output()
40 |
41 | hidden_size = output_layer.shape[-1].value
42 | if self.__is_training:
43 | # I.e., 0.1 dropout
44 | output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
45 |
46 | with tf.name_scope("output"):
47 | output_weights = tf.get_variable(
48 | "output_weights", [self.__num_classes, hidden_size],
49 | initializer=tf.truncated_normal_initializer(stddev=0.02))
50 |
51 | output_bias = tf.get_variable(
52 | "output_bias", [self.__num_classes], initializer=tf.zeros_initializer())
53 |
54 | logits = tf.matmul(output_layer, output_weights, transpose_b=True)
55 | logits = tf.nn.bias_add(logits, output_bias)
56 | self.predictions = tf.argmax(logits, axis=-1, name="predictions")
57 |
58 | if self.__is_training:
59 |
60 | with tf.name_scope("loss"):
61 | losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=self.label_ids)
62 | self.loss = tf.reduce_mean(losses, name="loss")
63 |
64 | with tf.name_scope('train_op'):
65 | self.train_op = optimization.create_optimizer(
66 | self.loss, self.__learning_rate, self.__num_train_step, self.__num_warmup_step, use_tpu=False)
67 |
68 | def init_saver(self):
69 | self.saver = tf.train.Saver(tf.global_variables())
70 |
71 | def train(self, sess, batch):
72 | """
73 | Train the model.
74 | :param sess: TensorFlow session object
75 | :param batch: batch of data
76 | :return: loss and predictions
77 | """
78 |
79 | feed_dict = {self.input_ids: batch["input_ids"],
80 | self.input_masks: batch["input_masks"],
81 | self.segment_ids: batch["segment_ids"],
82 | self.label_ids: batch["label_ids"]}
83 |
84 | # run one training step
85 | _, loss, predictions = sess.run([self.train_op, self.loss, self.predictions], feed_dict=feed_dict)
86 | return loss, predictions
87 |
88 | def eval(self, sess, batch):
89 | """
90 | Evaluate the model.
91 | :param sess: TensorFlow session object
92 | :param batch: batch of data
93 | :return: loss and predictions
94 | """
95 | feed_dict = {self.input_ids: batch["input_ids"],
96 | self.input_masks: batch["input_masks"],
97 | self.segment_ids: batch["segment_ids"],
98 | self.label_ids: batch["label_ids"]}
99 |
100 | loss, predictions = sess.run([self.loss, self.predictions], feed_dict=feed_dict)
101 | return loss, predictions
102 |
103 | def infer(self, sess, batch):
104 | """
105 | Predict on new data.
106 | :param sess: TensorFlow session object
107 | :param batch: batch of data
108 | :return: predictions
109 | """
110 | feed_dict = {self.input_ids: batch["input_ids"],
111 | self.input_masks: batch["input_masks"],
112 | self.segment_ids: batch["segment_ids"]}
113 |
114 | predict = sess.run(self.predictions, feed_dict=feed_dict)
115 |
116 | return predict
117 |
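118 | # Usage sketch (illustrative), assuming a minimal config with the keys read in
119 | # __init__ and a batch dict with the keys used in train(); bert_model_path must
120 | # contain albert_config.json:
121 | # config = {"bert_model_path": "albert_base/", "num_classes": 2, "learning_rate": 5e-5}
122 | # model = AlbertClassifier(config, is_training=True, num_train_step=1000, num_warmup_step=100)
123 | # with tf.Session() as sess:
124 | # sess.run(tf.global_variables_initializer())
125 | # loss, preds = model.train(sess, batch) # batch holds input_ids, input_masks, segment_ids, label_ids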
--------------------------------------------------------------------------------
/albert_task/classifier_task/metrics.py:
--------------------------------------------------------------------------------
1 | """
2 | Define various performance metrics.
3 | """
4 | from sklearn.metrics import roc_auc_score
5 |
6 |
7 | def mean(item: list) -> float:
8 | """
9 | Compute the mean of the elements in a list.
10 | :param item: a list of numbers
11 | :return:
12 | """
13 | res = sum(item) / len(item) if len(item) > 0 else 0
14 | return res
15 |
16 |
17 | def accuracy(pred_y, true_y):
18 | """
19 | Compute accuracy for binary and multi-class classification.
20 | :param pred_y: predicted labels
21 | :param true_y: ground-truth labels
22 | :return:
23 | """
24 | if isinstance(pred_y[0], list):
25 | pred_y = [item[0] for item in pred_y]
26 | corr = 0
27 | for i in range(len(pred_y)):
28 | if pred_y[i] == true_y[i]:
29 | corr += 1
30 | acc = corr / len(pred_y) if len(pred_y) > 0 else 0
31 | return acc
32 |
33 |
34 | def binary_auc(pred_y, true_y):
35 | """
36 | AUC for binary classification.
37 | :param pred_y: predicted labels
38 | :param true_y: ground-truth labels
39 | :return:
40 | """
41 | auc = roc_auc_score(true_y, pred_y)
42 | return auc
43 |
44 |
45 | def binary_precision(pred_y, true_y, positive=1):
46 | """
47 | Precision for binary classification.
48 | :param pred_y: predicted labels
49 | :param true_y: ground-truth labels
50 | :param positive: index of the positive class
51 | :return:
52 | """
53 | corr = 0
54 | pred_corr = 0
55 | for i in range(len(pred_y)):
56 | if pred_y[i] == positive:
57 | pred_corr += 1
58 | if pred_y[i] == true_y[i]:
59 | corr += 1
60 |
61 | prec = corr / pred_corr if pred_corr > 0 else 0
62 | return prec
63 |
64 |
65 | def binary_recall(pred_y, true_y, positive=1):
66 | """
67 | Recall for binary classification.
68 | :param pred_y: predicted labels
69 | :param true_y: ground-truth labels
70 | :param positive: index of the positive class
71 | :return:
72 | """
73 | corr = 0
74 | true_corr = 0
75 | for i in range(len(pred_y)):
76 | if true_y[i] == positive:
77 | true_corr += 1
78 | if pred_y[i] == true_y[i]:
79 | corr += 1
80 |
81 | rec = corr / true_corr if true_corr > 0 else 0
82 | return rec
83 |
84 |
85 | def binary_f_beta(pred_y, true_y, beta=1.0, positive=1):
86 | """
87 | F-beta score for binary classification.
88 | :param pred_y: predicted labels
89 | :param true_y: ground-truth labels
90 | :param beta: beta value
91 | :param positive: index of the positive class
92 | :return:
93 | """
94 | precision = binary_precision(pred_y, true_y, positive)
95 | recall = binary_recall(pred_y, true_y, positive)
96 | try:
97 | f_b = (1 + beta * beta) * precision * recall / (beta * beta * precision + recall)
98 | except ZeroDivisionError: # precision and recall are both zero
99 | f_b = 0
100 | return f_b
101 |
102 |
103 | def multi_precision(pred_y, true_y, labels):
104 | """
105 | Macro-averaged precision for multi-class classification.
106 | :param pred_y: predicted labels
107 | :param true_y: ground-truth labels
108 | :param labels: list of labels
109 | :return:
110 | """
111 | if isinstance(pred_y[0], list):
112 | pred_y = [item[0] for item in pred_y]
113 |
114 | precisions = [binary_precision(pred_y, true_y, label) for label in labels]
115 | prec = mean(precisions)
116 | return prec
117 |
118 |
119 | def multi_recall(pred_y, true_y, labels):
120 | """
121 | Macro-averaged recall for multi-class classification.
122 | :param pred_y: predicted labels
123 | :param true_y: ground-truth labels
124 | :param labels: list of labels
125 | :return:
126 | """
127 | if isinstance(pred_y[0], list):
128 | pred_y = [item[0] for item in pred_y]
129 |
130 | recalls = [binary_recall(pred_y, true_y, label) for label in labels]
131 | rec = mean(recalls)
132 | return rec
133 |
134 |
135 | def multi_f_beta(pred_y, true_y, labels, beta=1.0):
136 | """
137 | Macro-averaged F-beta score for multi-class classification.
138 | :param pred_y: predicted labels
139 | :param true_y: ground-truth labels
140 | :param labels: list of labels
141 | :param beta: beta value
142 | :return:
143 | """
144 | if isinstance(pred_y[0], list):
145 | pred_y = [item[0] for item in pred_y]
146 |
147 | f_betas = [binary_f_beta(pred_y, true_y, beta, label) for label in labels]
148 | f_beta = mean(f_betas)
149 | return f_beta
150 |
151 |
152 | def get_binary_metrics(pred_y, true_y, f_beta=1.0):
153 | """
154 | Compute binary classification performance metrics.
155 | :param pred_y:
156 | :param true_y:
157 | :param f_beta:
158 | :return:
159 | """
160 | pred_y = pred_y.tolist()
161 | acc = accuracy(pred_y, true_y)
162 | auc = binary_auc(pred_y, true_y)
163 | recall = binary_recall(pred_y, true_y)
164 | precision = binary_precision(pred_y, true_y)
165 | f_beta = binary_f_beta(pred_y, true_y, f_beta)
166 | return acc, auc, recall, precision, f_beta
167 |
168 |
169 | def get_multi_metrics(pred_y, true_y, labels, f_beta=1.0):
170 | """
171 | Compute multi-class classification performance metrics.
172 | :param pred_y:
173 | :param true_y:
174 | :param labels:
175 | :param f_beta:
176 | :return:
177 | """
178 | pred_y = pred_y.tolist()
179 | acc = accuracy(pred_y, true_y)
180 | recall = multi_recall(pred_y, true_y, labels)
181 | precision = multi_precision(pred_y, true_y, labels)
182 | f_beta = multi_f_beta(pred_y, true_y, labels, f_beta)
183 | return acc, recall, precision, f_beta
--------------------------------------------------------------------------------
/albert_task/sentence_pair_task/metrics.py:
--------------------------------------------------------------------------------
1 | """
2 | Define various performance metrics.
3 | """
4 | from sklearn.metrics import roc_auc_score
5 |
6 |
7 | def mean(item: list) -> float:
8 | """
9 | Compute the mean of the elements in a list.
10 | :param item: a list of numbers
11 | :return:
12 | """
13 | res = sum(item) / len(item) if len(item) > 0 else 0
14 | return res
15 |
16 |
17 | def accuracy(pred_y, true_y):
18 | """
19 | Compute accuracy for binary and multi-class classification.
20 | :param pred_y: predicted labels
21 | :param true_y: ground-truth labels
22 | :return:
23 | """
24 |
25 | if isinstance(pred_y[0], list):
26 | pred_y = [item[0] for item in pred_y]
27 | corr = 0
28 | for i in range(len(pred_y)):
29 | if pred_y[i] == true_y[i]:
30 | corr += 1
31 | acc = corr / len(pred_y) if len(pred_y) > 0 else 0
32 | return acc
33 |
34 |
35 | def binary_auc(pred_y, true_y):
36 | """
37 | AUC for binary classification.
38 | :param pred_y: predicted labels
39 | :param true_y: ground-truth labels
40 | :return:
41 | """
42 | auc = roc_auc_score(true_y, pred_y)
43 | return auc
44 |
45 |
46 | def binary_precision(pred_y, true_y, positive=1):
47 | """
48 | Precision for binary classification.
49 | :param pred_y: predicted labels
50 | :param true_y: ground-truth labels
51 | :param positive: index of the positive class
52 | :return:
53 | """
54 | corr = 0
55 | pred_corr = 0
56 | for i in range(len(pred_y)):
57 | if pred_y[i] == positive:
58 | pred_corr += 1
59 | if pred_y[i] == true_y[i]:
60 | corr += 1
61 |
62 | prec = corr / pred_corr if pred_corr > 0 else 0
63 | return prec
64 |
65 |
66 | def binary_recall(pred_y, true_y, positive=1):
67 | """
68 | Recall for binary classification.
69 | :param pred_y: predicted labels
70 | :param true_y: ground-truth labels
71 | :param positive: index of the positive class
72 | :return:
73 | """
74 | corr = 0
75 | true_corr = 0
76 | for i in range(len(pred_y)):
77 | if true_y[i] == positive:
78 | true_corr += 1
79 | if pred_y[i] == true_y[i]:
80 | corr += 1
81 |
82 | rec = corr / true_corr if true_corr > 0 else 0
83 | return rec
84 |
85 |
86 | def binary_f_beta(pred_y, true_y, beta=1.0, positive=1):
87 | """
88 | F-beta score for binary classification.
89 | :param pred_y: predicted labels
90 | :param true_y: ground-truth labels
91 | :param beta: beta value
92 | :param positive: index of the positive class
93 | :return:
94 | """
95 | precision = binary_precision(pred_y, true_y, positive)
96 | recall = binary_recall(pred_y, true_y, positive)
97 | try:
98 | f_b = (1 + beta * beta) * precision * recall / (beta * beta * precision + recall)
99 | except ZeroDivisionError: # precision and recall are both zero
100 | f_b = 0
101 | return f_b
102 |
103 |
104 | def multi_precision(pred_y, true_y, labels):
105 | """
106 | Macro-averaged precision for multi-class classification.
107 | :param pred_y: predicted labels
108 | :param true_y: ground-truth labels
109 | :param labels: list of labels
110 | :return:
111 | """
112 | if isinstance(pred_y[0], list):
113 | pred_y = [item[0] for item in pred_y]
114 |
115 | precisions = [binary_precision(pred_y, true_y, label) for label in labels]
116 | prec = mean(precisions)
117 | return prec
118 |
119 |
120 | def multi_recall(pred_y, true_y, labels):
121 | """
122 | Macro-averaged recall for multi-class classification.
123 | :param pred_y: predicted labels
124 | :param true_y: ground-truth labels
125 | :param labels: list of labels
126 | :return:
127 | """
128 | if isinstance(pred_y[0], list):
129 | pred_y = [item[0] for item in pred_y]
130 |
131 | recalls = [binary_recall(pred_y, true_y, label) for label in labels]
132 | rec = mean(recalls)
133 | return rec
134 |
135 |
136 | def multi_f_beta(pred_y, true_y, labels, beta=1.0):
137 | """
138 | Macro-averaged F-beta score for multi-class classification.
139 | :param pred_y: predicted labels
140 | :param true_y: ground-truth labels
141 | :param labels: list of labels
142 | :param beta: beta value
143 | :return:
144 | """
145 | if isinstance(pred_y[0], list): # pred_y is already a plain list here
146 | pred_y = [item[0] for item in pred_y]
147 |
148 | f_betas = [binary_f_beta(pred_y, true_y, beta, label) for label in labels]
149 | f_beta = mean(f_betas)
150 | return f_beta
151 |
152 |
153 | def get_binary_metrics(pred_y, true_y, f_beta=1.0):
154 | """
155 | Compute binary classification performance metrics.
156 | :param pred_y:
157 | :param true_y:
158 | :param f_beta:
159 | :return:
160 | """
161 | pred_y = pred_y.tolist()
162 | acc = accuracy(pred_y, true_y)
163 | auc = binary_auc(pred_y, true_y)
164 | recall = binary_recall(pred_y, true_y)
165 | precision = binary_precision(pred_y, true_y)
166 | f_beta = binary_f_beta(pred_y, true_y, f_beta)
167 | return acc, auc, recall, precision, f_beta
168 |
169 |
170 | def get_multi_metrics(pred_y, true_y, labels, f_beta=1.0):
171 | """
172 | Compute multi-class classification performance metrics.
173 | :param pred_y:
174 | :param true_y:
175 | :param labels:
176 | :param f_beta:
177 | :return:
178 | """
179 | pred_y = pred_y.tolist()
180 | acc = accuracy(pred_y, true_y)
181 | recall = multi_recall(pred_y, true_y, labels)
182 | precision = multi_precision(pred_y, true_y, labels)
183 | f_beta = multi_f_beta(pred_y, true_y, labels, f_beta)
184 | return acc, recall, precision, f_beta
--------------------------------------------------------------------------------
/bert_task/bert/tokenization_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import
16 | from __future__ import division
17 | from __future__ import print_function
18 |
19 | import os
20 | import tempfile
21 | import tokenization
22 | import six
23 | import tensorflow as tf
24 |
25 |
26 | class TokenizationTest(tf.test.TestCase):
27 |
28 | def test_full_tokenizer(self):
29 | vocab_tokens = [
30 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
31 | "##ing", ","
32 | ]
33 | with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
34 | if six.PY2:
35 | vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
36 | else:
37 | vocab_writer.write("".join(
38 | [x + "\n" for x in vocab_tokens]).encode("utf-8"))
39 |
40 | vocab_file = vocab_writer.name
41 |
42 | tokenizer = tokenization.FullTokenizer(vocab_file)
43 | os.unlink(vocab_file)
44 |
45 | tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
46 | self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
47 |
48 | self.assertAllEqual(
49 | tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
50 |
51 | def test_chinese(self):
52 | tokenizer = tokenization.BasicTokenizer()
53 |
54 | self.assertAllEqual(
55 | tokenizer.tokenize(u"ah\u535A\u63A8zz"),
56 | [u"ah", u"\u535A", u"\u63A8", u"zz"])
57 |
58 | def test_basic_tokenizer_lower(self):
59 | tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
60 |
61 | self.assertAllEqual(
62 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
63 | ["hello", "!", "how", "are", "you", "?"])
64 | self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])
65 |
66 | def test_basic_tokenizer_no_lower(self):
67 | tokenizer = tokenization.BasicTokenizer(do_lower_case=False)
68 |
69 | self.assertAllEqual(
70 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
71 | ["HeLLo", "!", "how", "Are", "yoU", "?"])
72 |
73 | def test_wordpiece_tokenizer(self):
74 | vocab_tokens = [
75 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
76 | "##ing"
77 | ]
78 |
79 | vocab = {}
80 | for (i, token) in enumerate(vocab_tokens):
81 | vocab[token] = i
82 | tokenizer = tokenization.WordpieceTokenizer(vocab=vocab)
83 |
84 | self.assertAllEqual(tokenizer.tokenize(""), [])
85 |
86 | self.assertAllEqual(
87 | tokenizer.tokenize("unwanted running"),
88 | ["un", "##want", "##ed", "runn", "##ing"])
89 |
90 | self.assertAllEqual(
91 | tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
92 |
93 | def test_convert_tokens_to_ids(self):
94 | vocab_tokens = [
95 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
96 | "##ing"
97 | ]
98 |
99 | vocab = {}
100 | for (i, token) in enumerate(vocab_tokens):
101 | vocab[token] = i
102 |
103 | self.assertAllEqual(
104 | tokenization.convert_tokens_to_ids(
105 | vocab, ["un", "##want", "##ed", "runn", "##ing"]), [7, 4, 5, 8, 9])
106 |
107 | def test_is_whitespace(self):
108 | self.assertTrue(tokenization._is_whitespace(u" "))
109 | self.assertTrue(tokenization._is_whitespace(u"\t"))
110 | self.assertTrue(tokenization._is_whitespace(u"\r"))
111 | self.assertTrue(tokenization._is_whitespace(u"\n"))
112 | self.assertTrue(tokenization._is_whitespace(u"\u00A0"))
113 |
114 | self.assertFalse(tokenization._is_whitespace(u"A"))
115 | self.assertFalse(tokenization._is_whitespace(u"-"))
116 |
117 | def test_is_control(self):
118 | self.assertTrue(tokenization._is_control(u"\u0005"))
119 |
120 | self.assertFalse(tokenization._is_control(u"A"))
121 | self.assertFalse(tokenization._is_control(u" "))
122 | self.assertFalse(tokenization._is_control(u"\t"))
123 | self.assertFalse(tokenization._is_control(u"\r"))
124 | self.assertFalse(tokenization._is_control(u"\U0001F4A9"))
125 |
126 | def test_is_punctuation(self):
127 | self.assertTrue(tokenization._is_punctuation(u"-"))
128 | self.assertTrue(tokenization._is_punctuation(u"$"))
129 | self.assertTrue(tokenization._is_punctuation(u"`"))
130 | self.assertTrue(tokenization._is_punctuation(u"."))
131 |
132 | self.assertFalse(tokenization._is_punctuation(u"A"))
133 | self.assertFalse(tokenization._is_punctuation(u" "))
134 |
135 |
136 | if __name__ == "__main__":
137 | tf.test.main()
138 |
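139 | # The tests are self-contained (each builds a throwaway vocab file), so the
140 | # suite can be run directly with `python tokenization_test.py`.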
--------------------------------------------------------------------------------
/bert_task/sentence_pair_task/predict.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import sys
4 | sys.path.append(os.path.abspath(os.path.dirname(os.getcwd())))
5 |
6 | import tensorflow as tf
7 | from model import BertSentencePair
8 | from bert import tokenization
9 |
10 |
11 | class Predictor(object):
12 | def __init__(self, config):
13 | self.model = None
14 | self.config = config
15 |
16 | self.output_path = config["output_path"]
17 | self.vocab_path = os.path.join(config["bert_model_path"], "vocab.txt")
18 | self.label_to_index = self.load_vocab()
19 | self.index_to_label = {value: key for key, value in self.label_to_index.items()}
20 | self.word_vectors = None
21 | self.sequence_length = self.config["sequence_length"]
22 |
23 | # build the model
24 | self.create_model()
25 | # load the computation graph
26 | self.load_graph()
27 |
28 | def load_vocab(self):
29 | # load the label-to-index mapping
30 |
31 | with open(os.path.join(self.output_path, "label_to_index.json"), "r") as f:
32 | label_to_index = json.load(f)
33 |
34 | return label_to_index
35 |
36 | def _truncate_seq_pair(self, tokens_a, tokens_b, max_len):
37 | """Truncates a sequence pair in place to the maximum length."""
38 |
39 | # This is a simple heuristic which will always truncate the longer sequence
40 | # one token at a time. This makes more sense than truncating an equal percent
41 | # of tokens from each, since if one sequence is very short then each token
42 | # that's truncated likely contains more information than a longer sequence.
43 | while True:
44 | total_length = len(tokens_a) + len(tokens_b)
45 | if total_length <= max_len:
46 | break
47 | if len(tokens_a) > len(tokens_b):
48 | tokens_a.pop()
49 | else:
50 | tokens_b.pop()
51 |
52 | def padding(self, input_id, input_mask, segment_id):
53 | """
54 | Pad the sequence to the fixed sequence length.
55 | :param input_id:
56 | :param input_mask:
57 | :param segment_id:
58 | :return:
59 | """
60 |
61 | if len(input_id) < self.sequence_length:
62 | pad_input_id = input_id + [0] * (self.sequence_length - len(input_id))
63 | pad_input_mask = input_mask + [0] * (self.sequence_length - len(input_mask))
64 | pad_segment_id = segment_id + [0] * (self.sequence_length - len(segment_id))
65 | else:
66 | pad_input_id = input_id[:self.sequence_length]
67 | pad_input_mask = input_mask[:self.sequence_length]
68 | pad_segment_id = segment_id[:self.sequence_length]
69 |
70 | return pad_input_id, pad_input_mask, pad_segment_id
71 |
72 | def sentence_to_idx(self, text_a, text_b):
73 | """
74 | Convert a sentence pair into its index representation.
75 | :return:
76 | """
77 | tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_path, do_lower_case=True)
78 |
79 | text_a = tokenization.convert_to_unicode(text_a)
80 | text_b = tokenization.convert_to_unicode(text_b)
81 | tokens_a = tokenizer.tokenize(text_a)
82 | tokens_b = tokenizer.tokenize(text_b)
83 |
84 | # truncate so that the combined pair does not exceed the maximum length
85 | self._truncate_seq_pair(tokens_a, tokens_b, self.sequence_length - 3)
86 |
87 | tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
88 | input_id = tokenizer.convert_tokens_to_ids(tokens)
89 | input_mask = [1] * len(input_id)
90 | segment_id = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
91 |
92 | input_id, input_mask, segment_id = self.padding(input_id, input_mask, segment_id)
93 |
94 | return [input_id], [input_mask], [segment_id]
95 |
96 | def load_graph(self):
97 | """
98 | Load the computation graph.
99 | :return:
100 | """
101 | self.sess = tf.Session()
102 | ckpt = tf.train.get_checkpoint_state(self.config["ckpt_model_path"])
103 | if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
104 | print('Reloading model parameters..')
105 | self.model.saver.restore(self.sess, ckpt.model_checkpoint_path)
106 | else:
107 | raise ValueError('No such file:[{}]'.format(self.config["ckpt_model_path"]))
108 |
109 | def create_model(self):
110 | """
111 | Select the model specified by the config file and initialize it.
112 | :return:
113 | """
114 | self.model = BertSentencePair(config=self.config, is_training=False)
115 |
116 | def predict(self, text_a, text_b):
117 | """
118 | Given a pair of sentences, predict the classification result.
119 | :param text_a:
120 | :param text_b:
121 | :return:
122 | """
123 | input_id, input_mask, segment_id = self.sentence_to_idx(text_a, text_b)
124 |
125 | prediction = self.model.infer(self.sess,
126 | dict(input_ids=input_id,
127 | input_masks=input_mask,
128 | segment_ids=segment_id)).tolist()[0][0]
129 | label = self.index_to_label[prediction]
130 | return label
131 |
132 |
133 |
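134 | # Usage sketch (illustrative), assuming a task config such as
135 | # config/lcqmc_config.json with output_path, bert_model_path, sequence_length
136 | # and ckpt_model_path filled in:
137 | # with open("config/lcqmc_config.json", "r") as fr:
138 | # config = json.load(fr)
139 | # predictor = Predictor(config)
140 | # label = predictor.predict("sentence a", "sentence b")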
--------------------------------------------------------------------------------
/albert_task/sentence_pair_task/predict.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import sys
4 | sys.path.append(os.path.abspath(os.path.dirname(os.getcwd())))
5 |
6 | import tensorflow as tf
7 | from model import AlbertSentencePair
8 | from albert import tokenization
9 |
10 |
11 | class Predictor(object):
12 | def __init__(self, config):
13 | self.model = None
14 | self.config = config
15 |
16 | self.output_path = config["output_path"]
17 | self.vocab_path = os.path.join(config["bert_model_path"], "vocab.txt")
18 | self.label_to_index = self.load_vocab()
19 | self.index_to_label = {value: key for key, value in self.label_to_index.items()}
20 | self.word_vectors = None
21 | self.sequence_length = self.config["sequence_length"]
22 |
23 | # build the model
24 | self.create_model()
25 | # load the computation graph
26 | self.load_graph()
27 |
28 | def load_vocab(self):
29 | # load the label-to-index mapping
30 |
31 | with open(os.path.join(self.output_path, "label_to_index.json"), "r") as f:
32 | label_to_index = json.load(f)
33 |
34 | return label_to_index
35 |
36 | def _truncate_seq_pair(self, tokens_a, tokens_b, max_len):
37 | """Truncates a sequence pair in place to the maximum length."""
38 |
39 | # This is a simple heuristic which will always truncate the longer sequence
40 | # one token at a time. This makes more sense than truncating an equal percent
41 | # of tokens from each, since if one sequence is very short then each token
42 | # that's truncated likely contains more information than a longer sequence.
43 | while True:
44 | total_length = len(tokens_a) + len(tokens_b)
45 | if total_length <= max_len:
46 | break
47 | if len(tokens_a) > len(tokens_b):
48 | tokens_a.pop()
49 | else:
50 | tokens_b.pop()
51 |
52 | def padding(self, input_id, input_mask, segment_id):
53 | """
54 | Pad the sequence to the fixed sequence length.
55 | :param input_id:
56 | :param input_mask:
57 | :param segment_id:
58 | :return:
59 | """
60 |
61 | if len(input_id) < self.sequence_length:
62 | pad_input_id = input_id + [0] * (self.sequence_length - len(input_id))
63 | pad_input_mask = input_mask + [0] * (self.sequence_length - len(input_mask))
64 | pad_segment_id = segment_id + [0] * (self.sequence_length - len(segment_id))
65 | else:
66 | pad_input_id = input_id[:self.sequence_length]
67 | pad_input_mask = input_mask[:self.sequence_length]
68 | pad_segment_id = segment_id[:self.sequence_length]
69 |
70 | return pad_input_id, pad_input_mask, pad_segment_id
71 |
72 | def sentence_to_idx(self, text_a, text_b):
73 | """
74 | Convert a sentence pair into its index representation.
75 | :return:
76 | """
77 | tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_path, do_lower_case=True)
78 |
79 | text_a = tokenization.convert_to_unicode(text_a)
80 | text_b = tokenization.convert_to_unicode(text_b)
81 | tokens_a = tokenizer.tokenize(text_a)
82 | tokens_b = tokenizer.tokenize(text_b)
83 |
84 | # truncate so that the combined pair does not exceed the maximum length
85 | self._truncate_seq_pair(tokens_a, tokens_b, self.sequence_length - 3)
86 |
87 | tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
88 | input_id = tokenizer.convert_tokens_to_ids(tokens)
89 | input_mask = [1] * len(input_id)
90 | segment_id = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
91 |
92 | input_id, input_mask, segment_id = self.padding(input_id, input_mask, segment_id)
93 |
94 | return [input_id], [input_mask], [segment_id]
95 |
96 | def load_graph(self):
97 | """
98 | Load the computation graph.
99 | :return:
100 | """
101 | self.sess = tf.Session()
102 | ckpt = tf.train.get_checkpoint_state(self.config["ckpt_model_path"])
103 | if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
104 | print('Reloading model parameters..')
105 | self.model.saver.restore(self.sess, ckpt.model_checkpoint_path)
106 | else:
107 | raise ValueError('No such file:[{}]'.format(self.config["ckpt_model_path"]))
108 |
109 | def create_model(self):
110 | """
111 | Select the model specified by the config file and initialize it.
112 | :return:
113 | """
114 | self.model = AlbertSentencePair(config=self.config, is_training=False)
115 |
116 | def predict(self, text_a, text_b):
117 | """
118 | Given a pair of sentences, predict the classification result.
119 | :param text_a:
120 | :param text_b:
121 | :return:
122 | """
123 | input_id, input_mask, segment_id = self.sentence_to_idx(text_a, text_b)
124 |
125 | prediction = self.model.infer(self.sess,
126 | dict(input_ids=input_id,
127 | input_masks=input_mask,
128 | segment_ids=segment_id)).tolist()[0][0]
129 | label = self.index_to_label[prediction]
130 | return label
131 |
132 |
133 |
--------------------------------------------------------------------------------
/bert_task/sentence_pair_task/model.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | sys.path.append(os.path.dirname(os.getcwd()))
4 | import tensorflow as tf
5 |
6 | from bert import modeling
7 | from bert import optimization
8 |
9 |
10 | class BertSentencePair(object):
11 | def __init__(self, config, is_training=True, num_train_step=None, num_warmup_step=None):
12 | self.__bert_config_path = os.path.join(config["bert_model_path"], "bert_config.json")
13 | self.__num_classes = config["num_classes"]
14 | self.__learning_rate = config["learning_rate"]
15 | self.__is_training = is_training
16 | self.__num_train_step = num_train_step
17 | self.__num_warmup_step = num_warmup_step
18 |
19 | self.input_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_ids')
20 | self.input_masks = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_mask')
21 | self.segment_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name='segment_ids')
22 | self.label_ids = tf.placeholder(dtype=tf.int32, shape=[None], name="label_ids")
23 |
24 | self.built_model()
25 | self.init_saver()
26 |
27 | def built_model(self):
28 | bert_config = modeling.BertConfig.from_json_file(self.__bert_config_path)
29 |
30 | model = modeling.BertModel(config=bert_config,
31 | is_training=self.__is_training,
32 | input_ids=self.input_ids,
33 | input_mask=self.input_masks,
34 | token_type_ids=self.segment_ids,
35 | use_one_hot_embeddings=False)
36 | output_layer = model.get_pooled_output()
37 |
38 | hidden_size = output_layer.shape[-1].value
39 | if self.__is_training:
40 | # I.e., 0.1 dropout
41 | output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
42 |
43 | with tf.name_scope("output"):
44 | output_weights = tf.get_variable(
45 | "output_weights", [self.__num_classes, hidden_size],
46 | initializer=tf.truncated_normal_initializer(stddev=0.02))
47 |
48 | output_bias = tf.get_variable(
49 | "output_bias", [self.__num_classes], initializer=tf.zeros_initializer())
50 |
51 | logits = tf.matmul(output_layer, output_weights, transpose_b=True)
52 | logits = tf.nn.bias_add(logits, output_bias)
53 | if self.__num_classes == 1:
54 | self.predictions = tf.cast(tf.greater_equal(logits, 0.0), dtype=tf.int32, name="predictions")
55 | else:
56 | self.predictions = tf.argmax(logits, axis=-1, name="predictions")
57 |
58 | if self.__is_training:
59 |
60 | with tf.name_scope("loss"):
61 | if self.__num_classes == 1:
62 | losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=tf.reshape(logits, [-1]),
63 | labels=tf.cast(self.label_ids, dtype=tf.float32))
64 | else:
65 | losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=self.label_ids)
66 | self.loss = tf.reduce_mean(losses, name="loss")
67 |
68 | with tf.name_scope('train_op'):
69 | self.train_op = optimization.create_optimizer(
70 | self.loss, self.__learning_rate, self.__num_train_step, self.__num_warmup_step, use_tpu=False)
71 |
72 | def init_saver(self):
73 | self.saver = tf.train.Saver(tf.global_variables())
74 |
75 | def train(self, sess, batch):
76 | """
77 | Train the model.
78 | :param sess: TensorFlow session object
79 | :param batch: batch of data
80 | :return: loss and predictions
81 | """
82 |
83 | feed_dict = {self.input_ids: batch["input_ids"],
84 | self.input_masks: batch["input_masks"],
85 | self.segment_ids: batch["segment_ids"],
86 | self.label_ids: batch["label_ids"]}
87 |
88 | # run one training step
89 | _, loss, predictions = sess.run([self.train_op, self.loss, self.predictions], feed_dict=feed_dict)
90 | return loss, predictions
91 |
92 | def eval(self, sess, batch):
93 | """
94 | Evaluate the model.
95 | :param sess: TensorFlow session object
96 | :param batch: batch of data
97 | :return: loss and predictions
98 | """
99 | feed_dict = {self.input_ids: batch["input_ids"],
100 | self.input_masks: batch["input_masks"],
101 | self.segment_ids: batch["segment_ids"],
102 | self.label_ids: batch["label_ids"]}
103 |
104 | loss, predictions = sess.run([self.loss, self.predictions], feed_dict=feed_dict)
105 | return loss, predictions
106 |
107 | def infer(self, sess, batch):
108 | """
109 | Predict on new data.
110 | :param sess: TensorFlow session object
111 | :param batch: batch of data
112 | :return: predictions
113 | """
114 | feed_dict = {self.input_ids: batch["input_ids"],
115 | self.input_masks: batch["input_masks"],
116 | self.segment_ids: batch["segment_ids"]}
117 |
118 | predict = sess.run(self.predictions, feed_dict=feed_dict)
119 |
120 | return predict
121 |
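122 | # Note: with num_classes == 1 the head is a single sigmoid logit trained against
123 | # 0/1 float labels and predictions come back as int32 0/1; with num_classes > 1
124 | # it is a softmax over classes with argmax predictions.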
--------------------------------------------------------------------------------
/albert_task/sentence_pair_task/model.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | sys.path.append(os.path.dirname(os.getcwd()))
4 | import tensorflow as tf
5 |
6 | from albert import modeling
7 | from albert import optimization
8 |
9 |
10 | class AlbertSentencePair(object):
11 | def __init__(self, config, is_training=True, num_train_step=None, num_warmup_step=None):
12 | self.__bert_config_path = os.path.join(config["bert_model_path"], "albert_config.json")
13 | self.__num_classes = config["num_classes"]
14 | self.__learning_rate = config["learning_rate"]
15 | self.__is_training = is_training
16 | self.__num_train_step = num_train_step
17 | self.__num_warmup_step = num_warmup_step
18 |
19 | self.input_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_ids')
20 | self.input_masks = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_mask')
21 | self.segment_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name='segment_ids')
22 | self.label_ids = tf.placeholder(dtype=tf.int32, shape=[None], name="label_ids")
23 |
24 | self.built_model()
25 | self.init_saver()
26 |
27 | def built_model(self):
28 | bert_config = modeling.BertConfig.from_json_file(self.__bert_config_path)
29 |
30 | model = modeling.BertModel(config=bert_config,
31 | is_training=self.__is_training,
32 | input_ids=self.input_ids,
33 | input_mask=self.input_masks,
34 | token_type_ids=self.segment_ids,
35 | use_one_hot_embeddings=False)
36 | output_layer = model.get_pooled_output()
37 |
38 | hidden_size = output_layer.shape[-1].value
39 | if self.__is_training:
40 | # I.e., 0.1 dropout
41 | output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
42 |
43 | with tf.name_scope("output"):
44 | output_weights = tf.get_variable(
45 | "output_weights", [self.__num_classes, hidden_size],
46 | initializer=tf.truncated_normal_initializer(stddev=0.02))
47 |
48 | output_bias = tf.get_variable(
49 | "output_bias", [self.__num_classes], initializer=tf.zeros_initializer())
50 |
51 | logits = tf.matmul(output_layer, output_weights, transpose_b=True)
52 | logits = tf.nn.bias_add(logits, output_bias)
53 | if self.__num_classes == 1:
54 | self.predictions = tf.cast(tf.greater_equal(logits, 0.0), dtype=tf.int32, name="predictions")
55 | else:
56 | self.predictions = tf.argmax(logits, axis=-1, name="predictions")
57 |
58 | if self.__is_training:
59 | with tf.name_scope("loss"):
60 | if self.__num_classes == 1:
61 | losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=tf.reshape(logits, [-1]),
62 | labels=tf.cast(self.label_ids, dtype=tf.float32))
63 | else:
64 | losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=self.label_ids)
65 | self.loss = tf.reduce_mean(losses, name="loss")
66 |
67 | with tf.name_scope('train_op'):
68 | self.train_op = optimization.create_optimizer(
69 | self.loss, self.__learning_rate, self.__num_train_step, self.__num_warmup_step, use_tpu=False)
70 |
71 | def init_saver(self):
72 | self.saver = tf.train.Saver(tf.global_variables())
73 |
74 | def train(self, sess, batch):
75 | """
76 | Train the model.
77 | :param sess: TensorFlow session object
78 | :param batch: batch of data
79 | :return: loss and predictions
80 | """
81 |
82 | feed_dict = {self.input_ids: batch["input_ids"],
83 | self.input_masks: batch["input_masks"],
84 | self.segment_ids: batch["segment_ids"],
85 | self.label_ids: batch["label_ids"]}
86 |
87 | # run one training step
88 | _, loss, predictions = sess.run([self.train_op, self.loss, self.predictions], feed_dict=feed_dict)
89 | return loss, predictions
90 |
91 | def eval(self, sess, batch):
92 | """
93 | Evaluate the model.
94 | :param sess: TensorFlow session object
95 | :param batch: batch of data
96 | :return: loss and predictions
97 | """
98 | feed_dict = {self.input_ids: batch["input_ids"],
99 | self.input_masks: batch["input_masks"],
100 | self.segment_ids: batch["segment_ids"],
101 | self.label_ids: batch["label_ids"]}
102 |
103 | loss, predictions = sess.run([self.loss, self.predictions], feed_dict=feed_dict)
104 | return loss, predictions
105 |
106 | def infer(self, sess, batch):
107 | """
108 | Predict on new data.
109 | :param sess: TensorFlow session object
110 | :param batch: batch of data
111 | :return: predictions
112 | """
113 | feed_dict = {self.input_ids: batch["input_ids"],
114 | self.input_masks: batch["input_masks"],
115 | self.segment_ids: batch["segment_ids"]}
116 |
117 | predict = sess.run(self.predictions, feed_dict=feed_dict)
118 |
119 | return predict
120 |
--------------------------------------------------------------------------------
/albert_task/ner_task/model.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | sys.path.append(os.path.dirname(os.getcwd()))
5 | import tensorflow as tf
6 |
7 | from albert import modeling
8 | from albert import optimization_finetuning as optimization
9 | from bilstm_crf import BiLSTMCRF
10 |
11 |
12 | class ALBertNer(object):
13 | def __init__(self, config, is_training=True, num_train_step=None, num_warmup_step=None):
14 | self.__bert_config_path = os.path.join(config["bert_model_path"], "albert_config.json")
15 | self.__num_classes = config["num_classes"]
16 | self.__learning_rate = config["learning_rate"]
17 | self.__ner_layers = config["ner_layers"]
18 | self.__ner_hidden_sizes = config["ner_hidden_sizes"]
19 | self.__max_len = config["sequence_length"]
20 | self.__is_training = is_training
21 | self.__num_train_step = num_train_step
22 | self.__num_warmup_step = num_warmup_step
23 |
24 | self.input_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_ids')
25 | self.input_masks = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_mask')
26 | self.segment_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name='segment_ids')
27 | self.label_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name="label_ids")
28 | self.sequence_len = tf.placeholder(dtype=tf.int32, shape=[None], name="sequence_len")
29 | self.keep_prob = tf.placeholder(dtype=tf.float32, shape=None, name="keep_prob")
30 |
31 | self.built_model()
32 | self.init_saver()
33 |
34 | def built_model(self):
35 | bert_config = modeling.BertConfig.from_json_file(self.__bert_config_path)
36 |
37 | model = modeling.BertModel(config=bert_config,
38 | is_training=self.__is_training,
39 | input_ids=self.input_ids,
40 | input_mask=self.input_masks,
41 | token_type_ids=self.segment_ids,
42 | use_one_hot_embeddings=False)
43 |
44 | # take the output of BERT's final layer
45 | output_layer = model.get_sequence_output()
46 |
47 | if self.__is_training:
48 | output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
49 |
50 | ner_model = BiLSTMCRF(embedded_chars=output_layer,
51 | hidden_sizes=self.__ner_hidden_sizes,
52 | layers=self.__ner_layers,
53 | keep_prob=self.keep_prob,
54 | num_labels=self.__num_classes,
55 | max_len=self.__max_len,
56 | labels=self.label_ids,
57 | sequence_lens=self.sequence_len,
58 | is_training=self.__is_training)
59 |
60 | self.loss, self.true_y, self.predictions = ner_model.construct_graph()
61 |
62 | if self.__is_training:
63 | with tf.name_scope('train_op'):
64 | self.train_op = optimization.create_optimizer(
65 | self.loss, self.__learning_rate, self.__num_train_step, self.__num_warmup_step, use_tpu=False)
66 |
67 | def init_saver(self):
68 | self.saver = tf.train.Saver(tf.global_variables())
69 |
70 | def train(self, sess, batch, dropout_rate):
71 | """
72 | Train the model.
73 | :param sess: TensorFlow session object
74 | :param batch: batch of data
75 | :param dropout_rate: value fed to the keep_prob placeholder (a keep probability)
76 | :return: loss, ground-truth labels and predictions
77 | """
78 |
79 | feed_dict = {self.input_ids: batch["input_ids"],
80 | self.input_masks: batch["input_masks"],
81 | self.segment_ids: batch["segment_ids"],
82 | self.label_ids: batch["label_ids"],
83 | self.sequence_len: batch["sequence_len"],
84 | self.keep_prob: dropout_rate}
85 |
86 | # run one training step
87 | _, loss, true_y, predictions = sess.run([self.train_op, self.loss, self.true_y, self.predictions],
88 | feed_dict=feed_dict)
89 | return loss, true_y, predictions
90 |
91 | def eval(self, sess, batch):
92 | """
93 | Evaluate the model.
94 | :param sess: TensorFlow session object
95 | :param batch: batch of data
96 | :return: loss, ground-truth labels and predictions
97 | """
98 | feed_dict = {self.input_ids: batch["input_ids"],
99 | self.input_masks: batch["input_masks"],
100 | self.segment_ids: batch["segment_ids"],
101 | self.label_ids: batch["label_ids"],
102 | self.sequence_len: batch["sequence_len"],
103 | self.keep_prob: 1.0}
104 |
105 | loss, true_y, predictions = sess.run([self.loss, self.true_y, self.predictions], feed_dict=feed_dict)
106 | return loss, true_y, predictions
107 |
108 | def infer(self, sess, batch):
109 | """
110 | Predict on new data.
111 | :param sess: TensorFlow session object
112 | :param batch: batch of data
113 | :return: predictions
114 | """
115 | feed_dict = {self.input_ids: batch["input_ids"],
116 | self.input_masks: batch["input_masks"],
117 | self.segment_ids: batch["segment_ids"],
118 | self.sequence_len: batch["sequence_len"],
119 | self.keep_prob: 1.0}
120 |
121 | predict = sess.run(self.predictions, feed_dict=feed_dict)
122 |
123 | return predict
124 |
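125 | # Note: unlike the classifier models, NER batches must also carry "sequence_len",
126 | # and dropout is set at feed time through the keep_prob placeholder, e.g.
127 | # model.train(sess, batch, dropout_rate=0.9), where 0.9 is the keep probability.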
--------------------------------------------------------------------------------
/bert_task/ner_task/model.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | sys.path.append(os.path.dirname(os.getcwd()))
5 | import tensorflow as tf
6 |
7 | from bert import modeling
8 | from bert import optimization
9 | from bilstm_crf import BiLSTMCRF
10 |
11 |
12 | class BertNer(object):
13 | def __init__(self, config, is_training=True, num_train_step=None, num_warmup_step=None):
14 | self.__bert_config_path = os.path.join(config["bert_model_path"], "bert_config.json")
15 | self.__num_classes = config["num_classes"]
16 | self.__learning_rate = config["learning_rate"]
17 | self.__ner_layers = config["ner_layers"]
18 | self.__ner_hidden_sizes = config["ner_hidden_sizes"]
19 | self.__max_len = config["sequence_length"]
20 | self.__is_training = is_training
21 | self.__num_train_step = num_train_step
22 | self.__num_warmup_step = num_warmup_step
23 |
24 | self.input_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_ids')
25 | self.input_masks = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_mask')
26 | self.segment_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name='segment_ids')
27 | self.label_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name="label_ids")
28 | self.sequence_len = tf.placeholder(dtype=tf.int32, shape=[None], name="sequence_len")
29 | self.keep_prob = tf.placeholder(dtype=tf.float32, shape=None, name="keep_prob")
30 |
31 | self.built_model()
32 | self.init_saver()
33 |
34 | def built_model(self):
35 | bert_config = modeling.BertConfig.from_json_file(self.__bert_config_path)
36 |
37 | model = modeling.BertModel(config=bert_config,
38 | is_training=self.__is_training,
39 | input_ids=self.input_ids,
40 | input_mask=self.input_masks,
41 | token_type_ids=self.segment_ids,
42 | use_one_hot_embeddings=False)
43 |
44 | # take the output of BERT's final layer
45 | output_layer = model.get_sequence_output()
46 |
47 | hidden_size = output_layer.shape[-1].value
48 | if self.__is_training:
49 | output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
50 |
51 | ner_model = BiLSTMCRF(embedded_chars=output_layer,
52 | hidden_sizes=self.__ner_hidden_sizes,
53 | layers=self.__ner_layers,
54 | keep_prob=self.keep_prob,
55 | num_labels=self.__num_classes,
56 | max_len=self.__max_len,
57 | labels=self.label_ids,
58 | sequence_lens=self.sequence_len,
59 | is_training=self.__is_training)
60 |
61 | self.loss, self.true_y, self.predictions = ner_model.construct_graph()
62 |
63 | if self.__is_training:
64 | with tf.name_scope('train_op'):
65 | self.train_op = optimization.create_optimizer(
66 | self.loss, self.__learning_rate, self.__num_train_step, self.__num_warmup_step, use_tpu=False)
67 |
68 | def init_saver(self):
69 | self.saver = tf.train.Saver(tf.global_variables())
70 |
71 | def train(self, sess, batch, dropout_rate):
72 | """
73 | Train the model.
74 | :param sess: TensorFlow session object
75 | :param batch: batch of data
76 | :param dropout_rate: value fed to the keep_prob placeholder (a keep probability)
77 | :return: loss, ground-truth labels and predictions
78 | """
79 |
80 | feed_dict = {self.input_ids: batch["input_ids"],
81 | self.input_masks: batch["input_masks"],
82 | self.segment_ids: batch["segment_ids"],
83 | self.label_ids: batch["label_ids"],
84 | self.sequence_len: batch["sequence_len"],
85 | self.keep_prob: dropout_rate}
86 |
87 | # run one training step
88 | _, loss, true_y, predictions = sess.run([self.train_op, self.loss, self.true_y, self.predictions],
89 | feed_dict=feed_dict)
90 | return loss, true_y, predictions
91 |
92 | def eval(self, sess, batch):
93 | """
94 | Evaluate the model.
95 | :param sess: TensorFlow session object
96 | :param batch: batch of data
97 | :return: loss, ground-truth labels and predictions
98 | """
99 | feed_dict = {self.input_ids: batch["input_ids"],
100 | self.input_masks: batch["input_masks"],
101 | self.segment_ids: batch["segment_ids"],
102 | self.label_ids: batch["label_ids"],
103 | self.sequence_len: batch["sequence_len"],
104 | self.keep_prob: 1.0}
105 |
106 | loss, true_y, predictions = sess.run([self.loss, self.true_y, self.predictions], feed_dict=feed_dict)
107 | return loss, true_y, predictions
108 |
109 | def infer(self, sess, batch):
110 | """
111 | Predict on new data.
112 | :param sess: TensorFlow session object
113 | :param batch: batch of data
114 | :return: predictions
115 | """
116 | feed_dict = {self.input_ids: batch["input_ids"],
117 | self.input_masks: batch["input_masks"],
118 | self.segment_ids: batch["segment_ids"],
119 | self.sequence_len: batch["sequence_len"],
120 | self.keep_prob: 1.0}
121 |
122 | predict = sess.run(self.predictions, feed_dict=feed_dict)
123 |
124 | return predict
125 |
--------------------------------------------------------------------------------
/albert_task/albert/bert_utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import collections
6 | import copy
7 | import json
8 | import math
9 | import re
10 | import six
11 | import tensorflow as tf
12 |
13 |
14 | def get_shape_list(tensor, expected_rank=None, name=None):
15 | """Returns a list of the shape of tensor, preferring static dimensions.
16 |
17 | Args:
18 | tensor: A tf.Tensor object to find the shape of.
19 | expected_rank: (optional) int. The expected rank of `tensor`. If this is
20 | specified and the `tensor` has a different rank, an exception will be
21 | thrown.
22 | name: Optional name of the tensor for the error message.
23 |
24 | Returns:
25 | A list of dimensions of the shape of tensor. All static dimensions will
26 | be returned as python integers, and dynamic dimensions will be returned
27 | as tf.Tensor scalars.
28 | """
29 | if name is None:
30 | name = tensor.name
31 |
32 | if expected_rank is not None:
33 | assert_rank(tensor, expected_rank, name)
34 |
35 | shape = tensor.shape.as_list()
36 |
37 | non_static_indexes = []
38 | for (index, dim) in enumerate(shape):
39 | if dim is None:
40 | non_static_indexes.append(index)
41 |
42 | if not non_static_indexes:
43 | return shape
44 |
45 | dyn_shape = tf.shape(tensor)
46 | for index in non_static_indexes:
47 | shape[index] = dyn_shape[index]
48 | return shape
49 |
50 |
51 | def reshape_to_matrix(input_tensor):
52 | """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix)."""
53 | ndims = input_tensor.shape.ndims
54 | if ndims < 2:
55 | raise ValueError("Input tensor must have at least rank 2. Shape = %s" %
56 | (input_tensor.shape))
57 | if ndims == 2:
58 | return input_tensor
59 |
60 | width = input_tensor.shape[-1]
61 | output_tensor = tf.reshape(input_tensor, [-1, width])
62 | return output_tensor
63 |
64 |
65 | def reshape_from_matrix(output_tensor, orig_shape_list):
66 | """Reshapes a rank 2 tensor back to its original rank >= 2 tensor."""
67 | if len(orig_shape_list) == 2:
68 | return output_tensor
69 |
70 | output_shape = get_shape_list(output_tensor)
71 |
72 | orig_dims = orig_shape_list[0:-1]
73 | width = output_shape[-1]
74 |
75 | return tf.reshape(output_tensor, orig_dims + [width])
76 |
77 |
78 | def assert_rank(tensor, expected_rank, name=None):
79 | """Raises an exception if the tensor rank is not of the expected rank.
80 |
81 | Args:
82 | tensor: A tf.Tensor to check the rank of.
83 | expected_rank: Python integer or list of integers, expected rank.
84 | name: Optional name of the tensor for the error message.
85 |
86 | Raises:
87 | ValueError: If the expected shape doesn't match the actual shape.
88 | """
89 | if name is None:
90 | name = tensor.name
91 |
92 | expected_rank_dict = {}
93 | if isinstance(expected_rank, six.integer_types):
94 | expected_rank_dict[expected_rank] = True
95 | else:
96 | for x in expected_rank:
97 | expected_rank_dict[x] = True
98 |
99 | actual_rank = tensor.shape.ndims
100 | if actual_rank not in expected_rank_dict:
101 | scope_name = tf.get_variable_scope().name
102 | raise ValueError(
103 | "For the tensor `%s` in scope `%s`, the actual rank "
104 | "`%d` (shape = %s) is not equal to the expected rank `%s`" %
105 | (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank)))
106 |
107 |
108 | def gather_indexes(sequence_tensor, positions):
109 | """Gathers the vectors at the specific positions over a minibatch."""
110 | sequence_shape = get_shape_list(sequence_tensor, expected_rank=3)
111 | batch_size = sequence_shape[0]
112 | seq_length = sequence_shape[1]
113 | width = sequence_shape[2]
114 |
115 | flat_offsets = tf.reshape(
116 | tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
117 | flat_positions = tf.reshape(positions + flat_offsets, [-1])
118 | flat_sequence_tensor = tf.reshape(sequence_tensor,
119 | [batch_size * seq_length, width])
120 | output_tensor = tf.gather(flat_sequence_tensor, flat_positions)
121 | return output_tensor
122 |
123 |
124 | # Build a sequence mask for:
125 | # 1. shuffled-order language modeling (XLNet-style randomly shuffled input)
126 | # 2. left-to-right and right-to-left language modeling
127 | # 3. conditional (seq2seq) generation
128 | def generate_seq2seq_mask(attention_mask, mask_sequence, seq_type, **kargs):
129 | if seq_type == 'seq2seq':
130 | if mask_sequence is not None:
131 | seq_shape = get_shape_list(mask_sequence, expected_rank=2)
132 | seq_len = seq_shape[1]
133 | ones = tf.ones((1, seq_len, seq_len))
134 |             a_mask = tf.matrix_band_part(ones, -1, 0)  # lower-triangular (causal) mask
135 | s_ex12 = tf.expand_dims(tf.expand_dims(mask_sequence, 1), 2)
136 | s_ex13 = tf.expand_dims(tf.expand_dims(mask_sequence, 1), 3)
137 | a_mask = (1 - s_ex13) * (1 - s_ex12) + s_ex13 * a_mask
138 | # generate mask of batch x seq_len x seq_len
139 | a_mask = tf.reshape(a_mask, (-1, seq_len, seq_len))
140 | out_mask = attention_mask * a_mask
141 | else:
142 | ones = tf.ones_like(attention_mask[:1])
143 |             mask = tf.matrix_band_part(ones, -1, 0)  # lower-triangular (causal) mask
144 | out_mask = attention_mask * mask
145 | else:
146 | out_mask = attention_mask
147 |
148 | return out_mask
149 |
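Usage sketch for the shape helpers above (TF 1.x; uses get_shape_list and gather_indexes exactly as defined in this file, with illustrative tensor shapes):

    import tensorflow as tf

    # Batch dimension is dynamic (None); seq_length and width are static.
    sequence = tf.placeholder(tf.float32, shape=[None, 128, 768])
    shape = get_shape_list(sequence, expected_rank=3)
    # -> [<int32 scalar Tensor>, 128, 768]: dynamic dims come back as
    #    tf.Tensor scalars, static dims as plain Python ints.

    # Gather the hidden vectors at given token positions (e.g. for an MLM head).
    positions = tf.placeholder(tf.int32, shape=[None, 20])
    masked = gather_indexes(sequence, positions)  # shape [batch * 20, 768]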
--------------------------------------------------------------------------------
/albert_task/machine_reading_task/trainer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 | import time
5 | import sys
6 |
7 | sys.path.append(os.path.dirname(os.getcwd()))
8 | import tensorflow as tf
9 | from albert import modeling
10 | from model import AlbertMachineReading
11 | from data_helper import TrainData
12 | from metrics import get_eval, write_predictions
13 |
14 |
15 | class Trainer(object):
16 | def __init__(self, args):
17 | self.args = args
18 | with open(args.config_path, "r") as fr:
19 | self.config = json.load(fr)
20 | self.__bert_checkpoint_path = os.path.join(self.config["bert_model_path"], "albert_model.ckpt")
21 |
22 |         # Load the datasets
23 | self.data_obj = self.load_data()
24 | self.t_features = self.data_obj.gen_data(self.config["train_data"])
25 |
26 | self.e_examples, self.e_features = self.data_obj.gen_data(self.config["eval_data"], is_training=False)
27 | print("train data size: {}".format(len(self.t_features)))
28 | print("eval data size: {}".format(len(self.e_features)))
29 |
30 | num_train_steps = int(
31 | len(self.t_features) / self.config["batch_size"] * self.config["epochs"])
32 | num_warmup_steps = int(num_train_steps * self.config["warmup_rate"])
33 |         # Initialize the model object
34 | self.model = self.create_model(num_train_steps, num_warmup_steps)
35 |
36 | def load_data(self):
37 | """
38 |         Create the data object.
39 |         :return: a TrainData instance
40 |         """
41 |         # Build the data object used to generate the training data
42 | data_obj = TrainData(self.config)
43 | return data_obj
44 |
45 | def create_model(self, num_train_step, num_warmup_step):
46 | """
47 |         Pick the model specified by the config file and initialize it.
48 |         :return: the initialized model
49 | """
50 | model = AlbertMachineReading(config=self.config, num_train_step=num_train_step, num_warmup_step=num_warmup_step)
51 | return model
52 |
53 | def train(self):
54 | with tf.Session() as sess:
55 | tvars = tf.trainable_variables()
56 | (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
57 | tvars, self.__bert_checkpoint_path)
58 | print("init bert model params")
59 | tf.train.init_from_checkpoint(self.__bert_checkpoint_path, assignment_map)
60 | print("init bert model params done")
61 | sess.run(tf.variables_initializer(tf.global_variables()))
62 |
63 | current_step = 0
64 | start = time.time()
65 | for epoch in range(self.config["epochs"]):
66 | print("----- Epoch {}/{} -----".format(epoch + 1, self.config["epochs"]))
67 |
68 | for batch in self.data_obj.next_batch(self.t_features):
69 | loss, start_logits, end_logits = self.model.train(sess, batch)
70 | # print("start: ", start_logits)
71 | # print("end: ", end_logits)
72 | print("train: step: {}, loss: {}".format(current_step, loss))
73 |
74 | current_step += 1
75 | if self.data_obj and current_step % self.config["checkpoint_every"] == 0:
76 |
77 | all_results = []
78 | for eval_batch in self.data_obj.next_batch(self.e_features, is_training=False):
79 | start_logits, end_logits = self.model.eval(sess, eval_batch)
80 |
81 | for unique_id, start_logit, end_logit in zip(eval_batch["unique_id"],
82 | start_logits,
83 | end_logits):
84 | all_results.append(dict(unique_id=unique_id,
85 | start_logits=start_logit.tolist(),
86 | end_logits=end_logit.tolist()))
87 |                         os.makedirs("output/cmrc2018", exist_ok=True)  # the output dir is hard-coded; create it if missing
88 |                         with open("output/cmrc2018/results.json", "w", encoding="utf8") as fw:
89 |                             json.dump(all_results, fw, indent=4, ensure_ascii=False)
90 |
91 | write_predictions(all_examples=self.e_examples,
92 | all_features=self.e_features,
93 | all_results=all_results,
94 | n_best_size=self.config["n_best_size"],
95 | max_answer_length=self.config["max_answer_length"],
96 | output_prediction_file=self.config["output_predictions_path"],
97 | output_nbest_file=self.config["output_nbest_path"])
98 |
99 | result = get_eval(original_file=self.config["eval_data"],
100 | prediction_file=self.config["output_predictions_path"])
101 |
102 | print("\n")
103 | print("eval: step: {}, f1: {}, em: {}".format(current_step, result["f1"], result["em"]))
104 | print("\n")
105 |
106 | if self.config["ckpt_model_path"]:
107 | save_path = self.config["ckpt_model_path"]
108 | if not os.path.exists(save_path):
109 | os.makedirs(save_path)
110 | model_save_path = os.path.join(save_path, self.config["model_name"])
111 | self.model.saver.save(sess, model_save_path, global_step=current_step)
112 |
113 | end = time.time()
114 | print("total train time: ", end - start)
115 |
116 |
117 | if __name__ == "__main__":
118 |     # Parse the command-line arguments
119 | parser = argparse.ArgumentParser()
120 | parser.add_argument("--config_path", help="config path of model")
121 | args = parser.parse_args()
122 | trainer = Trainer(args)
123 | trainer.train()
124 |
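A worked example of the step bookkeeping in this trainer (the numbers are illustrative, not taken from any shipped config): with 10,000 training features, batch_size 8, epochs 2 and warmup_rate 0.1,

    num_train_steps = int(10000 / 8 * 2)   # = 2500 optimizer steps in total
    num_warmup_steps = int(2500 * 0.1)     # = 250 steps of learning-rate warmup

These two counts are passed to the model, whose optimizer uses them for its warmup/decay schedule.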
--------------------------------------------------------------------------------
/bert_task/machine_reading_task/trainer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 | import time
5 | import collections
6 | import sys
7 |
8 | sys.path.append(os.path.dirname(os.getcwd()))
9 | import tensorflow as tf
10 | from bert import modeling
11 | from model import BertMachineReading
12 | from data_helper import TrainData
13 | from metrics import get_eval, write_predictions
14 |
15 |
16 | class Trainer(object):
17 | def __init__(self, args):
18 | self.args = args
19 | with open(args.config_path, "r") as fr:
20 | self.config = json.load(fr)
21 | self.__bert_checkpoint_path = os.path.join(self.config["bert_model_path"], "bert_model.ckpt")
22 |
23 |         # Load the datasets
24 | self.data_obj = self.load_data()
25 | self.t_features = self.data_obj.gen_data(self.config["train_data"])
26 |
27 | self.e_examples, self.e_features = self.data_obj.gen_data(self.config["eval_data"], is_training=False)
28 | print("train data size: {}".format(len(self.t_features)))
29 | print("eval data size: {}".format(len(self.e_features)))
30 |
31 | num_train_steps = int(
32 | len(self.t_features) / self.config["batch_size"] * self.config["epochs"])
33 | num_warmup_steps = int(num_train_steps * self.config["warmup_rate"])
34 |         # Initialize the model object
35 | self.model = self.create_model(num_train_steps, num_warmup_steps)
36 |
37 | def load_data(self):
38 | """
39 |         Create the data object.
40 |         :return: a TrainData instance
41 |         """
42 |         # Build the data object used to generate the training data
43 | data_obj = TrainData(self.config)
44 | return data_obj
45 |
46 | def create_model(self, num_train_step, num_warmup_step):
47 | """
48 |         Pick the model specified by the config file and initialize it.
49 |         :return: the initialized model
50 | """
51 | model = BertMachineReading(config=self.config, num_train_step=num_train_step, num_warmup_step=num_warmup_step)
52 | return model
53 |
54 | def train(self):
55 | with tf.Session() as sess:
56 | tvars = tf.trainable_variables()
57 | (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
58 | tvars, self.__bert_checkpoint_path)
59 | print("init bert model params")
60 | tf.train.init_from_checkpoint(self.__bert_checkpoint_path, assignment_map)
61 | print("init bert model params done")
62 | sess.run(tf.variables_initializer(tf.global_variables()))
63 |
64 | current_step = 0
65 | start = time.time()
66 | for epoch in range(self.config["epochs"]):
67 | print("----- Epoch {}/{} -----".format(epoch + 1, self.config["epochs"]))
68 |
69 | for batch in self.data_obj.next_batch(self.t_features):
70 | loss, start_logits, end_logits = self.model.train(sess, batch)
71 | # print("start: ", start_logits)
72 | # print("end: ", end_logits)
73 | print("train: step: {}, loss: {}".format(current_step, loss))
74 |
75 | current_step += 1
76 | if self.data_obj and current_step % self.config["checkpoint_every"] == 0:
77 |
78 | all_results = []
79 | for eval_batch in self.data_obj.next_batch(self.e_features, is_training=False):
80 | start_logits, end_logits = self.model.eval(sess, eval_batch)
81 |
82 | for unique_id, start_logit, end_logit in zip(eval_batch["unique_id"],
83 | start_logits,
84 | end_logits):
85 | all_results.append(dict(unique_id=unique_id,
86 | start_logits=start_logit.tolist(),
87 | end_logits=end_logit.tolist()))
88 |                         os.makedirs("output/cmrc2018", exist_ok=True)  # the output dir is hard-coded; create it if missing
89 |                         with open("output/cmrc2018/results.json", "w", encoding="utf8") as fw:
90 |                             json.dump(all_results, fw, indent=4, ensure_ascii=False)
91 |
92 | write_predictions(all_examples=self.e_examples,
93 | all_features=self.e_features,
94 | all_results=all_results,
95 | n_best_size=self.config["n_best_size"],
96 | max_answer_length=self.config["max_answer_length"],
97 | output_prediction_file=self.config["output_predictions_path"],
98 | output_nbest_file=self.config["output_nbest_path"])
99 |
100 | result = get_eval(original_file=self.config["eval_data"],
101 | prediction_file=self.config["output_predictions_path"])
102 |
103 | print("\n")
104 | print("eval: step: {}, f1: {}, em: {}".format(current_step, result["f1"], result["em"]))
105 | print("\n")
106 |
107 | if self.config["ckpt_model_path"]:
108 | save_path = self.config["ckpt_model_path"]
109 | if not os.path.exists(save_path):
110 | os.makedirs(save_path)
111 | model_save_path = os.path.join(save_path, self.config["model_name"])
112 | self.model.saver.save(sess, model_save_path, global_step=current_step)
113 |
114 | end = time.time()
115 | print("total train time: ", end - start)
116 |
117 |
118 | if __name__ == "__main__":
119 |     # Parse the command-line arguments
120 | parser = argparse.ArgumentParser()
121 | parser.add_argument("--config_path", help="config path of model")
122 | args = parser.parse_args()
123 | trainer = Trainer(args)
124 | trainer.train()
125 |
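For reference, each element appended to all_results above (and dumped to results.json) is a plain dict of this shape; unique_id comes from the eval batch, and there is one logit per token position (the values below are made up):

    {"unique_id": 1000001,
     "start_logits": [-7.91, 2.34, 0.58],
     "end_logits": [-8.02, -1.17, 3.40]}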
--------------------------------------------------------------------------------
/bert_task/classifier_task/trainer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 | import time
5 | import sys
6 | sys.path.append(os.path.dirname(os.getcwd()))
7 | import tensorflow as tf
8 | from bert import modeling
9 | from model import BertClassifier
10 | from data_helper import TrainData
11 | from metrics import mean, get_multi_metrics
12 |
13 |
14 | class Trainer(object):
15 | def __init__(self, args):
16 | self.args = args
17 | with open(args.config_path, "r") as fr:
18 | self.config = json.load(fr)
19 | self.__bert_checkpoint_path = os.path.join(self.config["bert_model_path"], "bert_model.ckpt")
20 |
21 |         # Load the datasets
22 | self.data_obj = self.load_data()
23 | self.t_in_ids, self.t_in_masks, self.t_seg_ids, self.t_lab_ids, lab_to_idx = self.data_obj.gen_data(
24 | self.config["train_data"])
25 |
26 | self.e_in_ids, self.e_in_masks, self.e_seg_ids, self.e_lab_ids, lab_to_idx = self.data_obj.gen_data(
27 | self.config["eval_data"], is_training=False)
28 | print("train data size: {}".format(len(self.t_lab_ids)))
29 | print("eval data size: {}".format(len(self.e_lab_ids)))
30 |         self.label_list = list(lab_to_idx.values())
31 | print("label numbers: ", len(self.label_list))
32 |
33 | num_train_steps = int(
34 | len(self.t_lab_ids) / self.config["batch_size"] * self.config["epochs"])
35 | num_warmup_steps = int(num_train_steps * self.config["warmup_rate"])
36 |         # Initialize the model object
37 | self.model = self.create_model(num_train_steps, num_warmup_steps)
38 |
39 | def load_data(self):
40 | """
41 |         Create the data object.
42 |         :return: a TrainData instance
43 |         """
44 |         # Build the data object used to generate the training data
45 | data_obj = TrainData(self.config)
46 | return data_obj
47 |
48 | def create_model(self, num_train_step, num_warmup_step):
49 | """
50 |         Pick the model specified by the config file and initialize it.
51 |         :return: the initialized model
52 | """
53 | model = BertClassifier(config=self.config, num_train_step=num_train_step, num_warmup_step=num_warmup_step)
54 | return model
55 |
56 | def train(self):
57 | with tf.Session() as sess:
58 | tvars = tf.trainable_variables()
59 | (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
60 | tvars, self.__bert_checkpoint_path)
61 | print("init bert model params")
62 | tf.train.init_from_checkpoint(self.__bert_checkpoint_path, assignment_map)
63 | print("init bert model params done")
64 | sess.run(tf.variables_initializer(tf.global_variables()))
65 |
66 | current_step = 0
67 | start = time.time()
68 | for epoch in range(self.config["epochs"]):
69 | print("----- Epoch {}/{} -----".format(epoch + 1, self.config["epochs"]))
70 |
71 | for batch in self.data_obj.next_batch(self.t_in_ids, self.t_in_masks, self.t_seg_ids, self.t_lab_ids):
72 | loss, predictions = self.model.train(sess, batch)
73 |
74 | acc, recall, prec, f_beta = get_multi_metrics(pred_y=predictions, true_y=batch["label_ids"],
75 | labels=self.label_list)
76 | print("train: step: {}, loss: {}, acc: {}, recall: {}, precision: {}, f_beta: {}".format(
77 | current_step, loss, acc, recall, prec, f_beta))
78 |
79 | current_step += 1
80 | if self.data_obj and current_step % self.config["checkpoint_every"] == 0:
81 |
82 | eval_losses = []
83 | eval_accs = []
84 |                         # note: AUC is never computed per batch, so it is left out of the eval log below
85 | eval_recalls = []
86 | eval_precs = []
87 | eval_f_betas = []
88 | for eval_batch in self.data_obj.next_batch(self.e_in_ids, self.e_in_masks,
89 | self.e_seg_ids, self.e_lab_ids):
90 | eval_loss, eval_predictions = self.model.eval(sess, eval_batch)
91 |
92 | eval_losses.append(eval_loss)
93 |
94 | acc, recall, prec, f_beta = get_multi_metrics(pred_y=eval_predictions,
95 | true_y=eval_batch["label_ids"],
96 | labels=self.label_list)
97 | eval_accs.append(acc)
98 | eval_recalls.append(recall)
99 | eval_precs.append(prec)
100 | eval_f_betas.append(f_beta)
101 | print("\n")
102 |                         print("eval: loss: {}, acc: {}, recall: {}, precision: {}, f_beta: {}".format(
103 |                             mean(eval_losses), mean(eval_accs), mean(eval_recalls),
104 | mean(eval_precs), mean(eval_f_betas)))
105 | print("\n")
106 |
107 | if self.config["ckpt_model_path"]:
108 | save_path = self.config["ckpt_model_path"]
109 | if not os.path.exists(save_path):
110 | os.makedirs(save_path)
111 | model_save_path = os.path.join(save_path, self.config["model_name"])
112 | self.model.saver.save(sess, model_save_path, global_step=current_step)
113 |
114 | end = time.time()
115 | print("total train time: ", end - start)
116 |
117 |
118 | if __name__ == "__main__":
119 | # 读取用户在命令行输入的信息
120 | parser = argparse.ArgumentParser()
121 | parser.add_argument("--config_path", help="config path of model")
122 | args = parser.parse_args()
123 | trainer = Trainer(args)
124 | trainer.train()
125 |
126 |
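The keys this trainer reads from the JSON file passed via --config_path, collected from the code above (the values are placeholders, not the shipped inews/tnews/thucnews configs; data_helper.TrainData and the model read additional keys not shown here):

    {
        "bert_model_path": "path/to/chinese_bert_base",
        "train_data": "data/train.txt",
        "eval_data": "data/eval.txt",
        "batch_size": 32,
        "epochs": 5,
        "warmup_rate": 0.1,
        "checkpoint_every": 100,
        "ckpt_model_path": "ckpt/classifier",
        "model_name": "bert_classifier"
    }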
--------------------------------------------------------------------------------
/albert_task/sentence_pair_task/trainer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 | import sys
5 | import time
6 |
7 | sys.path.append(os.path.dirname(os.getcwd()))
8 | import tensorflow as tf
9 | from albert import modeling
10 | from model import AlbertSentencePair
11 | from data_helper import TrainData
12 | from metrics import mean, get_multi_metrics
13 |
14 |
15 | class Trainer(object):
16 | def __init__(self, args):
17 | self.args = args
18 | with open(args.config_path, "r") as fr:
19 | self.config = json.load(fr)
20 | self.__bert_checkpoint_path = os.path.join(self.config["bert_model_path"], "albert_model.ckpt")
21 |
22 |         # Load the datasets
23 | self.data_obj = self.load_data()
24 | self.t_in_ids, self.t_in_masks, self.t_seg_ids, self.t_lab_ids, lab_to_idx = self.data_obj.gen_data(
25 | self.config["train_data"])
26 |
27 | self.e_in_ids, self.e_in_masks, self.e_seg_ids, self.e_lab_ids, lab_to_idx = self.data_obj.gen_data(
28 | self.config["eval_data"], is_training=False)
29 | print("train data size: {}".format(len(self.t_lab_ids)))
30 | print("eval data size: {}".format(len(self.e_lab_ids)))
31 |         self.label_list = list(lab_to_idx.values())
32 | print("label numbers: ", len(self.label_list))
33 |
34 | num_train_steps = int(
35 | len(self.t_lab_ids) / self.config["batch_size"] * self.config["epochs"])
36 | num_warmup_steps = int(num_train_steps * self.config["warmup_rate"])
37 |         # Initialize the model object
38 | self.model = self.create_model(num_train_steps, num_warmup_steps)
39 |
40 | def load_data(self):
41 | """
42 |         Create the data object.
43 |         :return: a TrainData instance
44 |         """
45 |         # Build the data object used to generate the training data
46 | data_obj = TrainData(self.config)
47 | return data_obj
48 |
49 | def create_model(self, num_train_step, num_warmup_step):
50 | """
51 |         Pick the model specified by the config file and initialize it.
52 |         :return: the initialized model
53 | """
54 | model = AlbertSentencePair(config=self.config, num_train_step=num_train_step, num_warmup_step=num_warmup_step)
55 | return model
56 |
57 | def train(self):
58 | with tf.Session() as sess:
59 | tvars = tf.trainable_variables()
60 | (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
61 | tvars, self.__bert_checkpoint_path)
62 | print("init bert model params")
63 | tf.train.init_from_checkpoint(self.__bert_checkpoint_path, assignment_map)
64 | print("init bert model params done")
65 | sess.run(tf.variables_initializer(tf.global_variables()))
66 |
67 | current_step = 0
68 | start = time.time()
69 | for epoch in range(self.config["epochs"]):
70 | print("----- Epoch {}/{} -----".format(epoch + 1, self.config["epochs"]))
71 |
72 | for batch in self.data_obj.next_batch(self.t_in_ids, self.t_in_masks, self.t_seg_ids, self.t_lab_ids):
73 | loss, predictions = self.model.train(sess, batch)
74 |
75 | acc, recall, prec, f_beta = get_multi_metrics(pred_y=predictions, true_y=batch["label_ids"],
76 | labels=self.label_list)
77 | print("train: step: {}, loss: {}, acc: {}, recall: {}, precision: {}, f_beta: {}".format(
78 | current_step, loss, acc, recall, prec, f_beta))
79 |
80 | current_step += 1
81 | if self.data_obj and current_step % self.config["checkpoint_every"] == 0:
82 |
83 | eval_losses = []
84 | eval_accs = []
85 |                         # note: AUC is never computed per batch, so it is left out of the eval log below
86 | eval_recalls = []
87 | eval_precs = []
88 | eval_f_betas = []
89 | for eval_batch in self.data_obj.next_batch(self.e_in_ids, self.e_in_masks,
90 | self.e_seg_ids, self.e_lab_ids):
91 | eval_loss, eval_predictions = self.model.eval(sess, eval_batch)
92 |
93 | eval_losses.append(eval_loss)
94 |
95 | acc, recall, prec, f_beta = get_multi_metrics(pred_y=eval_predictions,
96 | true_y=eval_batch["label_ids"],
97 | labels=self.label_list)
98 | eval_accs.append(acc)
99 | eval_recalls.append(recall)
100 | eval_precs.append(prec)
101 | eval_f_betas.append(f_beta)
102 | print("\n")
103 |                         print("eval: loss: {}, acc: {}, recall: {}, precision: {}, f_beta: {}".format(
104 |                             mean(eval_losses), mean(eval_accs), mean(eval_recalls),
105 | mean(eval_precs), mean(eval_f_betas)))
106 | print("\n")
107 |
108 | if self.config["ckpt_model_path"]:
109 | save_path = self.config["ckpt_model_path"]
110 | if not os.path.exists(save_path):
111 | os.makedirs(save_path)
112 | model_save_path = os.path.join(save_path, self.config["model_name"])
113 | self.model.saver.save(sess, model_save_path, global_step=current_step)
114 | end = time.time()
115 | print("total train time: ", end - start)
116 |
117 |
118 | if __name__ == "__main__":
119 |     # Parse the command-line arguments
120 | parser = argparse.ArgumentParser()
121 | parser.add_argument("--config_path", help="config path of model")
122 | args = parser.parse_args()
123 | trainer = Trainer(args)
124 | trainer.train()
125 |
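A note on the initialization pattern shared by every trainer in this dump: tf.train.init_from_checkpoint does not copy weights immediately; it overrides the initializers of the variables named in assignment_map, so the sess.run(tf.variables_initializer(...)) call that follows is what actually loads the pretrained weights (variables absent from the map keep their default initializers). Running the initializer first and calling init_from_checkpoint afterwards would silently train from random weights. The pattern reduced to its core:

    (assignment_map, _) = modeling.get_assignment_map_from_checkpoint(tvars, ckpt_path)
    tf.train.init_from_checkpoint(ckpt_path, assignment_map)   # rewrites initializers only
    sess.run(tf.variables_initializer(tf.global_variables()))  # materializes the weights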
--------------------------------------------------------------------------------
/bert_task/sentence_pair_task/trainer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 | import time
5 | import sys
6 |
7 | sys.path.append(os.path.dirname(os.getcwd()))
8 | import tensorflow as tf
9 | from bert import modeling
10 | from model import BertSentencePair
11 | from data_helper import TrainData
12 | from metrics import mean, get_multi_metrics
13 |
14 |
15 | class Trainer(object):
16 | def __init__(self, args):
17 | self.args = args
18 | with open(args.config_path, "r") as fr:
19 | self.config = json.load(fr)
20 | self.__bert_checkpoint_path = os.path.join(self.config["bert_model_path"], "bert_model.ckpt")
21 |
22 |         # Load the datasets
23 | self.data_obj = self.load_data()
24 | self.t_in_ids, self.t_in_masks, self.t_seg_ids, self.t_lab_ids, lab_to_idx = self.data_obj.gen_data(
25 | self.config["train_data"])
26 |
27 | self.e_in_ids, self.e_in_masks, self.e_seg_ids, self.e_lab_ids, lab_to_idx = self.data_obj.gen_data(
28 | self.config["eval_data"], is_training=False)
29 | print("train data size: {}".format(len(self.t_lab_ids)))
30 | print("eval data size: {}".format(len(self.e_lab_ids)))
31 |         self.label_list = list(lab_to_idx.values())
32 | print("label numbers: ", len(self.label_list))
33 |
34 | num_train_steps = int(
35 | len(self.t_lab_ids) / self.config["batch_size"] * self.config["epochs"])
36 | num_warmup_steps = int(num_train_steps * self.config["warmup_rate"])
37 |         # Initialize the model object
38 | self.model = self.create_model(num_train_steps, num_warmup_steps)
39 |
40 | def load_data(self):
41 | """
42 |         Create the data object.
43 |         :return: a TrainData instance
44 |         """
45 |         # Build the data object used to generate the training data
46 | data_obj = TrainData(self.config)
47 | return data_obj
48 |
49 | def create_model(self, num_train_step, num_warmup_step):
50 | """
51 |         Pick the model specified by the config file and initialize it.
52 |         :return: the initialized model
53 | """
54 | model = BertSentencePair(config=self.config, num_train_step=num_train_step, num_warmup_step=num_warmup_step)
55 | return model
56 |
57 | def train(self):
58 | with tf.Session() as sess:
59 | tvars = tf.trainable_variables()
60 | (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
61 | tvars, self.__bert_checkpoint_path)
62 | print("init bert model params")
63 | tf.train.init_from_checkpoint(self.__bert_checkpoint_path, assignment_map)
64 | print("init bert model params done")
65 | sess.run(tf.variables_initializer(tf.global_variables()))
66 |
67 | current_step = 0
68 | start = time.time()
69 | for epoch in range(self.config["epochs"]):
70 | print("----- Epoch {}/{} -----".format(epoch + 1, self.config["epochs"]))
71 |
72 | for batch in self.data_obj.next_batch(self.t_in_ids, self.t_in_masks, self.t_seg_ids, self.t_lab_ids):
73 | loss, predictions = self.model.train(sess, batch)
74 |
75 | acc, recall, prec, f_beta = get_multi_metrics(pred_y=predictions, true_y=batch["label_ids"],
76 | labels=self.label_list)
77 | print("train: step: {}, loss: {}, acc: {}, recall: {}, precision: {}, f_beta: {}".format(
78 | current_step, loss, acc, recall, prec, f_beta))
79 |
80 | current_step += 1
81 | if self.data_obj and current_step % self.config["checkpoint_every"] == 0:
82 |
83 | eval_losses = []
84 | eval_accs = []
85 |                         # note: AUC is never computed per batch, so it is left out of the eval log below
86 | eval_recalls = []
87 | eval_precs = []
88 | eval_f_betas = []
89 | for eval_batch in self.data_obj.next_batch(self.e_in_ids, self.e_in_masks,
90 | self.e_seg_ids, self.e_lab_ids):
91 | eval_loss, eval_predictions = self.model.eval(sess, eval_batch)
92 |
93 | eval_losses.append(eval_loss)
94 |
95 | acc, recall, prec, f_beta = get_multi_metrics(pred_y=eval_predictions,
96 | true_y=eval_batch["label_ids"],
97 | labels=self.label_list)
98 | eval_accs.append(acc)
99 | eval_recalls.append(recall)
100 | eval_precs.append(prec)
101 | eval_f_betas.append(f_beta)
102 | print("\n")
103 |                         print("eval: loss: {}, acc: {}, recall: {}, precision: {}, f_beta: {}".format(
104 |                             mean(eval_losses), mean(eval_accs), mean(eval_recalls),
105 | mean(eval_precs), mean(eval_f_betas)))
106 | print("\n")
107 |
108 | if self.config["ckpt_model_path"]:
109 | save_path = self.config["ckpt_model_path"]
110 | if not os.path.exists(save_path):
111 | os.makedirs(save_path)
112 | model_save_path = os.path.join(save_path, self.config["model_name"])
113 | self.model.saver.save(sess, model_save_path, global_step=current_step)
114 |
115 | end = time.time()
116 | print("total train time: ", end - start)
117 |
118 |
119 | if __name__ == "__main__":
120 |     # Parse the command-line arguments
121 | parser = argparse.ArgumentParser()
122 | parser.add_argument("--config_path", help="config path of model")
123 | args = parser.parse_args()
124 | trainer = Trainer(args)
125 | trainer.train()
126 |
--------------------------------------------------------------------------------
/albert_task/classifier_task/trainer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 | import time
5 | import sys
6 |
7 | sys.path.append(os.path.dirname(os.getcwd()))
8 |
9 | import tensorflow as tf
10 | from albert import modeling
11 | from model import AlbertClassifier
12 | from data_helper import TrainData
13 | from metrics import mean, get_multi_metrics
14 |
15 |
16 | class Trainer(object):
17 | def __init__(self, args):
18 | self.args = args
19 | with open(args.config_path, "r") as fr:
20 | self.config = json.load(fr)
21 | self.__bert_checkpoint_path = os.path.join(self.config["bert_model_path"], "albert_model.ckpt")
22 |
23 |         # Load the datasets
24 | self.data_obj = self.load_data()
25 | self.t_in_ids, self.t_in_masks, self.t_seg_ids, self.t_lab_ids, lab_to_idx = self.data_obj.gen_data(
26 | self.config["train_data"])
27 |
28 | self.e_in_ids, self.e_in_masks, self.e_seg_ids, self.e_lab_ids, lab_to_idx = self.data_obj.gen_data(
29 | self.config["eval_data"], is_training=False)
30 | print("train data size: {}".format(len(self.t_lab_ids)))
31 | print("eval data size: {}".format(len(self.e_lab_ids)))
32 |         self.label_list = list(lab_to_idx.values())
33 | print("label numbers: ", len(self.label_list))
34 |
35 | num_train_steps = int(
36 | len(self.t_lab_ids) / self.config["batch_size"] * self.config["epochs"])
37 | num_warmup_steps = int(num_train_steps * self.config["warmup_rate"])
38 |         # Initialize the model object
39 | self.model = self.create_model(num_train_steps, num_warmup_steps)
40 |
41 | def load_data(self):
42 | """
43 |         Create the data object.
44 |         :return: a TrainData instance
45 |         """
46 |         # Build the data object used to generate the training data
47 | data_obj = TrainData(self.config)
48 | return data_obj
49 |
50 | def create_model(self, num_train_step, num_warmup_step):
51 | """
52 |         Pick the model specified by the config file and initialize it.
53 |         :return: the initialized model
54 | """
55 | model = AlbertClassifier(config=self.config, num_train_step=num_train_step, num_warmup_step=num_warmup_step)
56 | return model
57 |
58 | def train(self):
59 | with tf.Session() as sess:
60 | tvars = tf.trainable_variables()
61 | (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
62 | tvars, self.__bert_checkpoint_path)
63 | print("init bert model params")
64 | tf.train.init_from_checkpoint(self.__bert_checkpoint_path, assignment_map)
65 | print("init bert model params done")
66 | sess.run(tf.variables_initializer(tf.global_variables()))
67 |
68 | current_step = 0
69 | start = time.time()
70 | for epoch in range(self.config["epochs"]):
71 | print("----- Epoch {}/{} -----".format(epoch + 1, self.config["epochs"]))
72 |
73 | for batch in self.data_obj.next_batch(self.t_in_ids, self.t_in_masks, self.t_seg_ids, self.t_lab_ids):
74 | loss, predictions = self.model.train(sess, batch)
75 |
76 | acc, recall, prec, f_beta = get_multi_metrics(pred_y=predictions, true_y=batch["label_ids"],
77 | labels=self.label_list)
78 | print("train: step: {}, loss: {}, acc: {}, recall: {}, precision: {}, f_beta: {}".format(
79 | current_step, loss, acc, recall, prec, f_beta))
80 |
81 | current_step += 1
82 | if self.data_obj and current_step % self.config["checkpoint_every"] == 0:
83 |
84 | eval_losses = []
85 | eval_accs = []
86 |                         # note: AUC is never computed per batch, so it is left out of the eval log below
87 | eval_recalls = []
88 | eval_precs = []
89 | eval_f_betas = []
90 | for eval_batch in self.data_obj.next_batch(self.e_in_ids, self.e_in_masks,
91 | self.e_seg_ids, self.e_lab_ids):
92 | eval_loss, eval_predictions = self.model.eval(sess, eval_batch)
93 |
94 | eval_losses.append(eval_loss)
95 |
96 | acc, recall, prec, f_beta = get_multi_metrics(pred_y=eval_predictions,
97 | true_y=eval_batch["label_ids"],
98 | labels=self.label_list)
99 | eval_accs.append(acc)
100 | eval_recalls.append(recall)
101 | eval_precs.append(prec)
102 | eval_f_betas.append(f_beta)
103 | print("\n")
104 |                         print("eval: loss: {}, acc: {}, recall: {}, precision: {}, f_beta: {}".format(
105 |                             mean(eval_losses), mean(eval_accs), mean(eval_recalls),
106 | mean(eval_precs), mean(eval_f_betas)))
107 | print("\n")
108 |
109 | if self.config["ckpt_model_path"]:
110 | save_path = self.config["ckpt_model_path"]
111 | if not os.path.exists(save_path):
112 | os.makedirs(save_path)
113 | model_save_path = os.path.join(save_path, self.config["model_name"])
114 | self.model.saver.save(sess, model_save_path, global_step=current_step)
115 |
116 | end = time.time()
117 | print("total train time: ", end - start)
118 |
119 |
120 | if __name__ == "__main__":
121 | # 读取用户在命令行输入的信息
122 | parser = argparse.ArgumentParser()
123 | parser.add_argument("--config_path", help="config path of model")
124 | args = parser.parse_args()
125 | trainer = Trainer(args)
126 | trainer.train()
127 |
128 |
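One caveat about the evaluation loop above: it averages metrics computed per batch, which for precision/recall/F-scores on imbalanced label sets can differ from metrics computed once over the whole eval set. A sketch of the corpus-level variant, using only names already defined in this file:

    all_preds, all_trues = [], []
    for eval_batch in self.data_obj.next_batch(self.e_in_ids, self.e_in_masks,
                                               self.e_seg_ids, self.e_lab_ids):
        eval_loss, eval_predictions = self.model.eval(sess, eval_batch)
        eval_losses.append(eval_loss)
        all_preds.extend(eval_predictions)
        all_trues.extend(eval_batch["label_ids"])
    acc, recall, prec, f_beta = get_multi_metrics(pred_y=all_preds, true_y=all_trues,
                                                  labels=self.label_list)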
--------------------------------------------------------------------------------
/bert_task/machine_reading_task/model.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | sys.path.append(os.path.dirname(os.getcwd()))
5 | import tensorflow as tf
6 |
7 | from bert import modeling
8 | from bert import optimization
9 |
10 |
11 | class BertMachineReading(object):
12 | def __init__(self, config, is_training=True, num_train_step=None, num_warmup_step=None):
13 | self.__bert_config_path = os.path.join(config["bert_model_path"], "bert_config.json")
14 |
15 | self.__is_training = is_training
16 | self.__num_train_step = num_train_step
17 | self.__num_warmup_step = num_warmup_step
18 |
19 | self.__max_length = config["max_length"]
20 | self.__learning_rate = config["learning_rate"]
21 |
22 | self.input_ids = tf.placeholder(dtype=tf.int32, shape=[None, self.__max_length], name='input_ids')
23 | self.input_masks = tf.placeholder(dtype=tf.int32, shape=[None, self.__max_length], name='input_mask')
24 | self.segment_ids = tf.placeholder(dtype=tf.int32, shape=[None, self.__max_length], name='segment_ids')
25 | self.start_position = tf.placeholder(dtype=tf.int32, shape=[None], name="start_position")
26 | self.end_position = tf.placeholder(dtype=tf.int32, shape=[None], name="end_position")
27 |
28 | self.built_model()
29 | self.init_saver()
30 |
31 | def built_model(self):
32 | bert_config = modeling.BertConfig.from_json_file(self.__bert_config_path)
33 |
34 | model = modeling.BertModel(config=bert_config,
35 | is_training=self.__is_training,
36 | input_ids=self.input_ids,
37 | input_mask=self.input_masks,
38 | token_type_ids=self.segment_ids,
39 | use_one_hot_embeddings=False)
40 |
41 | final_hidden = model.get_sequence_output()
42 |
43 | final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3)
44 | seq_length = final_hidden_shape[1]
45 | hidden_size = final_hidden_shape[2]
46 |
47 | with tf.name_scope("output"):
48 | output_weights = tf.get_variable(
49 | "output_weights", [2, hidden_size],
50 | initializer=tf.truncated_normal_initializer(stddev=0.02))
51 |
52 | output_bias = tf.get_variable(
53 | "output_bias", [2], initializer=tf.zeros_initializer())
54 |
55 | final_hidden_matrix = tf.reshape(final_hidden,
56 | [-1, hidden_size])
57 | logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True)
58 | logits = tf.nn.bias_add(logits, output_bias)
59 |
60 | logits = tf.reshape(logits, [-1, seq_length, 2])
61 | logits = tf.transpose(logits, [2, 0, 1])
62 |
63 | unstacked_logits = tf.unstack(logits, axis=0)
64 |
65 | # [batch_size, seq_length]
66 | start_logits, end_logits = (unstacked_logits[0], unstacked_logits[1])
67 |
68 | self.start_logits = start_logits
69 | self.end_logits = end_logits
70 |
71 | if self.__is_training:
72 | with tf.name_scope("loss"):
73 | start_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=start_logits,
74 | labels=self.start_position)
75 | end_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=end_logits,
76 | labels=self.end_position)
77 |
78 | losses = tf.concat([start_losses, end_losses], axis=0)
79 | self.loss = tf.reduce_mean(losses, name="loss")
80 |
81 | with tf.name_scope('train_op'):
82 | self.train_op = optimization.create_optimizer(
83 | self.loss, self.__learning_rate, self.__num_train_step, self.__num_warmup_step, use_tpu=False)
84 |
85 | def init_saver(self):
86 | self.saver = tf.train.Saver(tf.global_variables())
87 |
88 | def train(self, sess, batch):
89 | """
90 |         Train the model.
91 |         :param sess: TensorFlow session object
92 |         :param batch: a batch of data
93 |         :return: loss and the predicted start/end logits
94 | """
95 |
96 | feed_dict = {self.input_ids: batch["input_ids"],
97 | self.input_masks: batch["input_masks"],
98 | self.segment_ids: batch["segment_ids"],
99 | self.start_position: batch["start_position"],
100 | self.end_position: batch["end_position"]}
101 |
102 |         # run one optimization step and fetch the loss and logits
103 | _, loss, start_logits, end_logits = sess.run([self.train_op, self.loss, self.start_logits, self.end_logits],
104 | feed_dict=feed_dict)
105 | return loss, start_logits, end_logits
106 |
107 | def eval(self, sess, batch):
108 | """
109 |         Evaluate the model.
110 |         :param sess: TensorFlow session object
111 |         :param batch: a batch of data
112 |         :return: the predicted start/end logits
113 | """
114 | feed_dict = {self.input_ids: batch["input_ids"],
115 | self.input_masks: batch["input_masks"],
116 | self.segment_ids: batch["segment_ids"],
117 | self.start_position: batch["start_position"],
118 | self.end_position: batch["end_position"]}
119 |
120 | start_logits, end_logits = sess.run([self.start_logits, self.end_logits], feed_dict=feed_dict)
121 | return start_logits, end_logits
122 |
123 | def infer(self, sess, batch):
124 | """
125 |         Run inference on new data.
126 |         :param sess: TensorFlow session object
127 |         :param batch: a batch of data
128 |         :return: the predicted start/end logits
129 | """
130 | feed_dict = {self.input_ids: batch["input_ids"],
131 | self.input_masks: batch["input_masks"],
132 | self.segment_ids: batch["segment_ids"]}
133 |
134 | start_logits, end_logits = sess.run([self.start_logits, self.end_logits], feed_dict=feed_dict)
135 |
136 | return start_logits, end_logits
137 |
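The model above returns raw per-token start/end logits; turning them into an answer span happens downstream (write_predictions in metrics.py). A self-contained numpy sketch of the usual decoding rule, which picks the (start, end) pair with the highest combined logit subject to start <= end and a length cap (illustrative, not this repo's exact implementation):

    import numpy as np

    def best_span(start_logits, end_logits, max_answer_length=30):
        # Maximize start_logits[s] + end_logits[e] over s <= e < s + max_answer_length.
        best, best_score = (0, 0), -np.inf
        for s in range(len(start_logits)):
            for e in range(s, min(s + max_answer_length, len(end_logits))):
                score = start_logits[s] + end_logits[e]
                if score > best_score:
                    best_score, best = score, (s, e)
        return best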
--------------------------------------------------------------------------------
/bert_task/ner_task/trainer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 | import time
5 | import sys
6 |
7 | sys.path.append(os.path.dirname(os.getcwd()))
8 | import tensorflow as tf
9 | from bert import modeling
10 | from model import BertNer
11 | from data_helper import TrainData
12 | from metrics import mean, gen_metrics
13 |
14 |
15 | class Trainer(object):
16 | def __init__(self, args):
17 | self.args = args
18 | with open(args.config_path, "r") as fr:
19 | self.config = json.load(fr)
20 | self.__bert_checkpoint_path = os.path.join(self.config["bert_model_path"], "bert_model.ckpt")
21 |
22 |         # Load the datasets
23 | self.data_obj = self.load_data()
24 | self.t_in_ids, self.t_in_masks, self.t_seg_ids, self.t_lab_ids, self.t_seq_len, self.lab_to_idx = \
25 | self.data_obj.gen_data(self.config["train_data"])
26 |
27 | self.e_in_ids, self.e_in_masks, self.e_seg_ids, self.e_lab_ids, self.e_seq_len, self.lab_to_idx = \
28 | self.data_obj.gen_data(self.config["eval_data"], is_training=False)
29 |
30 | print("train data size: {}".format(len(self.t_lab_ids)))
31 | print("eval data size: {}".format(len(self.e_lab_ids)))
32 |
33 | num_train_steps = int(
34 | len(self.t_lab_ids) / self.config["batch_size"] * self.config["epochs"])
35 | num_warmup_steps = int(num_train_steps * self.config["warmup_rate"])
36 |         # Initialize the model object
37 | self.model = self.create_model(num_train_steps, num_warmup_steps)
38 |
39 | def load_data(self):
40 | """
41 |         Create the data object.
42 |         :return: a TrainData instance
43 |         """
44 |         # Build the data object used to generate the training data
45 | data_obj = TrainData(self.config)
46 | return data_obj
47 |
48 | def create_model(self, num_train_step, num_warmup_step):
49 | """
50 |         Pick the model specified by the config file and initialize it.
51 |         :return: the initialized model
52 | """
53 | model = BertNer(config=self.config, num_train_step=num_train_step, num_warmup_step=num_warmup_step)
54 | return model
55 |
56 | def train(self):
57 | with tf.Session() as sess:
58 | tvars = tf.trainable_variables()
59 | (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
60 | tvars, self.__bert_checkpoint_path)
61 | print("init bert model params")
62 | tf.train.init_from_checkpoint(self.__bert_checkpoint_path, assignment_map)
63 | print("init bert model params done")
64 | sess.run(tf.variables_initializer(tf.global_variables()))
65 |
66 | current_step = 0
67 | start = time.time()
68 | for epoch in range(self.config["epochs"]):
69 | print("----- Epoch {}/{} -----".format(epoch + 1, self.config["epochs"]))
70 |
71 | for batch in self.data_obj.next_batch(self.t_in_ids,
72 | self.t_in_masks,
73 | self.t_seg_ids,
74 | self.t_lab_ids,
75 | self.t_seq_len):
76 |
77 | loss, true_y, predictions = self.model.train(sess, batch, self.config["keep_prob"])
78 |
79 | f1, precision, recall = gen_metrics(pred_y=predictions, true_y=true_y,
80 | label_to_index=self.lab_to_idx)
81 | print("train: step: {}, loss: {}, recall: {}, precision: {}, f1: {}".format(
82 | current_step, loss, recall, precision, f1))
83 |
84 | current_step += 1
85 | if self.data_obj and current_step % self.config["checkpoint_every"] == 0:
86 |
87 | eval_losses = []
88 | eval_recalls = []
89 | eval_precisions = []
90 | eval_f1s = []
91 | for eval_batch in self.data_obj.next_batch(self.e_in_ids,
92 | self.e_in_masks,
93 | self.e_seg_ids,
94 | self.e_lab_ids,
95 | self.e_seq_len):
96 | eval_loss, eval_true_y, eval_predictions = self.model.eval(sess, eval_batch)
97 |
98 | eval_losses.append(eval_loss)
99 |
100 | f1, precision, recall = gen_metrics(pred_y=eval_predictions,
101 | true_y=eval_true_y,
102 | label_to_index=self.lab_to_idx)
103 | eval_recalls.append(recall)
104 | eval_precisions.append(precision)
105 | eval_f1s.append(f1)
106 | print("\n")
107 | print("eval: loss: {}, recall: {}, precision: {}, f1: {}".format(
108 | mean(eval_losses), mean(eval_recalls),
109 | mean(eval_precisions), mean(eval_f1s)))
110 | print("\n")
111 |
112 | if self.config["ckpt_model_path"]:
113 | save_path = self.config["ckpt_model_path"]
114 | if not os.path.exists(save_path):
115 | os.makedirs(save_path)
116 | model_save_path = os.path.join(save_path, self.config["model_name"])
117 | self.model.saver.save(sess, model_save_path, global_step=current_step)
118 |
119 | end = time.time()
120 | print("total train time: ", end - start)
121 |
122 |
123 | if __name__ == "__main__":
124 |     # Parse the command-line arguments
125 | parser = argparse.ArgumentParser()
126 | parser.add_argument("--config_path", help="config path of model")
127 | args = parser.parse_args()
128 | trainer = Trainer(args)
129 | trainer.train()
130 |
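Every task directory in this repo exposes the same entry point, so training is launched the same way everywhere (the config file name here is illustrative; see each task's config/ directory and run.sh):

    python trainer.py --config_path=config/msraner_config.json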
--------------------------------------------------------------------------------