├── configs ├── qa │ ├── coqa.yml │ ├── squad.yml │ └── dureader_yesno.yml ├── 2020_LSTC │ ├── DuEE_keyphrase.yml │ ├── DuEE_trigger_as_classifier.yml │ ├── DuEE_role.yml │ └── DuEE_trigger.yml ├── glue_zh │ ├── csl.yml │ ├── afqmc.yml │ ├── iflytek.yml │ ├── cmnli.yml │ ├── wsc.yml │ ├── tnews_k_fold.yml │ └── tnews.yml ├── custom │ ├── ewn.yml │ ├── gov_title_trigger.yml │ ├── idiom_generator.yml │ └── gov_title_role.yml ├── pretrain │ ├── gpt.yml │ ├── xlnet.yml │ ├── distilbert_huggingface.yml │ └── xlnet_chinese.yml └── generation │ └── text_generation.yml ├── docs ├── requirements.txt ├── resource │ └── imgs │ │ ├── lr_find.jpg │ │ ├── aispace_logo.png │ │ ├── aispace_framework.png │ │ └── aispace_logo_name.png ├── source │ ├── dataset.md │ ├── model.md │ ├── configuration.md │ ├── index.rst │ ├── conf.py │ └── quickstart.md ├── Makefile └── make.bat ├── aispace ├── __init__.py ├── layers │ ├── pretrained │ │ ├── gpt.py │ │ ├── xlm.py │ │ ├── ctrl.py │ │ ├── roberta.py │ │ ├── distilbert.py │ │ ├── transformer_xl.py │ │ └── __init__.py │ ├── fusions │ │ ├── __init__.py │ │ └── feed_forward_add_and_norm.py │ ├── qa_layers │ │ ├── __init__.py │ │ └── qa_simple.py │ ├── decoders │ │ ├── __init__.py │ │ └── crf.py │ ├── encoders │ │ ├── __init__.py │ │ └── rnn.py │ ├── attentions │ │ └── __init__.py │ ├── embeddings │ │ ├── __init__.py │ │ └── sharded_embedding.py │ ├── callbacks │ │ ├── print_lr.py │ │ ├── __init__.py │ │ └── lr_finder.py │ ├── base_layer.py │ ├── __init__.py │ ├── adapters │ │ └── __init__.py │ ├── losses │ │ ├── cross_entropy_loss.py │ │ ├── __init__.py │ │ └── focal_loss.py │ ├── metrics │ │ ├── __init__.py │ │ ├── f1_score.py │ │ ├── precision.py │ │ └── recall.py │ ├── activations.py │ └── optimizers │ │ ├── __init__.py │ │ ├── optimizer_wrapper.py │ │ └── lr_multiplier.py ├── utils │ ├── __init__.py │ ├── logger.py │ ├── tfrecord_utils.py │ ├── misc.py │ ├── print_utils.py │ ├── timer.py │ ├── file_utils.py │ ├── checkpoint_utils.py │ └── math_utils.py ├── models │ ├── question_answer │ │ ├── __init__.py │ │ ├── bento_services │ │ │ └── __init__.py │ │ └── bert_for_qa.py │ ├── generation │ │ ├── __init__.py │ │ └── bert_for_generation.py │ ├── pretrained │ │ ├── __init__.py │ │ └── bert_for_pretraining.py │ ├── classifications │ │ ├── bento_services │ │ │ └── __init__.py │ │ ├── __init__.py │ │ ├── textcnn_for_sequence_classification.py │ │ └── bert_for_sequence_classification.py │ ├── __init__.py │ ├── info_extract │ │ ├── __init__.py │ │ └── bento_services │ │ │ └── __init__.py │ ├── base_model.py │ └── base_pretrained.py ├── datasets │ ├── url_checksums │ │ ├── dureader.txt │ │ ├── lstc_2020.txt │ │ └── glue_zh.txt │ ├── tokenizer │ │ ├── __init__.py │ │ └── tokenizer_base.py │ ├── data_transformers │ │ ├── __init__.py │ │ ├── base_transformer.py │ │ ├── ewn_transformer.py │ │ └── idiom_transformer.py │ └── __init__.py └── constants.py ├── scrpts ├── stop_deploy.sh ├── stop_deploy_with_port.sh ├── start_training_csl.sh ├── start_training_wsc.sh ├── start_training_afqmc.sh ├── start_training_cmnli.sh ├── start_training_iflytek.sh ├── start_training_tnews.sh ├── start_training_drcd.sh ├── start_training_cmrc2018.sh ├── start_training_duee_trigger.sh ├── start_training_duee_role.sh ├── start_training_gov_title_trigger.sh ├── start_training_gov_title_role.sh ├── start_training_idiom_generator.sh ├── start_training_duee_keyphrase.sh ├── start_training_ewn.sh ├── start_training_dureader_yesno.sh ├── start_training_dureader_robust.sh ├── 
start_training_duee_role_as_qa.sh └── start_deploy_duee.sh ├── tests ├── dataset │ └── tokenizer │ │ ├── test_bert_tokenizer.py │ │ ├── test_gpt2_tokenizer.py │ │ └── test_xlnet_tokenizer.py ├── utils │ ├── test_io_utils.py │ └── test_eval_utils.py ├── test_k_fold_training.py ├── layers │ ├── layers │ │ ├── adapters │ │ │ └── test_gpt2_adapter.py │ │ └── pretrained │ │ │ ├── test_nezha.py │ │ │ ├── test_electra.py │ │ │ └── test_gpt2.py │ └── losses │ │ └── dice_loss.py ├── models │ └── generation │ │ └── test_bert_for_text_generation.py ├── test_hannlp.py ├── test_glue_zh.py ├── test_gov_title.py ├── test_dureader.py ├── test_idiom.py └── test_2020_lstc.py ├── .travis.yml ├── .readthedocs.yml └── .gitignore /configs/qa/coqa.yml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /configs/qa/squad.yml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | sphinx_rtd_theme 3 | recommonmark 4 | sphinx_markdown_tables -------------------------------------------------------------------------------- /docs/resource/imgs/lr_find.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yingyuankai/AiSpace/HEAD/docs/resource/imgs/lr_find.jpg -------------------------------------------------------------------------------- /docs/resource/imgs/aispace_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yingyuankai/AiSpace/HEAD/docs/resource/imgs/aispace_logo.png -------------------------------------------------------------------------------- /docs/resource/imgs/aispace_framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yingyuankai/AiSpace/HEAD/docs/resource/imgs/aispace_framework.png -------------------------------------------------------------------------------- /docs/resource/imgs/aispace_logo_name.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yingyuankai/AiSpace/HEAD/docs/resource/imgs/aispace_logo_name.png -------------------------------------------------------------------------------- /aispace/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding=utf-8 3 | # @Time : 2019-07-27 22:57 4 | # @Author : yingyuankai@aliyun.com 5 | # @File : __init__.py.py -------------------------------------------------------------------------------- /aispace/layers/pretrained/gpt.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-11-29 14:20 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : gpt.py -------------------------------------------------------------------------------- /aispace/layers/pretrained/xlm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-11-29 14:21 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : xlm.py 
-------------------------------------------------------------------------------- /aispace/layers/pretrained/ctrl.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-11-29 14:19 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : ctrl.py -------------------------------------------------------------------------------- /aispace/layers/pretrained/roberta.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-11-05 20:21 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : roberta.py -------------------------------------------------------------------------------- /aispace/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-12-04 11:19 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : __init__.py 6 | 7 | -------------------------------------------------------------------------------- /aispace/layers/pretrained/distilbert.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-11-29 14:19 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : distilbert.py -------------------------------------------------------------------------------- /aispace/layers/pretrained/transformer_xl.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-11-29 14:21 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : transformer_xl.py -------------------------------------------------------------------------------- /aispace/models/question_answer/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding=utf-8 3 | # @Time : 2020/4/25 18:07 4 | # @Author : yingyuankai@aliyun.com 5 | # @File : __init__.py 6 | 7 | from .bert_for_qa import BertForQA -------------------------------------------------------------------------------- /aispace/models/generation/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 1/12/21 3:00 PM 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : __init__.py 6 | 7 | from .bert_for_generation import * -------------------------------------------------------------------------------- /aispace/layers/fusions/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-11-15 15:42 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : __init__.py 6 | 7 | from .feed_forward_add_and_norm import FeedForwardAddAndNorm -------------------------------------------------------------------------------- /aispace/layers/qa_layers/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-07-09 10:16 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : __init__.py 6 | 7 | from .qa_with_impossible import * 8 | from .qa_simple import * -------------------------------------------------------------------------------- /aispace/layers/decoders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-11-15 11:15 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : __init__.py 6 | 7 | from .crf import CRFLayer 8 | from .pooler import SequenceSummary -------------------------------------------------------------------------------- /aispace/layers/encoders/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-11-15 15:44 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : __init__.py 6 | 7 | from .transformer import * 8 | from .rnn import * 9 | from .cnn import * -------------------------------------------------------------------------------- /aispace/layers/attentions/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding=utf-8 3 | # @Time : 2019-07-27 22:38 4 | # @Author : yingyuankai@aliyun.com 5 | # @File : __init__.py 6 | 7 | from .multi_head_attention import MultiHeadAttention 8 | from .attention_family import * -------------------------------------------------------------------------------- /scrpts/stop_deploy.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | sp_pid=`ps -ef | grep 'bentoml serve' | grep -v grep | awk '{print $2}'` 3 | 4 | if [ -z "$sp_pid" ]; 5 | then 6 | echo "[ not find sp-tomcat pid ]" 7 | else 8 | echo "find result: $sp_pid " 9 | kill -9 $sp_pid 10 | fi -------------------------------------------------------------------------------- /scrpts/stop_deploy_with_port.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | sp_pid=`ps -ef | grep $1 | grep -v grep | awk '{print $2}'` 3 | 4 | if [ -z "$sp_pid" ]; 5 | then 6 | echo "[ not find sp-tomcat pid ]" 7 | else 8 | echo "find result: $sp_pid " 9 | kill -9 $sp_pid 10 | fi -------------------------------------------------------------------------------- /aispace/models/pretrained/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-11-10 16:48 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : __init__.py 6 | 7 | 8 | from aispace.models.pretrained.bert_for_pretraining import BertForPreTraining 9 | -------------------------------------------------------------------------------- /tests/dataset/tokenizer/test_bert_tokenizer.py: -------------------------------------------------------------------------------- 1 | # import unittest 2 | # 3 | # 4 | # class TestBertTokenizer(unittest.TestCase): 5 | # def test_vocab_from_(self): 6 | # self.assertEqual(True, False) 7 | # 8 | # 9 | # if __name__ == '__main__': 10 | # unittest.main() 11 | -------------------------------------------------------------------------------- /aispace/models/classifications/bento_services/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-12-03 20:43 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : __init__.py 6 | 7 | 8 | from .bert_for_seq_classification_service import BertTextClassificationService -------------------------------------------------------------------------------- /aispace/models/question_answer/bento_services/__init__.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-09-22 18:55 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : __init__.py 6 | 7 | from .bert_for_qa_service import * 8 | from .bert_for_qa_with_impossible_service import * -------------------------------------------------------------------------------- /aispace/models/classifications/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-11-10 16:48 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : __init__.py 6 | 7 | 8 | from .bert_for_sequence_classification import * 9 | from .textcnn_for_sequence_classification import * -------------------------------------------------------------------------------- /aispace/datasets/url_checksums/dureader.txt: -------------------------------------------------------------------------------- 1 | https://dataset-bj.cdn.bcebos.com/qianyan/dureader_robust-data.tar.gz 20518631 99bed9ced8995df1c89b9789f890c27a13b4650a56b4d973907cc28da8bd9f0f 2 | https://dataset-bj.cdn.bcebos.com/qianyan/dureader_yesno-data.tar.gz 135450581 c6fdbc76771e0eb3c36de78017da64a1097992536d204e98eb5d7e32abe1c44d 3 | -------------------------------------------------------------------------------- /aispace/layers/embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding=utf-8 3 | # @Time : 2019-07-27 22:26 4 | # @Author : yingyuankai@aliyun.com 5 | # @File : __init__.py 6 | 7 | from .albert_embedding import * 8 | from .bert_embedding import * 9 | from .sharded_embedding import * 10 | from .electra_embedding import * -------------------------------------------------------------------------------- /aispace/layers/pretrained/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-11-05 17:13 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : __init__.py.py 6 | 7 | 8 | from .bert import * 9 | from .albert import * 10 | from .xlnet import * 11 | from .electra import * 12 | from .gpt2 import * -------------------------------------------------------------------------------- /scrpts/start_training_csl.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | nohup python -u aispace/trainer.py \ 5 | --experiment_name test \ 6 | --model_name bert_for_classification \ 7 | --schedule train_and_eval \ 8 | --config_name csl \ 9 | --config_dir ./configs/glue_zh \ 10 | --gpus 0 1 \ 11 | > csl_err.log 2>&1 & -------------------------------------------------------------------------------- /scrpts/start_training_wsc.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | nohup python -u aispace/trainer.py \ 5 | --experiment_name test \ 6 | --model_name bert_for_relation_extract \ 7 | --schedule train_and_eval \ 8 | --config_name wsc \ 9 | --config_dir ./configs/glue_zh \ 10 | --gpus 0 1 \ 11 | > wsc_err.log 2>&1 & -------------------------------------------------------------------------------- /scrpts/start_training_afqmc.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export 
CUDA_VISIBLE_DEVICES=0,1 4 | nohup python -u aispace/trainer.py \ 5 | --experiment_name test \ 6 | --model_name bert_for_classification \ 7 | --schedule train_and_eval \ 8 | --config_name afqmc \ 9 | --config_dir ./configs/glue_zh \ 10 | --gpus 0 1 \ 11 | > afqmc_err.log 2>&1 & -------------------------------------------------------------------------------- /scrpts/start_training_cmnli.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export CUDA_VISIBLE_DEVICES=1,2 4 | nohup python -u aispace/trainer.py \ 5 | --experiment_name test \ 6 | --model_name bert_for_classification \ 7 | --schedule train_and_eval \ 8 | --config_name cmnli \ 9 | --config_dir ./configs/glue_zh \ 10 | --gpus 0 1 \ 11 | > cmnli_err.log 2>&1 & -------------------------------------------------------------------------------- /scrpts/start_training_iflytek.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export CUDA_VISIBLE_DEVICES=1,2 4 | nohup python -u aispace/trainer.py \ 5 | --experiment_name test \ 6 | --model_name bert_for_classification \ 7 | --schedule train_and_eval \ 8 | --config_name iflytek \ 9 | --config_dir ./configs/glue_zh \ 10 | --gpus 0 1 \ 11 | > iflytek_err.log 2>&1 & -------------------------------------------------------------------------------- /aispace/datasets/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-11-10 16:50 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : __init__.py 6 | 7 | from .bert_tokenizer import BertTokenizer 8 | from .tokenizer_base import BaseTokenizer 9 | from .xlnet_tokenizer import XlnetTokenizer 10 | from .gpt_tokenizer import CPMTokenizer -------------------------------------------------------------------------------- /aispace/models/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-11-10 16:49 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : __init__.py 6 | 7 | from .base_model import BaseModel 8 | from .pretrained import * 9 | from .classifications import * 10 | from .info_extract import * 11 | from .question_answer import * 12 | from .generation import * 13 | -------------------------------------------------------------------------------- /aispace/models/info_extract/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-11-11 19:32 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : __init__.py 6 | 7 | 8 | from .bert_for_ner import BertForNer 9 | from .bert_for_role_ner import BertForRoleNer 10 | from .bert_dgcnn_for_ner import BertDgcnnForNer 11 | from .bert_for_relation_extract import BertForRelationExtract -------------------------------------------------------------------------------- /tests/utils/test_io_utils.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | 4 | class TestIoUtils(unittest.TestCase): 5 | def test_load_sentencepiece_vocab(self): 6 | from aispace.utils.io_utils import load_vocab 7 | file_path = "/search/data1/yyk/data/pretrained/xlnet/chinese_xlnet_base/spiece.model" 8 | vocab = load_vocab(file_path) 9 | self.assertIsInstance(vocab, list) 10 | 11 | if __name__ == '__main__': 12 | 
unittest.main() 13 | -------------------------------------------------------------------------------- /aispace/datasets/url_checksums/lstc_2020.txt: -------------------------------------------------------------------------------- 1 | https://dataset-bj.cdn.bcebos.com/event_extraction/dev_data.json.zip 268607 25a9f36ea1ff13df92c5e3ad390829183b8623cf8fa353bc5bf8ca37c93063a1 2 | https://dataset-bj.cdn.bcebos.com/event_extraction/test1_data.json.zip 217842 78721aa4371c62e94e9fe9b748a7797386d47a939ba42c9c1c00a10e8b98f307 3 | https://dataset-bj.cdn.bcebos.com/event_extraction/train_data.json.zip 1684681 bf80c245c6e3fd5349732a0129e87f7f657a3550faf651a0cbb95a6c4e7f14c1 4 | -------------------------------------------------------------------------------- /aispace/models/info_extract/bento_services/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-12-03 20:43 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : __init__.py 6 | 7 | 8 | from .bert_for_ner_service import BertNerService 9 | from .bert_for_role_ner_service import RoleBertNerService 10 | from .bert_for_relation_extract_service import BertRelationClassificationService 11 | from .bert_for_ner_with_title_status_service import BertNerWithTitleStatusService -------------------------------------------------------------------------------- /aispace/datasets/data_transformers/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-01-10 15:41 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : __init__.py 6 | 7 | 8 | from .clue_transformer import * 9 | from .base_transformer import BaseTransformer 10 | from .duee_transformer import DuEETriggerTransformer 11 | from .dureader_transformer import * 12 | from .ewn_transformer import * 13 | from .gov_title_transformer import * 14 | from .idiom_transformer import * -------------------------------------------------------------------------------- /docs/source/dataset.md: -------------------------------------------------------------------------------- 1 | # Dataset 2 | 3 | ## BaseDataset 4 | 5 | The base class ***BaseDataset*** inherits the ***tfds.core.GeneratorBasedBuilder*** and ***Registry***, which makes subclasses registerable. 6 | 7 | ## Custom dataset 8 | 9 | Take the glue_zh dataset as an example as following: 10 | 11 | ```python 12 | @BaseDataset.register("glue_zh") 13 | class GlueZh(BaseDataset): 14 | ... 15 | ``` 16 | The development follows ***[tensorflow_dataset's](https://www.tensorflow.org/datasets)*** specification. 17 | 18 | -------------------------------------------------------------------------------- /aispace/layers/callbacks/print_lr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-11-15 21:09 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : print_lr.py 6 | 7 | import tensorflow as tf 8 | 9 | # Callback for printing the LR at the end of each epoch. 
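# --- Editor's illustrative sketch, not part of the original file -----------
# The commented-out class below references a global `model`; inside a Keras
# callback the attached model is available as `self.model`, so a runnable
# variant (assuming the optimizer exposes `.lr`) could look like:
#
# class PrintLearningRate(tf.keras.callbacks.Callback):
#     def on_epoch_end(self, epoch, logs=None):
#         lr = self.model.optimizer.lr
#         if callable(lr):  # learning-rate schedules are called with the step
#             lr = lr(self.model.optimizer.iterations)
#         print('\nLearning rate for epoch {} is {}'.format(epoch + 1, float(lr)))
# ----------------------------------------------------------------------------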
10 | # class PrintLR(tf.keras.callbacks.Callback): 11 | # def on_epoch_end(self, epoch, logs=None): 12 | # print('\nLearning rate for epoch {} is {}'.format(epoch + 1, 13 | # model.optimizer.lr.numpy())) -------------------------------------------------------------------------------- /aispace/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding=utf-8 3 | # @Time : 2019-07-09 15:19 4 | # @Author : yingyuankai@aliyun.com 5 | # @File : __init__.py 6 | 7 | from aispace.datasets.base_dataset import * 8 | from aispace.datasets.tokenizer import * 9 | from aispace.datasets.data_transformers import * 10 | from aispace.datasets import glue_zh 11 | from aispace.datasets import lstc_2020 12 | from aispace.datasets import dureader 13 | from aispace.datasets import entity_with_nationality 14 | from aispace.datasets import gov_title 15 | from aispace.datasets import idiom 16 | -------------------------------------------------------------------------------- /aispace/datasets/data_transformers/base_transformer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-01-10 15:36 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : base_transformer.py 6 | 7 | from aispace.utils.registry import Registry 8 | 9 | 10 | class BaseTransformer(Registry): 11 | def __init__(self, hparams, **kwargs): 12 | self._hparams = hparams 13 | 14 | def transform(self, *args, **kwargs): 15 | raise NotImplementedError 16 | 17 | def prepare_labels(self, *args, **kwargs) -> dict: 18 | raise NotImplementedError 19 | -------------------------------------------------------------------------------- /tests/test_k_fold_training.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-06-11 20:03 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : test_k_fold_training.py 6 | 7 | 8 | import unittest 9 | 10 | from aispace.utils.hparams import Hparams 11 | from aispace.trainer import k_fold_experiment 12 | 13 | 14 | class TestKFoldTraining(unittest.TestCase): 15 | def test_dataset_split(self): 16 | hparams = Hparams() 17 | hparams.load_from_config_file("../configs/glue_zh/tnews.yml") 18 | hparams.stand_by() 19 | 20 | k_fold_experiment(hparams) 21 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" # current default Python on Travis CI 4 | 5 | cache: pip 6 | 7 | # command to install dependencies 8 | install: 9 | - "pip install --upgrade pip" 10 | - "pip install --ignore-installed setuptools>=41.0.0" 11 | - "pip install --ignore-installed six>=1.12.0" 12 | - "pip install --ignore-installed python-dateutil==2.8.0" 13 | - "pip install --ignore-installed tensorboard>=2.1.0" 14 | - "pip install --ignore-installed tensorflow-estimator>=2.1.0" 15 | - "pip install -r requirements.txt" 16 | 17 | # command to run tests 18 | script: 19 | - pytest 20 | 21 | -------------------------------------------------------------------------------- /aispace/layers/base_layer.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding=utf-8 3 | # @Time : 2019-07-05 10:27 4 | # @Author : yingyuankai@aliyun.com 5 | # @File : base_model.py 6 | 7 | 
__all__ = [ 8 | "BaseLayer" 9 | ] 10 | 11 | from abc import ABCMeta, abstractmethod 12 | import tensorflow as tf 13 | 14 | from aispace.utils.hparams import Hparams 15 | from aispace.utils.registry import Registry 16 | 17 | 18 | class BaseLayer(tf.keras.layers.Layer, Registry): 19 | __metaclass__ = ABCMeta 20 | 21 | def __init__(self, hparams: Hparams, **kwargs): 22 | super(BaseLayer, self).__init__(**kwargs) 23 | self._hparams = hparams -------------------------------------------------------------------------------- /scrpts/start_training_tnews.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export CUDA_VISIBLE_DEVICES=1 4 | nohup python -u aispace/trainer.py \ 5 | --experiment_name test \ 6 | --model_name bert_for_classification \ 7 | --schedule train_and_eval \ 8 | --config_name tnews \ 9 | --config_dir ./configs/glue_zh \ 10 | --gpus 0 \ 11 | > tnew_err.log 2>&1 & 12 | 13 | 14 | #nohup python -u aispace/trainer.py \ 15 | # --experiment_name test \ 16 | # --model_name textcnn_for_classification \ 17 | # --schedule train_and_eval \ 18 | # --config_name tnews_k_fold \ 19 | # --config_dir ./configs/glue_zh \ 20 | # --gpus 0 1 \ 21 | # > err.log 2>&1 & -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/source/conf.py 11 | 12 | # Build documentation with MkDocs 13 | #mkdocs: 14 | # configuration: mkdocs.yml 15 | 16 | # Optionally build your docs in additional formats such as PDF and ePub 17 | formats: all 18 | 19 | # Optionally set the version of Python and requirements required to build your docs 20 | python: 21 | version: 3.6 22 | install: 23 | - requirements: docs/requirements.txt -------------------------------------------------------------------------------- /docs/source/model.md: -------------------------------------------------------------------------------- 1 | # Model 2 | 3 | ## BaseModel 4 | 5 | The base class ***BaseModel*** inherits from ***tf.keras.Model*** and ***Registry***, which makes its subclasses registerable. 6 | It also implements a deploy method that helps generate deployment files. 7 | 8 | ## Custom Models 9 | 10 | Take the bert_for_classification model as an example: 11 | 12 | ```python 13 | @BaseModel.register("bert_for_classification") 14 | class BertForSeqClassification(BaseModel): 15 | ... 16 | ``` 17 | 18 | The model BertForSeqClassification is registered under the name bert_for_classification, 19 | and the implementation of its other methods follows ***tf.keras.Model's*** specification.
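To make the registration contract concrete, below is a minimal, hypothetical sketch (it is not a model shipped with this repository); it assumes only what ***BaseModel*** provides: the constructor receives an ***Hparams*** object, and subclasses implement ***call*** and ***deploy***.

```python
import tensorflow as tf

from aispace.models.base_model import BaseModel
from aispace.utils.hparams import Hparams


@BaseModel.register("toy_for_classification")  # hypothetical registered name
class ToyForClassification(BaseModel):
    def __init__(self, hparams: Hparams, **kwargs):
        super(ToyForClassification, self).__init__(hparams, **kwargs)
        self.classifier = tf.keras.layers.Dense(2)

    def call(self, inputs, training=None, mask=None):
        # Any tf.keras.Model-style forward pass works here.
        return self.classifier(inputs)

    def deploy(self):
        # Real models build a BentoML service here; see the bento_services
        # packages under aispace/models for concrete implementations.
        raise NotImplementedError
```

The registered name is what the training scripts under scrpts/ pass via --model_name.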
-------------------------------------------------------------------------------- /scrpts/start_training_drcd.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1 4 | nohup python -u aispace/trainer.py \ 5 | --experiment_name test \ 6 | --model_name bert_for_qa \ 7 | --schedule train_and_eval \ 8 | --enable_xla False \ 9 | --config_name drcd \ 10 | --config_dir ./configs/glue_zh \ 11 | --gpus 0 1 \ 12 | > drcd_err.log 2>&1 & 13 | 14 | 15 | #nohup python -u aispace/trainer.py \ 16 | # --experiment_name test \ 17 | # --model_name textcnn_for_classification \ 18 | # --schedule train_and_eval \ 19 | # --config_name tnews_k_fold \ 20 | # --config_dir ./configs/glue_zh \ 21 | # --gpus 0 1 \ 22 | # > drcd_err.log 2>&1 & -------------------------------------------------------------------------------- /scrpts/start_training_cmrc2018.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0,1,2,3 4 | nohup python -u aispace/trainer.py \ 5 | --experiment_name test \ 6 | --model_name bert_for_qa \ 7 | --schedule train_and_eval \ 8 | --enable_xla False \ 9 | --config_name cmrc2018 \ 10 | --config_dir ./configs/glue_zh \ 11 | --gpus 0 1 2 3 \ 12 | > cmrc2018_err.log 2>&1 & 13 | 14 | 15 | #nohup python -u aispace/trainer.py \ 16 | # --experiment_name test \ 17 | # --model_name textcnn_for_classification \ 18 | # --schedule train_and_eval \ 19 | # --config_name tnews_k_fold \ 20 | # --config_dir ./configs/glue_zh \ 21 | # --gpus 0 1 \ 22 | # > err.log 2>&1 & -------------------------------------------------------------------------------- /aispace/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding=utf-8 3 | # @Time : 2019-07-09 15:20 4 | # @Author : yingyuankai@aliyun.com 5 | # @File : __init__.py 6 | 7 | from aispace.layers.base_layer import BaseLayer 8 | from aispace.layers import losses 9 | from aispace.layers import metrics 10 | from aispace.layers import optimizers 11 | from aispace.layers import pretrained 12 | from aispace.layers import fusions 13 | from aispace.layers import attentions 14 | from aispace.layers import callbacks 15 | from aispace.layers import encoders 16 | from aispace.layers import decoders 17 | from aispace.layers.activations import ACT2FN 18 | from aispace.layers import adapters 19 | from aispace.layers import qa_layers 20 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /aispace/utils/logger.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding=utf-8 3 | # @Time : 2019-07-03 14:04 4 | # @Author : yingyuankai@aliyun.com 5 | # @File : logger.py 6 | 7 | import os 8 | 9 | 10 | def setup_logging(log_folder, logging_config): 11 | import logging.config 12 | # update logging filename 13 | info_filename = logging_config.handlers.info_file_handler.filename 14 | error_filename = logging_config.handlers.error_file_handler.filename 15 | logging_config.handlers.info_file_handler.filename = os.path.join(log_folder, info_filename) 16 | logging_config.handlers.error_file_handler.filename = os.path.join(log_folder, error_filename) 17 | # setup logging config 18 | logging.config.dictConfig(logging_config) -------------------------------------------------------------------------------- /aispace/utils/tfrecord_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-02-22 16:49 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : tfrecord_utils.py 6 | 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import tensorflow as tf 12 | 13 | 14 | def to_int64_feature(value): 15 | return tf.train.Feature(int64_list=tf.train.Int64List(value=value if isinstance(value, list) else [value])) 16 | 17 | 18 | def to_bytes_feature(value): 19 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) 20 | 21 | 22 | def make_example(feature): 23 | example = tf.train.Example(features=tf.train.Features(feature=feature)) 24 | return example.SerializeToString() 25 | -------------------------------------------------------------------------------- /tests/utils/test_eval_utils.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding=utf-8 3 | # @Time : 2020/4/25 19:44 4 | # @Author : yingyuankai@aliyun.com 5 | # @File : test_eval_utils.py 6 | 7 | import unittest 8 | from aispace.utils.hparams import Hparams 9 | from aispace.utils.eval_utils import evaluation 10 | 11 | 12 | class TestEvalUtils(unittest.TestCase): 13 | def test_eval(self): 14 | hparams = Hparams() 15 | hparams.load_from_config_file("../../configs/glue_zh/tnews_k_fold.yml") 16 | hparams.stand_by() 17 | ckpts = [ 18 | "../../save/test_textcnn_for_classification_119_14/k_fold/1/model_saved/model", 19 | "../../save/test_textcnn_for_classification_119_14/k_fold/2/model_saved/model", 20 | ] 21 | evaluation(hparams, checkpoints=ckpts) 22 | -------------------------------------------------------------------------------- /docs/source/configuration.md: -------------------------------------------------------------------------------- 1 | # Configuration 2 | 3 | We use yaml to manage various configurations, and inheritance and override can be implemented between configurations 4 | The configurations of specific task inherit base configuration directly or indirectly. 5 | ## Base 6 | ```text 7 | configs/base.yml 8 | ``` 9 | This is the most basic configuration file, including the default configuration about training, logging, etc. 10 | 11 | You can read this configuration carefully to understand the possibility of configurability. 
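For orientation, the repository's tests load these configuration files through the ***Hparams*** helper before training or evaluation; a minimal sketch of that usage (the path is illustrative):

```python
from aispace.utils.hparams import Hparams

hparams = Hparams()
# A task configuration such as configs/glue_zh/tnews.yml inherits from the
# base (and, where relevant, pretrain) configurations described on this page.
hparams.load_from_config_file("configs/glue_zh/tnews.yml")
hparams.stand_by()  # prepare the merged configuration, as the tests do before use
```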
12 | 13 | ## Pretrain 14 | 15 | ```text 16 | configs/pretrain 17 | ``` 18 | This kind of configuration file adds pretrained item compared to other configurations mainly, and includes ***base.yml***. 19 | 20 | ## Specific task 21 | 22 | For example: 23 | ```text 24 | configs/glue_zh/tnews.yml 25 | ``` -------------------------------------------------------------------------------- /aispace/layers/adapters/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-11-28 15:26 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : __init__.py.py 6 | 7 | __all__ = [ 8 | "ADAPTERS" 9 | ] 10 | 11 | from .model_adapters import * 12 | 13 | 14 | ADAPTERS = { 15 | "tf_huggingface_bert_adapter": tf_huggingface_bert_adapter, 16 | "tf_huggingface_ernie_adapter": tf_huggingface_ernie_adapter, 17 | "tf_huggingface_xlnet_adapter": tf_huggingface_xlnet_adapter, 18 | "tf_huggingface_albert_chinese_adapter": tf_huggingface_albert_chinese_adapter, 19 | "tf_huggingface_albert_chinese_google_adapter": tf_huggingface_albert_chinese_google_adapter, 20 | "tf_huggingface_electra_adapter": tf_huggingface_electra_adapter, 21 | "tf_huggingface_gpt2_adapter": tf_huggingface_gpt2_adapter 22 | } -------------------------------------------------------------------------------- /aispace/models/base_model.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding=utf-8 3 | # @Time : 2019-07-05 10:27 4 | # @Author : yingyuankai@aliyun.com 5 | # @File : base_model.py 6 | 7 | from abc import ABCMeta, abstractmethod 8 | import tensorflow as tf 9 | 10 | from aispace.utils.hparams import Hparams 11 | from aispace.utils.registry import Registry 12 | 13 | 14 | __all__ = [ 15 | "BaseModel" 16 | ] 17 | 18 | 19 | class BaseModel(tf.keras.Model, Registry): 20 | __metaclass__ = ABCMeta 21 | 22 | def __init__(self, hparams: Hparams, **kwargs): 23 | super(BaseModel, self).__init__(**kwargs) 24 | self._hparams = hparams 25 | 26 | @abstractmethod 27 | def call(self, inputs, training=None, mask=None): 28 | raise NotImplementedError 29 | 30 | @abstractmethod 31 | def deploy(self): 32 | raise NotImplementedError -------------------------------------------------------------------------------- /tests/dataset/tokenizer/test_gpt2_tokenizer.py: -------------------------------------------------------------------------------- 1 | # # -*- coding: utf-8 -*- 2 | # # @Time : 2020-06-02 16:17 3 | # # @Author : yingyuankai 4 | # # @Email : yingyuankai@aliyun.com 5 | # # @File : test_xlnet_tokenizer.py 6 | 7 | 8 | import unittest 9 | 10 | from aispace.utils.hparams import Hparams 11 | 12 | from aispace.datasets.tokenizer import CPMTokenizer 13 | 14 | 15 | class TestXlnetTokenizer(unittest.TestCase): 16 | 17 | def test_init(self): 18 | hparams = Hparams() 19 | hparams.load_from_config_file("../../../configs/custom/test_gpt2.yml") 20 | hparams.stand_by() 21 | tokenizer = CPMTokenizer(hparams.dataset.tokenizer) 22 | 23 | a = "这两天,XLNet貌似也引起了NLP圈的极大关注,从实验数据看,在某些场景下,确实XLNet相对Bert有很大幅度的提升。" 24 | b = "就像我们之前说的,感觉Bert打开两阶段模式的魔法盒开关后,在这条路上,会有越来越多的同行者,而XLNet就是其中比较引人注目的一位" 25 | 26 | res = tokenizer.encode(a, b) 27 | print(res) -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if 
"%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /aispace/layers/losses/cross_entropy_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-04-27 20:27 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : cross_entropy_loss.py 6 | 7 | import tensorflow as tf 8 | 9 | 10 | class SigmoidCrossEntropy(tf.keras.losses.Loss): 11 | def __init__(self, 12 | reduction=tf.keras.losses.Reduction.NONE, 13 | name="sigmoid_cross_entropy"): 14 | super(SigmoidCrossEntropy, self).__init__(name=name, reduction=reduction) 15 | self.reduction = reduction 16 | 17 | def call(self, y_true, y_pred): 18 | y_true = tf.cast(y_true, tf.float32) 19 | cross_ent = tf.nn.sigmoid_cross_entropy_with_logits(y_true, y_pred) 20 | if self.reduction == tf.keras.losses.Reduction.SUM: 21 | loss = tf.reduce_sum(cross_ent) 22 | else: 23 | loss = tf.reduce_mean(cross_ent) 24 | return loss -------------------------------------------------------------------------------- /tests/dataset/tokenizer/test_xlnet_tokenizer.py: -------------------------------------------------------------------------------- 1 | # # -*- coding: utf-8 -*- 2 | # # @Time : 2020-06-02 16:17 3 | # # @Author : yingyuankai 4 | # # @Email : yingyuankai@aliyun.com 5 | # # @File : test_xlnet_tokenizer.py 6 | # 7 | # 8 | # import unittest 9 | # 10 | # from aispace.utils.hparams import Hparams 11 | # 12 | # from aispace.datasets.tokenizer import XlnetTokenizer 13 | # 14 | # 15 | # class TestXlnetTokenizer(unittest.TestCase): 16 | # 17 | # def test_init(self): 18 | # hparams = Hparams() 19 | # hparams.load_from_config_file("../../../configs/glue_zh/tnews.yml") 20 | # hparams.stand_by() 21 | # tokenizer = XlnetTokenizer(hparams.dataset.tokenizer) 22 | # 23 | # a = "这两天,XLNet貌似也引起了NLP圈的极大关注,从实验数据看,在某些场景下,确实XLNet相对Bert有很大幅度的提升。" 24 | # b = "就像我们之前说的,感觉Bert打开两阶段模式的魔法盒开关后,在这条路上,会有越来越多的同行者,而XLNet就是其中比较引人注目的一位" 25 | # 26 | # res = tokenizer.encode(a, b) 27 | # print(res) -------------------------------------------------------------------------------- /tests/layers/layers/adapters/test_gpt2_adapter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 1/6/21 3:11 PM 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : test_gpt2_adapter.py 6 | 7 | import unittest 8 | 9 | from aispace.layers.adapters import tf_huggingface_gpt2_adapter 10 | from aispace.utils.hparams import Hparams 11 | from aispace.utils.builder_utils import build_model 12 | 13 | 14 | class TestGptAdapter(unittest.TestCase): 15 | def test_process(self): 16 | hparam = 
Hparams() 17 | hparam.load_from_config_file('/search/odin/yyk/workspace/AiSpace/configs/custom/test_gpt2.yml') 18 | hparam.stand_by() 19 | model, (losses, loss_weights), metrics, optimizer = build_model(hparam) 20 | model.compile(optimizer=optimizer, loss=losses, metrics=metrics) 21 | model_vars = model.trainable_variables 22 | model_path = "/search/odin/yyk/data/pretrained/gpt/cpm-lm-tf2_v2" 23 | tf_huggingface_gpt2_adapter(model_vars, model_path) -------------------------------------------------------------------------------- /scrpts/start_training_duee_trigger.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #export HANLP_GREEDY_GPU=1 4 | #export HANLP_HOME="/search/data1/yyk/data/hanlp_data" 5 | #export TF_ENABLE_AUTO_MIXED_PRECISION=1 6 | export CUDA_VISIBLE_DEVICES=0,1 7 | nohup python -u aispace/trainer.py \ 8 | --experiment_name trigger \ 9 | --model_name bert_dgcnn_for_ner \ 10 | --schedule train_and_eval \ 11 | --config_name DuEE_trigger \ 12 | --config_dir ./configs/2020_LSTC \ 13 | --random_seed 91 \ 14 | --use_mixed_float16 True \ 15 | --gpus 0 1 \ 16 | > duee_trigger.log 2>&1 & 17 | 18 | 19 | #export CUDA_VISIBLE_DEVICES=7 20 | #nohup python -u aispace/trainer.py \ 21 | # --experiment_name trigger \ 22 | # --model_name bert_for_ner \ 23 | # --model_resume_path /search/data1/yyk/workspace/projects/AiSpace/save/trigger_bert_for_ner_119_20 \ 24 | # --schedule deploy \ 25 | # --config_name DuEE_trigger \ 26 | # --config_dir ./configs/2020_LSTC \ 27 | # --gpus 0 \ 28 | # > trigger_deploy_err.log 2>&1 & 29 | -------------------------------------------------------------------------------- /aispace/datasets/url_checksums/glue_zh.txt: -------------------------------------------------------------------------------- 1 | https://storage.googleapis.com/cluebenchmark/tasks/afqmc_public.zip 1195044 5a4cb1556b833010c329fa2ad2207d9e98fc94071b7e474015e9dd7c385db4dc 2 | https://storage.googleapis.com/cluebenchmark/tasks/cmnli_public.zip 31542894 8aef010738a536f920bbab20b754118f46d917d35e30a69bf6a4e631a72a6c1e 3 | https://storage.googleapis.com/cluebenchmark/tasks/cmrc2018_public.zip 3405146 6c63dc27e728ec5231aeb7d2861b4c90b6c116390582e0c44416cf3edf030b16 4 | https://storage.googleapis.com/cluebenchmark/tasks/csl_public.zip 3234594 795d1a2e475d59acad8236f6c5baba7a0b43d3e0508cb60f15ffbc76d5f437c4 5 | https://storage.googleapis.com/cluebenchmark/tasks/drcd_public.zip 7264200 f03a38bded37572e224b69b822794eca6218f9584afc0918bf8aa2bc77cf968d 6 | https://storage.googleapis.com/cluebenchmark/tasks/iflytek_public.zip 6505938 c59b961b29f1d0bad0c5e01aa62e4a61a80e9cfb980ce89b06c000851fbb3b06 7 | https://storage.googleapis.com/cluebenchmark/tasks/tnews_public.zip 4689325 2469c4205606e24118c7de08199fbd55da483b65128e1d9c1f380849797f6ce0 8 | https://storage.googleapis.com/cluebenchmark/tasks/wsc_public.zip 36181 fc55ce6e5f619de1670ea4201ceace5cf6e59d3193228ca75287a836ebc57b8c 9 | -------------------------------------------------------------------------------- /aispace/layers/encoders/rnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-11-29 11:19 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : rnn.py 6 | 7 | import tensorflow as tf 8 | 9 | __all__ = [ 10 | "Bilstm" 11 | ] 12 | 13 | 14 | class Bilstm(tf.keras.layers.Layer): 15 | def __init__(self, units, dropout, **kwargs): 16 | super(Bilstm, self).__init__(**kwargs) 17 | fwd_lstm 
= tf.keras.layers.LSTM( 18 | units, 19 | return_sequences=True, 20 | go_backwards=False, 21 | dropout=dropout, 22 | name="fwd_lstm") 23 | bwd_lstm = tf.keras.layers.LSTM( 24 | units, 25 | return_sequences=True, 26 | go_backwards=True, 27 | dropout=dropout, 28 | name="bwd_lstm") 29 | self.bilstm = tf.keras.layers.Bidirectional( 30 | merge_mode="concat", 31 | layer=fwd_lstm, 32 | backward_layer=bwd_lstm, 33 | name="bilstm") 34 | 35 | def call(self, inputs, **kwargs): 36 | outputs = self.bilstm(inputs, training=kwargs.get('training', False)) 37 | return outputs -------------------------------------------------------------------------------- /aispace/layers/callbacks/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-11-15 15:40 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : __init__.py 6 | 7 | import tensorflow as tf 8 | 9 | from prettytable import PrettyTable 10 | 11 | from aispace.layers.callbacks.lr_finder import LRFinder 12 | from aispace.layers.callbacks.qa_evaluators import EvaluatorForQaWithImpossible, EvaluatorForQaSimple 13 | 14 | __all__ = [ 15 | "CALLBACKS" 16 | ] 17 | 18 | CALLBACKS = { 19 | 'early_stopping': 20 | lambda config: tf.keras.callbacks.EarlyStopping(**config), 21 | 'checkpoint': 22 | lambda config: tf.keras.callbacks.ModelCheckpoint(**config), 23 | 'tensorboard': 24 | lambda config: tf.keras.callbacks.TensorBoard(**config), 25 | 'lr_finder': lambda config: LRFinder(**config), 26 | 'evaluator_for_qa_with_impossible': 27 | lambda config: EvaluatorForQaWithImpossible(**config), 28 | 'evaluator_for_qa_simple': 29 | lambda config: EvaluatorForQaSimple(**config) 30 | } 31 | 32 | 33 | def print_available(): 34 | table = PrettyTable(["NAME"]) 35 | for key in CALLBACKS: 36 | table.add_row([key]) 37 | print() 38 | print(table) -------------------------------------------------------------------------------- /scrpts/start_training_duee_role.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export CUDA_VISIBLE_DEVICES=1,0 4 | nohup python -u aispace/trainer.py \ 5 | --experiment_name role \ 6 | --model_name bert_for_role_ner \ 7 | --schedule train_and_eval \ 8 | --config_name DuEE_role \ 9 | --config_dir ./configs/2020_LSTC \ 10 | --gpus 0 1 \ 11 | > duee_role.log 2>&1 & 12 | 13 | 14 | # Build deployment package 15 | #nohup python -u aispace/trainer.py \ 16 | # --experiment_name test \ 17 | # --model_name bert_for_ner \ 18 | # --schedule deploy \ 19 | # --model_resume_path /search/odin/yyk/workspace/AiSpace/save/role_bert_for_role_ner_lstc_2020__DuEE_role_119_40 \ 20 | # --config_name DuEE_role \ 21 | # --config_dir ./configs/2020_LSTC \ 22 | # --gpus 0 \ 23 | # > err.log 2>&1 & 24 | 25 | # Deploy using bentoml 26 | #DEPLOY_PATH=" /search/odin/yyk/workspace/AiSpace/save/role_bert_for_role_ner_lstc_2020__DuEE_role_119_40/deploy/RoleBertNerService/20210419143708_4EA245" 27 | #DEPLOY_MODE="serve-gunicorn" 28 | ##DEPLOY_MODE="serve" 29 | #TF_FORCE_GPU_ALLOW_GROWTH=true CUDA_VISIBLE_DEVICES=0 bentoml ${DEPLOY_MODE} ${DEPLOY_PATH} --port 5001 --debug --enable-microbatch --workers 1 > event_role_deploy.log 2>&1 & 30 | #echo "Start dureader_robust service." 
-------------------------------------------------------------------------------- /scrpts/start_training_gov_title_trigger.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export CUDA_VISIBLE_DEVICES=1 4 | nohup python -u aispace/trainer.py \ 5 | --experiment_name test \ 6 | --model_name bert_for_ner \ 7 | --schedule train_and_eval \ 8 | --config_name gov_title_trigger \ 9 | --config_dir ./configs/custom \ 10 | --gpus 0 \ 11 | > gov_title_trigger.log 2>&1 & 12 | 13 | 14 | # Build deployment package 15 | #nohup python -u aispace/trainer.py \ 16 | # --experiment_name test \ 17 | # --model_name bert_for_ner \ 18 | # --schedule deploy \ 19 | # --model_resume_path /search/odin/yyk/workspace/AiSpace/save/test_bert_for_ner_gov_title__trigger_119_28 \ 20 | # --config_name gov_title_trigger \ 21 | # --config_dir ./configs/custom \ 22 | # --gpus 0 \ 23 | # > err.log 2>&1 & 24 | # 25 | ## Deploy using bentoml 26 | #DEPLOY_PATH="/search/odin/yyk/workspace/AiSpace/save/test_bert_for_ner_gov_title__trigger_119_28/deploy/BertNerService/20210111101630_08EEF3" 27 | #DEPLOY_MODE="serve-gunicorn" 28 | ##DEPLOY_MODE="serve" 29 | #TF_FORCE_GPU_ALLOW_GROWTH=true CUDA_VISIBLE_DEVICES=1 bentoml ${DEPLOY_MODE} ${DEPLOY_PATH} --port 5000 --debug --enable-microbatch --workers 1 > gov_title_trigger_deploy.log 2>&1 & 30 | #echo "Start gov_title_trigger service." -------------------------------------------------------------------------------- /scrpts/start_training_gov_title_role.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export CUDA_VISIBLE_DEVICES=1 4 | nohup python -u aispace/trainer.py \ 5 | --experiment_name test \ 6 | --model_name bert_for_ner \ 7 | --schedule train_and_eval \ 8 | --config_name gov_title_role \ 9 | --config_dir ./configs/custom \ 10 | --gpus 0 \ 11 | > gov_title_role.log 2>&1 & 12 | 13 | 14 | # Build deployment package 15 | #nohup python -u aispace/trainer.py \ 16 | # --experiment_name test \ 17 | # --model_name bert_for_ner \ 18 | # --schedule deploy \ 19 | # --model_resume_path /search/odin/yyk/workspace/AiSpace/save/test_bert_for_ner_gov_title__role_119_6 \ 20 | # --config_name gov_title_role \ 21 | # --config_dir ./configs/custom \ 22 | # --gpus 0 \ 23 | # > err.log 2>&1 & 24 | 25 | # Deploy using bentoml 26 | #DEPLOY_PATH="/search/odin/yyk/workspace/AiSpace/save/test_bert_for_ner_gov_title__role_119_6/deploy/BertNerWithTitleStatusService/20210218111157_16691B" 27 | #DEPLOY_MODE="serve-gunicorn" 28 | ##DEPLOY_MODE="serve" 29 | #TF_FORCE_GPU_ALLOW_GROWTH=true CUDA_VISIBLE_DEVICES=0 bentoml ${DEPLOY_MODE} ${DEPLOY_PATH} --port 5001 --debug --enable-microbatch --workers 1 > title_role_deploy.log 2>&1 & 30 | #echo "Start dureader_robust service." 
31 | 32 | 33 | -------------------------------------------------------------------------------- /scrpts/start_training_idiom_generator.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export CUDA_VISIBLE_DEVICES=1 4 | nohup python -u aispace/trainer.py \ 5 | --experiment_name test \ 6 | --model_name bert_for_text_generation \ 7 | --schedule train_and_eval \ 8 | --config_name idiom_generator \ 9 | --config_dir ./configs/custom \ 10 | --gpus 0 \ 11 | > idiom_generator.log 2>&1 & 12 | 13 | 14 | # Build deployment package 15 | #nohup python -u aispace/trainer.py \ 16 | # --experiment_name test \ 17 | # --model_name bert_for_ner \ 18 | # --schedule deploy \ 19 | # --model_resume_path /search/odin/yyk/workspace/AiSpace/save/test_bert_for_ner_gov_title__trigger_119_28 \ 20 | # --config_name gov_title_trigger \ 21 | # --config_dir ./configs/custom \ 22 | # --gpus 0 \ 23 | # > err.log 2>&1 & 24 | # 25 | ## Deploy using bentoml 26 | #DEPLOY_PATH="/search/odin/yyk/workspace/AiSpace/save/test_bert_for_ner_gov_title__trigger_119_28/deploy/BertNerService/20210111101630_08EEF3" 27 | #DEPLOY_MODE="serve-gunicorn" 28 | ##DEPLOY_MODE="serve" 29 | #TF_FORCE_GPU_ALLOW_GROWTH=true CUDA_VISIBLE_DEVICES=1 bentoml ${DEPLOY_MODE} ${DEPLOY_PATH} --port 5000 --debug --enable-microbatch --workers 1 > gov_title_trigger_deploy.log 2>&1 & 30 | #echo "Start gov_title_trigger service." -------------------------------------------------------------------------------- /tests/layers/layers/pretrained/test_nezha.py: -------------------------------------------------------------------------------- 1 | # # -*- coding: utf-8 -*- 2 | # # @Time : 2020-06-03 15:51 3 | # # @Author : yingyuankai 4 | # # @Email : yingyuankai@aliyun.com 5 | # # @File : test_electra.py 6 | # 7 | import unittest 8 | import tensorflow as tf 9 | 10 | from aispace.utils.hparams import Hparams 11 | from aispace.utils.builder_utils import build_model 12 | 13 | 14 | class TestElectra(unittest.TestCase): 15 | def test_electra_checkpoint(self): 16 | hparam = Hparams() 17 | hparam.load_from_config_file('/search/data1/yyk/workspace/AiSpace/configs/glue_zh/tnews.yml') 18 | hparam.stand_by() 19 | 20 | # ckpt = "/search/data1/yyk/workspace/projects/ERNIE/ernie/checkpoints" 21 | # ckpt_vars = [itm for itm in tf.train.list_variables(ckpt) if itm[0].find('adam') == -1] 22 | ckpt_vars = [itm for itm in tf.train.list_variables(hparam.pretrained.model_path) 23 | if itm[0].find('adam') == -1 and not itm[0].endswith("lamb_m") and not itm[0].endswith("lamb_v")] 24 | 25 | model, (losses, loss_weights), metrics, optimizer = build_model(hparam) 26 | model.compile(optimizer=optimizer, loss=losses, metrics=metrics) 27 | 28 | model_vars = model.trainable_variables 29 | 30 | print() -------------------------------------------------------------------------------- /scrpts/start_training_duee_keyphrase.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export CUDA_VISIBLE_DEVICES=1,0 4 | nohup python -u aispace/trainer.py \ 5 | --experiment_name keyphrase \ 6 | --model_name bert_for_ner \ 7 | --schedule train_and_eval \ 8 | --config_name DuEE_keyphrase \ 9 | --config_dir ./configs/2020_LSTC \ 10 | --gpus 0 1 \ 11 | > duee_keyphrase.log 2>&1 & 12 | 13 | 14 | # Build deployment package 15 | #nohup python -u aispace/trainer.py \ 16 | # --experiment_name test \ 17 | # --model_name bert_for_ner \ 18 | # --schedule deploy \ 19 | # --model_resume_path 
/search/odin/yyk/workspace/AiSpace/save/keyphrase_bert_for_ner_lstc_2020__DuEE_role_119_13 \ 20 | # --config_name DuEE_role \ 21 | # --config_dir ./configs/2020_LSTC \ 22 | # --gpus 0 \ 23 | # > err.log 2>&1 & 24 | 25 | # Deploy using bentoml 26 | #DEPLOY_PATH=" /search/odin/yyk/workspace/AiSpace/save/keyphrase_bert_for_ner_lstc_2020__DuEE_role_119_13/deploy/BertNerService/20210701171400_6DB6C2" 27 | #DEPLOY_MODE="serve-gunicorn" 28 | ##DEPLOY_MODE="serve" 29 | #TF_FORCE_GPU_ALLOW_GROWTH=true CUDA_VISIBLE_DEVICES=0 bentoml ${DEPLOY_MODE} ${DEPLOY_PATH} --port 5004 --debug --enable-microbatch --workers 1 > event_keyphrase_deploy.log 2>&1 & 30 | #echo "Start event_keyphrase service." -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. AiSpace documentation master file, created by 2 | sphinx-quickstart on Sun Feb 2 16:11:47 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | AiSpace 7 | =================================== 8 | 9 | AiSpace provides a highly configurable framework for deep learning model development, deployment, and 10 | convenient use of pre-trained models (bert, albert, opt, etc.). 11 | 12 | Features 13 | ----------------- 14 | 15 | * Highly configurable: all hyperparameters are managed through inheritable configuration files. 16 | * All modules are registerable, including models, datasets, losses, optimizers, metrics, callbacks, etc. 17 | * Standardized training, evaluation, and deployment process 18 | * Multi-GPU Training 19 | * Integrates multiple pre-trained models, including Chinese ones 20 | * Simple and fast deployment using `BentoML `_ 21 | * Integrates the Chinese benchmark `CLUE `_ 22 | ..
toctree:: 24 | :maxdepth: 2 25 | :caption: Notes 26 | 27 | quickstart 28 | configuration 29 | dataset 30 | model 31 | deployment 32 | examples 33 | 34 | 35 | 36 | Indices and tables 37 | ================== 38 | 39 | * :ref:`genindex` 40 | * :ref:`modindex` 41 | * :ref:`search` 42 | -------------------------------------------------------------------------------- /scrpts/start_training_ewn.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #export CUDA_VISIBLE_DEVICES=1 4 | #nohup python -u aispace/trainer.py \ 5 | # --experiment_name test \ 6 | # --model_name bert_for_classification \ 7 | # --schedule train_and_eval \ 8 | # --config_name ewn \ 9 | # --config_dir ./configs/custom \ 10 | # --gpus 0 \ 11 | # > ewn_err.log 2>&1 & 12 | 13 | # Build deployment package 14 | #nohup python -u aispace/trainer.py \ 15 | # --experiment_name test \ 16 | # --model_name bert_for_classification \ 17 | # --schedule deploy \ 18 | # --model_resume_path /search/odin/yyk/workspace/AiSpace/save/test_bert_for_classification_entity_with_nationality_119_14 \ 19 | # --config_name ewn \ 20 | # --config_dir ./configs/custom \ 21 | # --gpus 0 1 \ 22 | # > err.log 2>&1 & 23 | 24 | # Deploy using bentoml 25 | #DEPLOY_PATH="save/test_bert_for_classification_entity_with_nationality_119_2/deploy/BertTextClassificationService/20201111102110_209866" 26 | DEPLOY_PATH="save/test_bert_for_classification_entity_with_nationality_119_14/deploy/BertTextClassificationService/20201117122539_5AA389" 27 | DEPLOY_MODE="serve-gunicorn" 28 | #DEPLOY_MODE="serve" 29 | TF_FORCE_GPU_ALLOW_GROWTH=true CUDA_VISIBLE_DEVICES=0 bentoml ${DEPLOY_MODE} ${DEPLOY_PATH} --port 5000 --debug --enable-microbatch > ewn_deploy.log 2>&1 & 30 | echo "Start ewn service." 
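# Optional readiness probe (a sketch, not from the original script): BentoML 0.x API
# servers usually expose a /healthz endpoint; if that holds for this build, the
# service can be checked once ewn_deploy.log shows the workers are up.
#sleep 15 && curl -sf http://127.0.0.1:5000/healthz && echo "ewn service is healthy."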
31 | -------------------------------------------------------------------------------- /aispace/layers/fusions/feed_forward_add_and_norm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-11-05 15:44 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : feed_forward_add_and_norm.py 6 | 7 | 8 | import tensorflow as tf 9 | 10 | from aispace.utils.hparams import Hparams 11 | from aispace.utils.tf_utils import get_initializer 12 | 13 | 14 | class FeedForwardAddAndNorm(tf.keras.layers.Layer): 15 | """Ref to Bert feed forward and add & norm module""" 16 | def __init__(self, hparams: Hparams, **kwargs): 17 | super(FeedForwardAddAndNorm, self).__init__(**kwargs) 18 | 19 | self.dense = tf.keras.layers.Dense( 20 | hparams.hidden_size, 21 | kernel_initializer=get_initializer(hparams.initializer_range), 22 | name="dense" 23 | ) 24 | self.layer_norm = tf.keras.layers.LayerNormalization( 25 | epsilon=hparams.layer_norm_eps, 26 | name="LayerNorm" 27 | ) 28 | self.dropout = tf.keras.layers.Dropout( 29 | hparams.hidden_dropout_prob 30 | ) 31 | 32 | def call(self, inputs, training=False): 33 | hidden_states, input_tensor = inputs 34 | 35 | hidden_states = self.dense(hidden_states) 36 | hidden_states = self.dropout(hidden_states, training=training) 37 | hidden_states = self.layer_norm(hidden_states + input_tensor) 38 | return hidden_states -------------------------------------------------------------------------------- /scrpts/start_training_dureader_yesno.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Training 4 | export CUDA_VISIBLE_DEVICES=0,1 5 | nohup python -u aispace/trainer.py \ 6 | --experiment_name test \ 7 | --model_name bert_for_classification \ 8 | --schedule train_and_eval \ 9 | --enable_xla False \ 10 | --config_name dureader_yesno \ 11 | --config_dir ./configs/qa \ 12 | --gpus 0 1 \ 13 | > err.log 2>&1 & 14 | 15 | # Build deployment package 16 | #nohup python -u aispace/trainer.py \ 17 | # --experiment_name test \ 18 | # --model_name bert_for_classification \ 19 | # --schedule deploy \ 20 | # --model_resume_path /search/odin/yyk/workspace/AiSpace/save/test_bert_for_classification_dureader__yesno_119_3 \ 21 | # --config_name dureader_yesno \ 22 | # --config_dir ./configs/qa \ 23 | # --gpus 0 1 \ 24 | # > err.log 2>&1 & 25 | 26 | # Deploy using bentoml 27 | #DEPLOY_PATH="save/test_bert_for_classification_119_10/deploy/BertTextClassificationService/20200923161737_E41815" 28 | #DEPLOY_PATH="save/test_bert_for_classification_dureader__yesno_119_3/deploy/BertTextClassificationService/20201030101511_312D5D" 29 | #DEPLOY_MODE="serve-gunicorn" 30 | ##DEPLOY_MODE="serve" 31 | ##export BENTOML__APISERVER__DEFAULT_PORT=5000 32 | #TF_FORCE_GPU_ALLOW_GROWTH=true CUDA_VISIBLE_DEVICES=0 bentoml ${DEPLOY_MODE} ${DEPLOY_PATH} --port 5000 --debug --enable-microbatch > dureader_yesno_deploy.log 2>&1 & 33 | #echo "Start dureader_yesno service." 
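# The training job above runs detached via nohup; two plain-shell ways to keep an
# eye on it (illustrative only, nothing AiSpace-specific):
#tail -f err.log          # follow training / evaluation progress
#watch -n 60 nvidia-smi   # confirm both visible GPUs are actually being used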
-------------------------------------------------------------------------------- /aispace/layers/losses/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-11-13 20:39 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : __init__.py 6 | 7 | __all__ = [ 8 | "LOSSES" 9 | ] 10 | 11 | from prettytable import PrettyTable 12 | import tensorflow as tf 13 | import tensorflow_addons as tfa 14 | 15 | from .dice_loss import DceLoss, CceDceLoss 16 | from .cross_entropy_loss import SigmoidCrossEntropy 17 | from .focal_loss import SparseSoftmaxFocalCrossEntropy 18 | 19 | LOSSES = { 20 | "categorical_crossentropy": 21 | lambda loss_config: tf.keras.losses.CategoricalCrossentropy(**loss_config), 22 | "sparse_categorical_crossentropy": 23 | lambda loss_config: tf.keras.losses.SparseCategoricalCrossentropy(**loss_config), 24 | 'sigmoid_focal_crossentropy': 25 | lambda loss_config: tfa.losses.SigmoidFocalCrossEntropy(**loss_config), 26 | "sparse_softmax_focal_crossentropy": 27 | lambda loss_config: SparseSoftmaxFocalCrossEntropy(**loss_config), 28 | 'sigmoid_cross_entropy': 29 | lambda loss_config: SigmoidCrossEntropy(**loss_config), 30 | 'dce_loss': 31 | lambda loss_config: DceLoss(**loss_config), 32 | "cce_dce_loss": 33 | lambda loss_config: CceDceLoss(**loss_config) 34 | } 35 | 36 | 37 | def print_available(): 38 | table = PrettyTable(["NAME"]) 39 | for key in LOSSES: 40 | table.add_row([key]) 41 | print() 42 | print(table) -------------------------------------------------------------------------------- /aispace/models/pretrained/bert_for_pretraining.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-11-05 20:24 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : bert_for_pretraining.py 6 | 7 | 8 | from aispace.utils.hparams import Hparams 9 | from aispace.models.base_model import BaseModel 10 | from aispace.layers.pretrained.bert import Bert, BertMLMTask, BertNSPTask 11 | 12 | 13 | class BertForPreTraining(BaseModel): 14 | def __init__(self, hparams: Hparams, **kwargs): 15 | super(BertForPreTraining, self).__init__(hparams, **kwargs) 16 | pretrained_hparams = hparams.pretrained 17 | self.bert = Bert(pretrained_hparams, name="bert") 18 | self.nsp = BertNSPTask(pretrained_hparams.config, name="nsp___cls") 19 | self.mlm = BertMLMTask(pretrained_hparams.config, input_embeddings=self.bert.embeddings, name="mlm___cls") 20 | 21 | def call(self, inputs, **kwargs): 22 | outputs = self.bert(inputs, **kwargs) 23 | 24 | sequence_output, pooled_output = outputs[:2] 25 | prediction_scores = self.mlm(sequence_output, training=kwargs.get('training', False)) 26 | seq_relationship_score = self.nsp(pooled_output) 27 | 28 | outputs = (prediction_scores, seq_relationship_score,) + outputs[ 29 | 2:] # add hidden states and attention if they are here 30 | 31 | return outputs # prediction_scores, seq_relationship_score, (hidden_states), (attentions) 32 | -------------------------------------------------------------------------------- /aispace/utils/misc.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding=utf-8 3 | # @Time : 2019-07-04 13:38 4 | # @Author : yingyuankai@aliyun.com 5 | # @File : misc.py 6 | 7 | import random 8 | import numpy as np 9 | import tensorflow as tf 10 | 11 | 12 | def set_random_seed(seed): 13 | 
tf.random.set_seed(seed) 14 | np.random.seed(seed) 15 | random.seed(seed) 16 | 17 | 18 | def set_visible_devices(gpus_idxs=[]): 19 | gpus = tf.config.experimental.list_physical_devices('GPU') 20 | if gpus: 21 | if not gpus_idxs: 22 | gpus = gpus 23 | else: 24 | gpus = [gpus[idx] for idx in gpus_idxs] 25 | # Restrict TensorFlow to only use the first GPU 26 | try: 27 | tf.config.experimental.set_visible_devices(gpus, 'GPU') 28 | for gpu in gpus: 29 | tf.config.experimental.set_memory_growth(gpu, True) 30 | print(len(gpus), "Physical GPUs,") 31 | except RuntimeError as e: 32 | # Visible devices must be set before GPUs have been initialized 33 | print(e) 34 | 35 | 36 | def set_xla(enable_xla=False): 37 | """Config eager context according to flag values using TF 2.0 API.""" 38 | if enable_xla: 39 | tf.config.optimizer.set_jit(True) 40 | # Disable PinToHostOptimizer in grappler when enabling XLA because it 41 | # causes OOM and performance regression. 42 | tf.config.optimizer.set_experimental_options( 43 | {'pin_to_host_optimization': False} 44 | ) 45 | -------------------------------------------------------------------------------- /scrpts/start_training_dureader_robust.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # training 4 | export CUDA_VISIBLE_DEVICES=1,2 5 | nohup python -u aispace/trainer.py \ 6 | --experiment_name test \ 7 | --model_name bert_for_qa \ 8 | --schedule train_and_eval \ 9 | --enable_xla False \ 10 | --config_name dureader_robust \ 11 | --config_dir ./configs/qa \ 12 | --gpus 0 1 \ 13 | > err.log 2>&1 & 14 | 15 | # Build deployment package 16 | #nohup python -u aispace/trainer.py \ 17 | # --experiment_name test \ 18 | # --model_name bert_for_qa \ 19 | # --schedule deploy \ 20 | # --model_resume_path /search/odin/yyk/workspace/AiSpace/save/test_bert_for_qa_dureader__robust_119_30 \ 21 | # --config_name dureader_robust \ 22 | # --config_dir ./configs/qa \ 23 | # --gpus 0 1 \ 24 | # > err.log 2>&1 & 25 | # 26 | ### Deploy using bentoml 27 | ##DEPLOY_PATH="/search/odin/yyk/workspace/AiSpace/save/test_bert_for_qa_119_87/deploy/BertQAService/20201010113638_2A62F8" 28 | #DEPLOY_PATH="/search/odin/yyk/workspace/AiSpace/save/test_bert_for_qa_dureader__robust_119_30/deploy/BertQAService/20201019104240_3C7077" 29 | #DEPLOY_MODE="serve-gunicorn" 30 | ##DEPLOY_MODE="serve" 31 | ##TF_FORCE_GPU_ALLOW_GROWTH=true CUDA_VISIBLE_DEVICES=0 bentoml ${DEPLOY_MODE} ${DEPLOY_PATH} --port 5001 --debug > dureader_robust_deploy.log 2>&1 & 32 | #TF_FORCE_GPU_ALLOW_GROWTH=true CUDA_VISIBLE_DEVICES=0 bentoml ${DEPLOY_MODE} ${DEPLOY_PATH} --port 5001 --debug --enable-microbatch --workers 1 > dureader_robust_deploy.log 2>&1 & 33 | #echo "Start dureader_robust service." 
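# To stop the detached run, kill the trainer process; the pattern below is only an
# illustration (narrow it if several trainer.py runs share this machine), and the
# scrpts/stop_deploy*.sh scripts cover the serving side.
#pkill -f "aispace/trainer.py"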
-------------------------------------------------------------------------------- /aispace/constants.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding=utf-8 3 | # @Time : 2019-07-04 15:40 4 | # @Author : yingyuankai@aliyun.com 5 | # @File : constants.py 6 | 7 | import tensorflow as tf 8 | import tensorflow_datasets as tfds 9 | 10 | VERSION = "0.1.0" 11 | LOGO_STR = '\n'.join( 12 | [ 13 | '', 14 | r' ____ _ ______ ', 15 | r' / /\ \ (_/ ___ |_______ _____ ______ ____ ', 16 | r' / /--\ \ | \___ \ ____ \/ ___ \ / _____| / ___\ ', 17 | r' / /----\ \ | |____\ \ |____/ | /___\ |_| |_____| \_____ ', 18 | r'/_/ \_\|_|______/ |_____/ \_________\ ______\______/ ', 19 | r' | | ', 20 | r' |_| ', 21 | f'AiSpace v{VERSION}', 22 | '' 23 | ] 24 | ) 25 | 26 | TRAIN_STAGE = "train" 27 | TEST_STAGE = "test" 28 | DEPLOY_STAGE = "deploy" 29 | 30 | # Some names 31 | LOGGER_NAME = 'aispace_logger' 32 | TRAIN_DATA_SYMBOL = 'train' 33 | VALIDATION_DATA_SYMBOL = 'validation' 34 | TEST_DATA_SYMBOL = 'test' 35 | MYSELF_LOSS_PREFIX = 'myself' 36 | 37 | # Schedules 38 | TRANSFORM_SCHEDULE = 'transform' 39 | 40 | # Feature 41 | LIST_OF_INT = 'LIST_OF_INT' 42 | INT = 'INT' 43 | CLASSLABEL = 'CLASSLABEL' 44 | LIST_OF_CLASSLABEL = 'LIST_OF_CLASSLABEL' 45 | STRING = "STRING" 46 | 47 | FEATURE_MAPPING = { 48 | LIST_OF_INT: tfds.features.Sequence(tf.int32), 49 | INT: tf.int32, 50 | } 51 | 52 | # Task 53 | NER = 'NER' 54 | -------------------------------------------------------------------------------- /tests/layers/layers/pretrained/test_electra.py: -------------------------------------------------------------------------------- 1 | # # -*- coding: utf-8 -*- 2 | # # @Time : 2020-06-03 15:51 3 | # # @Author : yingyuankai 4 | # # @Email : yingyuankai@aliyun.com 5 | # # @File : test_electra.py 6 | # 7 | import unittest 8 | import tensorflow as tf 9 | 10 | from aispace.utils.hparams import Hparams 11 | from aispace.utils.builder_utils import build_model 12 | 13 | 14 | class TestElectra(unittest.TestCase): 15 | def test_electra_checkpoint(self): 16 | hparam = Hparams() 17 | hparam.load_from_config_file('/search/data1/yyk/workspace/AiSpace/configs/qa/dureader_robust.yml') 18 | # hparam.load_from_config_file('/search/data1/yyk/workspace/AiSpace/configs/glue_zh/tnews.yml') 19 | # hparam.load_from_config_file('/search/data1/yyk/workspace/AiSpace/configs/glue_zh/cmrc2018.yml') 20 | hparam.stand_by() 21 | 22 | # ckpt = "/search/data1/yyk/workspace/projects/ERNIE/ernie/checkpoints" 23 | # ckpt = "/search/data1/yyk/data/pretrained/albert/albert_large_zh_google/model.ckpt-best" 24 | ckpt = "/search/data1/yyk/data/pretrained/bert/chinese_wwm/bert_model.ckpt" 25 | ckpt_vars = [itm for itm in tf.train.list_variables(ckpt) if itm[0].find('adam') == -1] 26 | # ckpt_vars = [itm for itm in tf.train.list_variables(hparam.pretrained.model_path) if itm[0].find('adam') == -1] 27 | 28 | model, (losses, loss_weights), metrics, optimizer = build_model(hparam) 29 | model.compile(optimizer=optimizer, loss=losses, metrics=metrics) 30 | 31 | model_vars = model.trainable_variables 32 | 33 | print() -------------------------------------------------------------------------------- /aispace/utils/print_utils.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding=utf-8 3 | # @Time : 2019-07-02 21:15 4 | # @Author : yingyuankai@aliyun.com 5 | # @File : print_utils.py 6 | 7 | import logging 8 | import collections 9 | from 
pprint import pformat 10 | 11 | __all__ = [ 12 | "print_boxed", 13 | "print_aispace", 14 | "repr_ordered_dict" 15 | ] 16 | 17 | 18 | def print_aispace(message, aispace_version): 19 | print('\n'.join( 20 | [ 21 | '', 22 | ' ____ _ ______ ', 23 | r' / /\ \ (_/ ___ |_______ _____ ______ ____ ', 24 | r' / /--\ \ | \___ \ ____ \/ ___ \ / _____| / ___\ ', 25 | r' / /----\ \ | |____\ \ |____/ | /___\ |_| |_____| \_____ ', 26 | r'/_/ \_\|_|______/ |_____/ \_________\ ______\______/ ', 27 | ' | | ', 28 | ' |_| ', 29 | 'AiSpace v{1} - {0}'.format(message, aispace_version), 30 | '' 31 | ] 32 | )) 33 | 34 | 35 | def print_boxed(text, print_fun=print): 36 | box_width = len(text) + 2 37 | print_fun('') 38 | print_fun('╒{}╕'.format('═' * box_width)) 39 | print_fun('│ {} │'.format(text.upper())) 40 | print_fun('╘{}╛'.format('═' * box_width)) 41 | print_fun('') 42 | 43 | 44 | def repr_ordered_dict(d): 45 | return '{\n ' + ',\n '.join('{}: {}'.format(x, pformat(y, indent=4)) 46 | for x, y in d.items()) + '\n}' 47 | 48 | 49 | -------------------------------------------------------------------------------- /tests/layers/layers/pretrained/test_gpt2.py: -------------------------------------------------------------------------------- 1 | # # -*- coding: utf-8 -*- 2 | # # @Time : 2020-06-03 15:51 3 | # # @Author : yingyuankai 4 | # # @Email : yingyuankai@aliyun.com 5 | # # @File : test_gpt2.py 6 | # 7 | import unittest 8 | import tensorflow as tf 9 | 10 | from aispace.utils.hparams import Hparams 11 | from aispace.utils.builder_utils import build_model 12 | 13 | 14 | class TestGpt2(unittest.TestCase): 15 | def test_gpt2_checkpoint(self): 16 | hparam = Hparams() 17 | hparam.load_from_config_file('/search/odin/yyk/workspace/AiSpace/configs/custom/test_gpt2.yml') 18 | # hparam.load_from_config_file('/search/data1/yyk/workspace/AiSpace/configs/glue_zh/tnews.yml') 19 | # hparam.load_from_config_file('/search/data1/yyk/workspace/AiSpace/configs/glue_zh/cmrc2018.yml') 20 | hparam.stand_by() 21 | 22 | model_path = "/search/odin/yyk/data/pretrained/gpt/cpm-lm-tf2_v2" 23 | # model1 = tf.keras.models.load_model(model_path) 24 | # model_gold = model1.trainable_variables 25 | 26 | # ckpt_vars = [itm for itm in tf.train.list_variables(ckpt) if itm[0].find('adam') == -1] 27 | # ckpt_vars = [itm for itm in tf.train.list_variables(hparam.pretrained.model_path) if itm[0].find('adam') == -1] 28 | 29 | model, (losses, loss_weights), metrics, optimizer = build_model(hparam) 30 | model.compile(optimizer=optimizer, loss=losses, metrics=metrics) 31 | # 32 | model_vars = model.trainable_variables 33 | 34 | for itm in model_vars: 35 | print(f"{itm.name}, {itm.shape}") 36 | # print(itm.numpy()) 37 | # print(type(itm.numpy())) 38 | # break 39 | print() -------------------------------------------------------------------------------- /tests/layers/losses/dice_loss.py: -------------------------------------------------------------------------------- 1 | # # -*- coding: utf-8 -*- 2 | # # @Time : 2020-04-22 19:19 3 | # # @Author : yingyuankai 4 | # # @Email : yingyuankai@aliyun.com 5 | # # @File : dice_loss.py 6 | # 7 | import os, sys 8 | import tensorflow as tf 9 | import unittest 10 | 11 | from aispace.layers.losses.dice_loss import DceLoss, CceDceLoss 12 | 13 | 14 | def dce_tt_loss(y_true, y_pred): 15 | smooth = 1e-10 16 | num = (1. - y_pred) * y_pred * y_true + smooth 17 | den = (1. - y_pred) * y_pred * y_true + y_true + smooth 18 | loss = 1. 
- num / den 19 | loss = tf.reduce_mean(loss) 20 | return loss 21 | 22 | 23 | class testDceLoss(unittest.TestCase): 24 | def test_dce_loss(self): 25 | y_true = tf.constant([[0., 1., 0.]]) 26 | y_pre = tf.constant([[0.0, 1., 0.0]]) 27 | 28 | dce_loss = DceLoss(smooth=1e-10) 29 | loss = dce_loss(y_true, y_pre) 30 | # loss = dce_tt_loss(y_true, y_pre) 31 | print(loss) 32 | 33 | def test_cce_dce_loss(self): 34 | y_true = tf.constant([[1]]) 35 | y_pre = tf.constant([[[0.1, 1.0, 0.1]]]) 36 | 37 | cce_dce_loss = CceDceLoss(from_logits=True, label_num=3, seq_len=1) 38 | loss = cce_dce_loss(y_true, y_pre) 39 | print(loss) 40 | 41 | def test_focal_loss(self): 42 | from tensorflow_addons.losses import sigmoid_focal_crossentropy 43 | y_true = tf.constant([1.0, 0.0, 0.0]) 44 | y_pre = tf.constant([0.1, 0.8, 0.1]) 45 | loss = sigmoid_focal_crossentropy(y_true=y_true, y_pred=y_pre) 46 | print(loss) 47 | y_pre = tf.constant([0.7, 0.1, 0.1]) 48 | loss = sigmoid_focal_crossentropy(y_true=y_true, y_pred=y_pre) 49 | print(loss) 50 | 51 | -------------------------------------------------------------------------------- /aispace/utils/timer.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding=utf-8 3 | # @Time : 2019-07-04 11:21 4 | # @Author : yingyuankai@aliyun.com 5 | # @File : timer.py 6 | 7 | import time 8 | 9 | 10 | class Timer: 11 | DEFAULT_TIME_FORMAT_DATE_TIME = "%Y/%m/%d %H:%M:%S" 12 | DEFAULT_TIME_FORMAT = ["%03dms", "%02ds", "%02dm", "%02dh"] 13 | 14 | def __init__(self): 15 | self.start = time.time() * 1000 16 | 17 | def get_current(self): 18 | return self.get_time_hhmmss(self.start) 19 | 20 | def reset(self): 21 | self.start = time.time() * 1000 22 | 23 | def get_time_since_start(self, format=None): 24 | return self.get_time_hhmmss(self.start, format) 25 | 26 | def get_time_hhmmss(self, start=None, end=None, gap=None, format=None): 27 | """ 28 | Calculates time since `start` and formats as a string. 
29 | """ 30 | if start is None and gap is None: 31 | 32 | if format is None: 33 | format = self.DEFAULT_TIME_FORMAT_DATE_TIME 34 | 35 | return time.strftime(format) 36 | 37 | if end is None: 38 | end = time.time() * 1000 39 | if gap is None: 40 | gap = end - start 41 | 42 | s, ms = divmod(gap, 1000) 43 | m, s = divmod(s, 60) 44 | h, m = divmod(m, 60) 45 | 46 | if format is None: 47 | format = self.DEFAULT_TIME_FORMAT 48 | 49 | items = [ms, s, m, h] 50 | assert len(items) == len(format), "Format length should be same as items" 51 | 52 | time_str = "" 53 | for idx, item in enumerate(items): 54 | if item != 0: 55 | time_str = format[idx] % item + " " + time_str 56 | return time_str.strip() -------------------------------------------------------------------------------- /aispace/layers/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-11-14 20:11 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : __init__.py 6 | 7 | import tensorflow as tf 8 | import tensorflow_addons as tfa 9 | from prettytable import PrettyTable 10 | 11 | from .f1_score import SparseF1Score 12 | from .precision import SparsePrecision 13 | from .recall import SparseRecall 14 | 15 | from aispace.utils.print_utils import print_boxed 16 | 17 | __all__ = [ 18 | "METRICS", 19 | "print_available" 20 | ] 21 | 22 | 23 | METRICS = { 24 | "categorical_accuracy": 25 | lambda config: tf.keras.metrics.CategoricalAccuracy(**config), 26 | "sparse_categorical_accuracy": 27 | lambda config: tf.keras.metrics.SparseCategoricalAccuracy(**config), 28 | "sparse_categorical_crossentropy": 29 | lambda config: tf.keras.metrics.SparseCategoricalCrossentropy(**config), 30 | "f1_score": 31 | lambda config: tfa.metrics.F1Score(**config), 32 | "sparse_f1_score": 33 | lambda config: SparseF1Score(**config), 34 | "precision": 35 | lambda config: tf.keras.metrics.Precision(**config), 36 | "sparse_precision": 37 | lambda config: SparsePrecision(**config), 38 | "recall": 39 | lambda config: tf.keras.metrics.Recall(**config), 40 | "sparse_recall": 41 | lambda config: SparseRecall(**config), 42 | "hamming_loss": 43 | lambda config: tfa.metrics.HammingLoss(**config), 44 | 'binary_accuracy': 45 | lambda config: tf.keras.metrics.BinaryAccuracy(**config) 46 | } 47 | 48 | 49 | def print_available(): 50 | table = PrettyTable(["NAME"]) 51 | for key in METRICS: 52 | table.add_row([key]) 53 | print() 54 | print(table) -------------------------------------------------------------------------------- /scrpts/start_training_duee_role_as_qa.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export CUDA_VISIBLE_DEVICES=1,0,2,3 4 | nohup python -u aispace/trainer.py \ 5 | --experiment_name role \ 6 | --model_name bert_for_qa \ 7 | --schedule train_and_eval \ 8 | --config_name DuEE_role_as_qa \ 9 | --config_dir ./configs/2020_LSTC \ 10 | --gpus 0 1 2 3 \ 11 | > duee_role.log 2>&1 & 12 | 13 | # --model_load_path save/test_bert_for_qa_glue_zh__cmrc2018_119_14 \ 14 | 15 | 16 | # Build deployment package 17 | #nohup python -u aispace/trainer.py \ 18 | # --experiment_name test \ 19 | # --model_name bert_for_qa \ 20 | # --schedule deploy \ 21 | # --model_resume_path /search/odin/yyk/workspace/AiSpace/save/role_bert_for_qa_lstc_2020__DuEE_role_119_55 \ 22 | # --config_name DuEE_role_as_qa \ 23 | # --config_dir ./configs/2020_LSTC \ 24 | # --gpus 0 \ 25 | # > err.log 2>&1 & 26 | # 27 | ## Deploy 
using bentoml 28 | #DEPLOY_PATH=" /search/odin/yyk/workspace/AiSpace/save/role_bert_for_qa_lstc_2020__DuEE_role_119_3/deploy/BertQAService/20210420141619_E173B6" 29 | #DEPLOY_PATH=" /search/odin/yyk/workspace/AiSpace/save/role_bert_for_qa_lstc_2020__DuEE_role_119_9/deploy/BertQAService/20210422171656_501205" 30 | #DEPLOY_PATH=" /search/odin/yyk/workspace/AiSpace/save/role_bert_for_qa_lstc_2020__DuEE_role_119_17/deploy/BertQAService//20210423143146_60581D" 31 | #DEPLOY_PATH=" /search/odin/yyk/workspace/AiSpace/save/role_bert_for_qa_lstc_2020__DuEE_role_119_52/deploy/BertQAWithImpossibleService/20210429162939_4CBD29" 32 | #DEPLOY_MODE="serve-gunicorn" 33 | ##DEPLOY_MODE="serve" 34 | #TF_FORCE_GPU_ALLOW_GROWTH=true CUDA_VISIBLE_DEVICES=0 bentoml ${DEPLOY_MODE} ${DEPLOY_PATH} --port 5003 --debug --enable-microbatch --workers 1 > event_role_as_qa_deploy.log 2>&1 & 35 | #echo "Start dureader_robust service." 36 | -------------------------------------------------------------------------------- /tests/models/generation/test_bert_for_text_generation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 1/12/21 2:52 PM 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : test_bert_for_text_generation.py 6 | 7 | 8 | import unittest 9 | import tensorflow as tf 10 | 11 | from aispace.utils.hparams import Hparams 12 | from aispace.utils.builder_utils import build_model 13 | from aispace.datasets.tokenizer import CPMTokenizer 14 | 15 | 16 | class TestGptAdapter(unittest.TestCase): 17 | def test_process(self): 18 | hparam = Hparams() 19 | hparam.load_from_config_file('/search/odin/yyk/workspace/AiSpace/configs/custom/idiom_generator.yml') 20 | hparam.stand_by() 21 | hparam.cascade_set("model_load_path", "/search/odin/yyk/workspace/AiSpace/save/test_bert_for_text_generation_idiom__idiom_generator_119_23") 22 | model, (losses, loss_weights), metrics, optimizer = build_model(hparam) 23 | model.compile(optimizer=optimizer, loss=losses, metrics=metrics) 24 | 25 | tokenizer = CPMTokenizer(hparam.dataset.tokenizer) 26 | 27 | input = "春眠不觉晓" 28 | input_tokens = tokenizer.tokenize(input) + [tokenizer.vocab.sep_token] 29 | 30 | input_encoded = tokenizer.encode(input_tokens) 31 | 32 | input_ids = tf.constant([input_encoded['input_ids']], dtype=tf.int32) 33 | attention_mask = tf.constant([[1] * len(input_encoded['input_ids'])], dtype=tf.int32) 34 | input_dict = { 35 | "input_ids": input_ids, 36 | "attention_mask": attention_mask 37 | } 38 | # output = model(input_dict) 39 | output = model.generate(input_ids, **hparam.generation_attributes) 40 | 41 | print(input_encoded) 42 | output = tokenizer.decode(output.numpy().reshape([-1]).tolist()) 43 | print(output) 44 | -------------------------------------------------------------------------------- /configs/2020_LSTC/DuEE_keyphrase.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - "../pretrain/ernie.yml" 3 | 4 | model_name: bert_for_ner 5 | 6 | model_attributes: 7 | hidden_dropout_prob: 0.5 8 | initializer_range: 0.02 9 | hidden_size: 1024 10 | 11 | training: 12 | learning_rate: 1e-4 13 | max_epochs: 30 14 | batch_size: 32 15 | callbacks: 16 | # callback name 17 | early_stopping: 18 | switch: true 19 | config: 20 | patience: 2 21 | 22 | dataset: 23 | name: "lstc_2020/DuEE_role" 24 | data_dir: "./data" 25 | transformer: "lstc_2020/DuEE_keyphrase" 26 | 27 | source: 28 | train: "train" 29 | validation: "validation[:50%]" 30 
| test: "validation[-50%:]" 31 | 32 | tokenizer: 33 | max_len: 102 34 | 35 | inputs: 36 | - name: input_ids 37 | column: input_ids 38 | type: LIST_OF_INT 39 | max_len: 102 40 | - name: token_type_ids 41 | column: token_type_ids 42 | type: LIST_OF_INT 43 | max_len: 102 44 | - name: attention_mask 45 | column: attention_mask 46 | type: LIST_OF_INT 47 | max_len: 102 48 | 49 | outputs: 50 | - name: output_1 51 | column: labels 52 | type: LIST_OF_CLASSLABEL 53 | task: NER 54 | num: 3 55 | labels: ["B-keyphrase", "I-keyphrase", "O"] 56 | loss: 57 | name: myself_crf_loss 58 | # name: sparse_categorical_crossentropy 59 | # config: 60 | # from_logits: true 61 | metrics: 62 | - name: sparse_categorical_accuracy 63 | - name: sparse_f1_score 64 | config: 65 | num_classes: 3 66 | average: "macro" 67 | name: "macro_f1" 68 | 69 | pretrained: 70 | name: ERNIE_1.0_max-len-512 71 | init_from_pretrained: true 72 | config: 73 | layers: 74 | start: 0 75 | end: 4 76 | step: 1 77 | 78 | -------------------------------------------------------------------------------- /tests/test_hannlp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-05-07 16:17 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : test_hannlp.py 6 | 7 | 8 | import unittest 9 | 10 | # import hanlp 11 | 12 | 13 | class TestHannlp(unittest.TestCase): 14 | def test_han(self): 15 | 16 | # tokenizer = hanlp.load('PKU_NAME_MERGED_SIX_MONTHS_CONVSEG') 17 | # tagger = hanlp.load(hanlp.pretrained.pos.CTB5_POS_RNN_FASTTEXT_ZH) 18 | # recognizer = hanlp.load(hanlp.pretrained.ner.MSRA_NER_BERT_BASE_ZH) 19 | # syntactic_parser = hanlp.load(hanlp.pretrained.dep.CTB7_BIAFFINE_DEP_ZH) 20 | # semantic_parser = hanlp.load(hanlp.pretrained.sdp.SEMEVAL16_NEWS_BIAFFINE_ZH) 21 | # 22 | # pipeline = hanlp.pipeline() \ 23 | # .append(hanlp.utils.rules.split_sentence, output_key='sentences') \ 24 | # .append(tokenizer, output_key='tokens') \ 25 | # .append(tagger, output_key='part_of_speech_tags') \ 26 | # .append(recognizer, input_key="tokens", output_key='ner_tag') \ 27 | # .append(syntactic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='syntactic_dependencies') \ 28 | # .append(semantic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='semantic_dependencies') 29 | 30 | text = "近期,蔚来美国裁员 70 人,其中有 20 人位于圣何塞的北美总部办公室和研发中心,50 人位于旧金山办公室,此外,旧金山办公室也在这次裁员中正式关闭。" 31 | 32 | # res = pipeline(text) 33 | # tokens = tokenizer(text) 34 | # res = tagger(tokens) 35 | # print(tagger.transform.tag_vocab) 36 | 37 | # print(res) 38 | 39 | def test_jieba(self): 40 | import jieba.posseg as psg 41 | text = "近期,蔚来美国裁员 70 人,其中有 20 人位于圣何塞的北美总部办公室和研发中心,50 人位于旧金山办公室,此外,旧金山办公室也在这次裁员中正式关闭。" 42 | res = psg.cut(text) 43 | for t in res: 44 | print(t) 45 | -------------------------------------------------------------------------------- /scrpts/start_deploy_duee.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ##################################################### 4 | # Copyright (c) 2020 Sogou, Inc. 
All Rights Reserved 5 | ##################################################### 6 | # File: start_deploy_duee.sh 7 | # Author: root 8 | # Date: 2020/04/17 15:10:21 9 | # Brief: 10 | ##################################################### 11 | 12 | 13 | TRIGGER_DEPLOY_PATH="/search/data1/yyk/workspace/projects/AiSpace/save/trigger_bert_for_ner_119_20/deploy/BertNerService/20200424150518_52468B" 14 | ROLE_DEPLOY_PATH="/search/data1/yyk/workspace/projects/AiSpace/save/role_bert_for_role_ner_v2_119_44/deploy/RoleBertNerService/20200424150955_992C04" 15 | # reduce label 16 | #ROLE_DEPLOY_PATH="/search/data1/yyk/workspace/projects/AiSpace/save/role_bert_for_role_ner_v2_119_70/deploy/RoleBertNerService/20200428163648_B9D503" 17 | #EVENT_TYPE_DEPLOY_PATH="save/trigger_bert_for_classification_119_17/deploy/BertTextClassificationService/20200426162858_B723CC" 18 | EVENT_TYPE_DEPLOY_PATH="save/trigger_bert_for_classification_119_19/deploy/BertTextClassificationService/20200426183441_4281E0" 19 | 20 | export BENTOML__APISERVER__DEFAULT_PORT=5000 21 | TF_FORCE_GPU_ALLOW_GROWTH=true CUDA_VISIBLE_DEVICES=4 bentoml serve $TRIGGER_DEPLOY_PATH > trigger_extract_deploy.log 2>&1 & 22 | echo "Start trigger extract service." 23 | 24 | export BENTOML__APISERVER__DEFAULT_PORT=5001 25 | TF_FORCE_GPU_ALLOW_GROWTH=true CUDA_VISIBLE_DEVICES=4 bentoml serve $ROLE_DEPLOY_PATH > role_extract_deploy.log 2>&1 & 26 | echo "Start role extract service." 27 | 28 | export BENTOML__APISERVER__DEFAULT_PORT=5002 29 | TF_FORCE_GPU_ALLOW_GROWTH=true CUDA_VISIBLE_DEVICES=4 bentoml serve $EVENT_TYPE_DEPLOY_PATH > event_type_extract_deploy.log 2>&1 & 30 | echo "Start event type extract service." 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | # vim: set expandtab ts=4 sw=4 sts=4 tw=100 43 | -------------------------------------------------------------------------------- /aispace/models/generation/bert_for_generation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 1/7/21 10:14 AM 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : bert_for_generation.py 6 | 7 | import tensorflow as tf 8 | 9 | from aispace.utils.hparams import Hparams 10 | from aispace.utils.generation_tf_utils import TFGenerationMixin 11 | from aispace.models.base_model import BaseModel 12 | from aispace.layers.pretrained.bert import Bert 13 | from aispace.utils.tf_utils import get_initializer 14 | from aispace.layers import BaseLayer 15 | 16 | __all__ = [ 17 | 'BertForTextGeneration' 18 | ] 19 | 20 | 21 | @BaseModel.register("bert_for_text_generation") 22 | class BertForTextGeneration(BaseModel, TFGenerationMixin): 23 | def __init__(self, hparams: Hparams, **kwargs): 24 | super(BertForTextGeneration, self).__init__(hparams, **kwargs) 25 | pretrained_hparams = hparams.pretrained 26 | 27 | assert pretrained_hparams.norm_name in ['gpt2'], \ 28 | ValueError(f"{pretrained_hparams.norm_name} not be supported.") 29 | self.transformer = BaseLayer.by_name(pretrained_hparams.norm_name)(pretrained_hparams) 30 | 31 | def call(self, inputs=None, **kwargs): 32 | transformer_outputs = self.transformer(inputs, **kwargs) 33 | hidden_states = transformer_outputs[0] 34 | logits = self.transformer.wte(hidden_states, mode="linear") 35 | 36 | return (logits,) + transformer_outputs[2:] 37 | 38 | def get_output_embeddings(self): 39 | return self.transformer.wte 40 | 41 | def prepare_inputs_for_generation(self, inputs, past, **kwargs): 42 | # only last token for inputs_ids if 
past is defined in kwargs 43 | if past: 44 | inputs = tf.expand_dims(inputs[:, -1], -1) 45 | 46 | return inputs, {"past": past, "use_cache": kwargs["use_cache"]} 47 | 48 | def deploy(self): 49 | pass 50 | -------------------------------------------------------------------------------- /configs/glue_zh/csl.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - "../pretrain/ernie.yml" 3 | 4 | model_name: bert_for_classification 5 | 6 | model_attributes: 7 | hidden_dropout_prob: 0.5 8 | initializer_range: 0.02 9 | hidden_size: 1024 10 | 11 | training: 12 | # policy: 13 | # name: "k-fold" 14 | # config: 15 | # k: 5 16 | learning_rate: 1e-5 17 | max_epochs: 30 18 | batch_size: 32 19 | 20 | # optimizer: 21 | # name: adam 22 | 23 | callbacks: 24 | # callback name 25 | early_stopping: 26 | switch: true 27 | config: 28 | patience: 2 29 | lr_finder: 30 | switch: false 31 | config: 32 | end_lr: 1e-4 33 | 34 | optimizer_wrappers: 35 | swa: 36 | switch: false 37 | config: 38 | start_epoch: 5 39 | 40 | dataset: 41 | name: glue_zh/csl 42 | data_dir: "./data" 43 | transformer: "glue_zh/csl" 44 | 45 | source: 46 | train: "train[:80%]" 47 | validation: "train[-20%:]" 48 | test: "validation" 49 | 50 | inputs: 51 | - name: input_ids 52 | column: input_ids 53 | type: LIST_OF_INT 54 | max_len: 128 55 | - name: token_type_ids 56 | column: token_type_ids 57 | type: LIST_OF_INT 58 | max_len: 128 59 | - name: attention_mask 60 | column: attention_mask 61 | type: LIST_OF_INT 62 | max_len: 128 63 | 64 | outputs: 65 | - name: output_1 66 | column: label 67 | type: CLASSLABEL 68 | num: 2 69 | labels: ["0", "1"] 70 | loss: 71 | name: sparse_categorical_crossentropy 72 | config: 73 | from_logits: true 74 | metrics: 75 | - name: sparse_categorical_accuracy 76 | - name: sparse_f1_score 77 | config: 78 | name: "macro_f1" 79 | num_classes: 2 80 | average: "macro" 81 | 82 | pretrained: 83 | name: ERNIE_1.0_max-len-512 84 | init_from_pretrained: true 85 | 86 | 87 | -------------------------------------------------------------------------------- /configs/glue_zh/afqmc.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - "../pretrain/ernie.yml" 3 | 4 | model_name: bert_for_classification 5 | 6 | model_attributes: 7 | hidden_dropout_prob: 0.5 8 | initializer_range: 0.02 9 | hidden_size: 1024 10 | 11 | training: 12 | # policy: 13 | # name: "k-fold" 14 | # config: 15 | # k: 5 16 | learning_rate: 1e-5 17 | max_epochs: 30 18 | batch_size: 8 19 | 20 | # optimizer: 21 | # name: adam 22 | 23 | callbacks: 24 | # callback name 25 | early_stopping: 26 | switch: true 27 | config: 28 | patience: 2 29 | lr_finder: 30 | switch: false 31 | config: 32 | end_lr: 1e-4 33 | 34 | optimizer_wrappers: 35 | swa: 36 | switch: false 37 | config: 38 | start_epoch: 5 39 | 40 | dataset: 41 | name: glue_zh/afqmc 42 | data_dir: "./data" 43 | transformer: "glue_zh/afqmc" 44 | 45 | source: 46 | train: "train[:80%]" 47 | validation: "train[-20%:]" 48 | test: "validation" 49 | 50 | inputs: 51 | - name: input_ids 52 | column: input_ids 53 | type: LIST_OF_INT 54 | max_len: 100 55 | - name: token_type_ids 56 | column: token_type_ids 57 | type: LIST_OF_INT 58 | max_len: 100 59 | - name: attention_mask 60 | column: attention_mask 61 | type: LIST_OF_INT 62 | max_len: 100 63 | 64 | outputs: 65 | - name: output_1 66 | column: label 67 | type: CLASSLABEL 68 | num: 2 69 | labels: ["0", "1"] 70 | loss: 71 | name: sparse_categorical_crossentropy 72 | config: 73 | 
from_logits: true 74 | metrics: 75 | - name: sparse_categorical_accuracy 76 | - name: sparse_f1_score 77 | config: 78 | name: "macro_f1" 79 | num_classes: 2 80 | average: "macro" 81 | 82 | pretrained: 83 | name: ERNIE_1.0_max-len-512 84 | init_from_pretrained: true 85 | 86 | 87 | -------------------------------------------------------------------------------- /configs/glue_zh/iflytek.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - "../pretrain/ernie.yml" 3 | 4 | model_name: bert_for_classification 5 | 6 | model_attributes: 7 | hidden_dropout_prob: 0.5 8 | initializer_range: 0.02 9 | hidden_size: 1024 10 | 11 | training: 12 | # policy: 13 | # name: "k-fold" 14 | # config: 15 | # k: 5 16 | learning_rate: 1e-5 17 | max_epochs: 30 18 | batch_size: 32 19 | 20 | # optimizer: 21 | # name: adam 22 | 23 | callbacks: 24 | # callback name 25 | early_stopping: 26 | switch: true 27 | config: 28 | patience: 2 29 | lr_finder: 30 | switch: false 31 | config: 32 | end_lr: 1e-4 33 | 34 | optimizer_wrappers: 35 | swa: 36 | switch: false 37 | config: 38 | start_epoch: 5 39 | 40 | dataset: 41 | name: glue_zh/iflytek 42 | data_dir: "./data" 43 | transformer: "glue_zh/iflytek" 44 | 45 | source: 46 | train: "train[:80%]" 47 | validation: "train[-20%:]" 48 | test: "validation" 49 | 50 | inputs: 51 | - name: input_ids 52 | column: input_ids 53 | type: LIST_OF_INT 54 | max_len: 512 55 | - name: token_type_ids 56 | column: token_type_ids 57 | type: LIST_OF_INT 58 | max_len: 512 59 | - name: attention_mask 60 | column: attention_mask 61 | type: LIST_OF_INT 62 | max_len: 512 63 | 64 | outputs: 65 | - name: output_1 66 | column: label 67 | type: CLASSLABEL 68 | num: 119 69 | labels: use_num 70 | loss: 71 | name: sparse_categorical_crossentropy 72 | config: 73 | from_logits: true 74 | metrics: 75 | - name: sparse_categorical_accuracy 76 | - name: sparse_f1_score 77 | config: 78 | name: "macro_f1" 79 | num_classes: 119 80 | average: "macro" 81 | 82 | pretrained: 83 | name: ERNIE_1.0_max-len-512 84 | init_from_pretrained: true 85 | 86 | 87 | -------------------------------------------------------------------------------- /aispace/layers/activations.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-11-05 14:04 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : activations.py 6 | 7 | import math 8 | import numpy as np 9 | import tensorflow as tf 10 | 11 | __all__ = [ 12 | "gelu", 13 | "gelu_new", 14 | "swish", 15 | "ACT2FN" 16 | ] 17 | 18 | 19 | def gelu(x): 20 | """Gaussian Error Linear Unit. 21 | 22 | This is a smoother version of the RELU. 23 | Original paper: https://arxiv.org/abs/1606.08415 24 | 25 | :param input: 26 | :return: 27 | """ 28 | # cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0))) 29 | cdf = 0.5 * (1.0 + tf.tanh( 30 | (math.sqrt(2 / math.pi) * (x + 0.044715 * tf.pow(x, 3))))) 31 | return x * cdf 32 | 33 | 34 | def gelu_new(x): 35 | """Gaussian Error Linear Unit. 36 | This is a smoother version of the RELU. 37 | Original paper: https://arxiv.org/abs/1606.08415 38 | Args: 39 | x: float Tensor to perform activation. 40 | Returns: 41 | `x` with the GELU activation applied. 
42 | """ 43 | cdf = 0.5 * (1.0 + tf.tanh( 44 | (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) 45 | return x * cdf 46 | 47 | 48 | def swish(x): 49 | return x * tf.sigmoid(x) 50 | 51 | 52 | ACT2FN = { 53 | "gelu": tf.keras.layers.Activation(gelu), 54 | "swish": tf.keras.layers.Activation(swish), 55 | "gelu_new": tf.keras.layers.Activation(gelu_new), 56 | "elu": tf.keras.activations.elu, 57 | "hard_sigmoid": tf.keras.activations.hard_sigmoid, 58 | "linear": tf.keras.activations.linear, 59 | "relu": tf.keras.activations.relu, 60 | "selu": tf.keras.activations.selu, 61 | "sigmoid": tf.keras.activations.sigmoid, 62 | "softmax": tf.keras.activations.softmax, 63 | "softplus": tf.keras.activations.softplus, 64 | "softsign": tf.keras.activations.softsign, 65 | "tanh": tf.keras.activations.tanh 66 | } -------------------------------------------------------------------------------- /configs/qa/dureader_yesno.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - "../pretrain/ernie.yml" 3 | 4 | model_name: bert_for_classification 5 | 6 | model_attributes: 7 | hidden_dropout_prob: 0.5 8 | initializer_range: 0.02 9 | hidden_size: 798 10 | 11 | training: 12 | # policy: 13 | # name: "k-fold" 14 | # config: 15 | # k: 5 16 | learning_rate: 1e-5 17 | max_epochs: 10 18 | batch_size: 32 19 | 20 | callbacks: 21 | # callback name 22 | early_stopping: 23 | switch: false 24 | config: 25 | patience: 2 26 | # lr_finder: 27 | # switch: true 28 | # 29 | # optimizer: 30 | # name: adam 31 | 32 | optimizer_wrappers: 33 | swa: 34 | switch: true 35 | config: 36 | start_epoch: 5 37 | lr_multiplier: 38 | switch: false 39 | config: 40 | multipliers: 41 | bert_for_qa/bert: 0.1 42 | 43 | dataset: 44 | name: "dureader/yesno" 45 | data_dir: "./data" 46 | transformer: "dureader/yesno" 47 | 48 | source: 49 | train: "train + validation" 50 | validation: 'validation[:50%]' 51 | test: "validation[-50%:]" 52 | 53 | tokenizer: 54 | max_query_length: 64 55 | 56 | inputs: 57 | - name: input_ids 58 | column: input_ids 59 | type: LIST_OF_INT 60 | max_len: 512 61 | - name: token_type_ids 62 | column: segment_ids 63 | type: LIST_OF_INT 64 | max_len: 512 65 | - name: attention_mask 66 | column: input_mask 67 | type: LIST_OF_INT 68 | max_len: 512 69 | 70 | outputs: 71 | - name: output_1 72 | column: label 73 | type: CLASSLABEL 74 | num: 3 75 | labels: ['no', 'yes', 'depends'] 76 | loss: 77 | name: sparse_categorical_crossentropy 78 | config: 79 | from_logits: true 80 | metrics: 81 | - name: sparse_categorical_accuracy 82 | 83 | pretrained: 84 | name: ERNIE_1.0_max-len-512 85 | init_from_pretrained: true 86 | 87 | 88 | -------------------------------------------------------------------------------- /configs/glue_zh/cmnli.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - "../pretrain/ernie.yml" 3 | 4 | model_name: bert_for_classification 5 | 6 | model_attributes: 7 | hidden_dropout_prob: 0.5 8 | initializer_range: 0.02 9 | hidden_size: 1024 10 | 11 | training: 12 | # policy: 13 | # name: "k-fold" 14 | # config: 15 | # k: 5 16 | learning_rate: 1e-5 17 | max_epochs: 30 18 | batch_size: 32 19 | 20 | # optimizer: 21 | # name: adam 22 | 23 | callbacks: 24 | # callback name 25 | early_stopping: 26 | switch: true 27 | config: 28 | patience: 2 29 | lr_finder: 30 | switch: false 31 | config: 32 | end_lr: 1e-4 33 | 34 | optimizer_wrappers: 35 | swa: 36 | switch: false 37 | config: 38 | start_epoch: 5 39 | 40 | dataset: 41 | name: glue_zh/cmnli 
42 | data_dir: "./data" 43 | transformer: "glue_zh/cmnli" 44 | 45 | source: 46 | train: "train[:80%]" 47 | validation: "train[-20%:]" 48 | test: "validation" 49 | 50 | inputs: 51 | - name: input_ids 52 | column: input_ids 53 | type: LIST_OF_INT 54 | max_len: 128 55 | - name: token_type_ids 56 | column: token_type_ids 57 | type: LIST_OF_INT 58 | max_len: 128 59 | - name: attention_mask 60 | column: attention_mask 61 | type: LIST_OF_INT 62 | max_len: 128 63 | 64 | outputs: 65 | - name: output_1 66 | column: label 67 | type: CLASSLABEL 68 | num: 3 69 | labels: ["neutral", "entailment", "contradiction"] 70 | loss: 71 | name: sparse_categorical_crossentropy 72 | config: 73 | from_logits: true 74 | metrics: 75 | - name: sparse_categorical_accuracy 76 | - name: sparse_f1_score 77 | config: 78 | name: "macro_f1" 79 | num_classes: 3 80 | average: "macro" 81 | 82 | pretrained: 83 | name: ERNIE_1.0_max-len-512 84 | init_from_pretrained: true 85 | 86 | 87 | -------------------------------------------------------------------------------- /configs/2020_LSTC/DuEE_trigger_as_classifier.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - "../pretrain/bert_wwm.yml" 3 | 4 | model_name: bert_for_classification 5 | 6 | model_attributes: 7 | hidden_dropout_prob: 0.5 8 | initializer_range: 0.02 9 | hidden_size: 1024 10 | 11 | training: 12 | learning_rate: 3e-5 13 | max_epochs: 30 14 | batch_size: 32 15 | # steps_per_epoch: 100 16 | callbacks: 17 | # callback name 18 | early_stopping: 19 | switch: true 20 | config: 21 | patience: 2 22 | 23 | dataset: 24 | name: lstc_2020/DuEE_trigger 25 | data_dir: "./data" 26 | transformer: "lstc_2020/DuEE_trigger_as_classifier" 27 | 28 | source: 29 | train: "train" 30 | validation: "validation[:50%]" 31 | test: "validation[-50%:]" 32 | 33 | tokenizer: 34 | max_len: 150 35 | 36 | inputs: 37 | - name: input_ids 38 | column: input_ids 39 | type: LIST_OF_INT 40 | max_len: 150 41 | - name: token_type_ids 42 | column: token_type_ids 43 | type: LIST_OF_INT 44 | max_len: 150 45 | - name: attention_mask 46 | column: attention_mask 47 | type: LIST_OF_INT 48 | max_len: 150 49 | 50 | outputs: 51 | - name: output_1 52 | column: event_labels 53 | type: LIST_OF_INT 54 | num: 65 55 | labels: 56 | url: "https://ai.baidu.com/file/9C92719AF96D4DDB96477BFBE1435262" 57 | name: "duee_event_type_labels" 58 | loss: 59 | name: sigmoid_focal_crossentropy 60 | # name: categorical_crossentropy 61 | config: 62 | from_logits: true 63 | reduction: "sum" 64 | metrics: 65 | - name: categorical_accuracy 66 | - name: f1_score 67 | config: 68 | name: "macro_f1" 69 | num_classes: 65 70 | average: "macro" 71 | - name: f1_score 72 | config: 73 | name: "micro_f1" 74 | num_classes: 65 75 | average: "micro" 76 | 77 | 78 | pretrained: 79 | name: chinese_roberta_wwm_large_ext 80 | init_from_pretrained: true 81 | 82 | 83 | -------------------------------------------------------------------------------- /configs/2020_LSTC/DuEE_role.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | # - "../pretrain/bert_wwm.yml" 3 | - "../pretrain/ernie.yml" 4 | 5 | model_name: bert_for_role_ner 6 | 7 | model_attributes: 8 | hidden_dropout_prob: 0.5 9 | initializer_range: 0.02 10 | hidden_size: 1024 11 | 12 | training: 13 | learning_rate: 1e-4 14 | max_epochs: 30 15 | batch_size: 32 16 | callbacks: 17 | # callback name 18 | early_stopping: 19 | switch: true 20 | config: 21 | patience: 2 22 | 23 | dataset: 24 | name: 
"lstc_2020/DuEE_role" 25 | data_dir: "./data" 26 | transformer: "lstc_2020/DuEE_role" 27 | 28 | source: 29 | train: "train" 30 | validation: "validation[:50%]" 31 | test: "validation[-50%:]" 32 | 33 | tokenizer: 34 | max_len: 100 35 | 36 | inputs: 37 | - name: input_ids 38 | column: input_ids 39 | type: LIST_OF_INT 40 | max_len: 100 41 | - name: token_type_ids 42 | column: token_type_ids 43 | type: LIST_OF_INT 44 | max_len: 100 45 | - name: attention_mask 46 | column: attention_mask 47 | type: LIST_OF_INT 48 | max_len: 100 49 | - name: position_ids 50 | column: position_ids 51 | type: LIST_OF_INT 52 | max_len: 100 53 | - name: label_mask 54 | column: label_mask 55 | type: LIST_OF_INT 56 | max_len: 243 57 | 58 | outputs: 59 | - name: output_1 60 | column: labels 61 | type: LIST_OF_CLASSLABEL 62 | task: NER 63 | num: 0 64 | labels: 65 | url: "https://ai.baidu.com/file/9C92719AF96D4DDB96477BFBE1435262" 66 | name: "duee_role_ner_labels" 67 | loss: 68 | name: myself_crf_loss 69 | # name: sparse_categorical_crossentropy 70 | # config: 71 | # from_logits: true 72 | metrics: 73 | - name: sparse_categorical_accuracy 74 | - name: sparse_f1_score 75 | config: 76 | num_classes: 243 77 | average: "macro" 78 | name: "macro_f1" 79 | 80 | pretrained: 81 | # name: chinese_roberta_wwm_ext 82 | name: ERNIE_1.0_max-len-512 83 | init_from_pretrained: true 84 | 85 | 86 | -------------------------------------------------------------------------------- /configs/custom/ewn.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - "../pretrain/ernie.yml" 3 | # - "../pretrain/albert_chinese.yml" 4 | 5 | model_name: bert_for_classification 6 | 7 | model_attributes: 8 | hidden_dropout_prob: 0.5 9 | initializer_range: 0.02 10 | hidden_size: 1024 11 | 12 | training: 13 | # policy: 14 | # name: "k-fold" 15 | # config: 16 | # k: 5 17 | learning_rate: 1e-5 18 | max_epochs: 30 19 | batch_size: 32 20 | 21 | # optimizer: 22 | # name: adam 23 | 24 | callbacks: 25 | # callback name 26 | early_stopping: 27 | switch: true 28 | config: 29 | patience: 2 30 | lr_finder: 31 | switch: false 32 | config: 33 | end_lr: 1e-4 34 | 35 | optimizer_wrappers: 36 | swa: 37 | switch: false 38 | config: 39 | start_epoch: 5 40 | 41 | dataset: 42 | name: entity_with_nationality 43 | data_dir: "./data" 44 | transformer: "entity_with_nationality" 45 | 46 | source: 47 | train: "train" 48 | validation: "validation" 49 | test: "test" 50 | 51 | tokenizer: 52 | max_len: 20 53 | 54 | inputs: 55 | - name: input_ids 56 | column: input_ids 57 | type: LIST_OF_INT 58 | max_len: 20 59 | - name: token_type_ids 60 | column: token_type_ids 61 | type: LIST_OF_INT 62 | max_len: 20 63 | - name: attention_mask 64 | column: attention_mask 65 | type: LIST_OF_INT 66 | max_len: 20 67 | 68 | outputs: 69 | - name: output_1 70 | column: label 71 | type: CLASSLABEL 72 | num: 2 73 | labels: use_num 74 | loss: 75 | name: sparse_categorical_crossentropy 76 | config: 77 | from_logits: true 78 | metrics: 79 | - name: sparse_categorical_accuracy 80 | - name: sparse_f1_score 81 | config: 82 | name: "macro_f1" 83 | num_classes: 2 84 | average: "macro" 85 | 86 | pretrained: 87 | name: ERNIE_1.0_max-len-512 88 | # name: albert_base_zh 89 | init_from_pretrained: true 90 | config: 91 | layers: 92 | start: 0 93 | end: 4 94 | step: 1 95 | 96 | 97 | -------------------------------------------------------------------------------- /aispace/layers/metrics/f1_score.py: -------------------------------------------------------------------------------- 
1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-05-20 12:40 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : f1_score.py 6 | 7 | import tensorflow as tf 8 | import tensorflow_addons as tfa 9 | 10 | from aispace.utils.tf_utils import get_shape 11 | 12 | __all__ = [ 13 | "SparseF1Score" 14 | ] 15 | 16 | 17 | class SparseF1Score(tfa.metrics.FBetaScore): 18 | def __init__( 19 | self, 20 | num_classes, 21 | average: str = None, 22 | threshold=None, 23 | name: str = "sparse_f1_score", 24 | dtype=None, 25 | **kwargs 26 | ): 27 | super(SparseF1Score, self).__init__(num_classes, average, 1.0, name=name, dtype=dtype) 28 | self.threshold = threshold 29 | 30 | def get_config(self): 31 | base_config = super().get_config() 32 | del base_config["beta"] 33 | return base_config 34 | 35 | def update_state(self, y_true, y_pred, sample_weight=None): 36 | if self.threshold is None: 37 | threshold = tf.reduce_max(y_pred, axis=-1, keepdims=True) 38 | # make sure [0, 0, 0] doesn't become [1, 1, 1] 39 | # Use abs(x) > eps, instead of x != 0 to check for zero 40 | y_pred = tf.logical_and(y_pred >= threshold, tf.abs(y_pred) > 1e-12) 41 | else: 42 | y_pred = y_pred > self.threshold 43 | 44 | y_true = tf.cast(y_true, tf.int32) 45 | y_true = tf.one_hot(y_true, self.num_classes) 46 | y_true = tf.reshape(y_true, [-1, self.num_classes]) 47 | y_pred = tf.reshape(y_pred, [-1, self.num_classes]) 48 | 49 | y_true = tf.cast(y_true, tf.int32) 50 | y_pred = tf.cast(y_pred, tf.int32) 51 | 52 | def _count_non_zero(val): 53 | non_zeros = tf.math.count_nonzero(val, axis=self.axis) 54 | return tf.cast(non_zeros, self.dtype) 55 | 56 | self.true_positives.assign_add(_count_non_zero(y_pred * y_true)) 57 | self.false_positives.assign_add(_count_non_zero(y_pred * (y_true - 1))) 58 | self.false_negatives.assign_add(_count_non_zero((y_pred - 1) * y_true)) 59 | self.weights_intermediate.assign_add(_count_non_zero(y_true)) -------------------------------------------------------------------------------- /configs/custom/gov_title_trigger.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - "../pretrain/ernie.yml" 3 | 4 | model_name: bert_for_ner 5 | 6 | model_attributes: 7 | hidden_dropout_prob: 0.5 8 | initializer_range: 0.02 9 | hidden_size: 1024 10 | 11 | training: 12 | # policy: 13 | # name: "k-fold" 14 | # config: 15 | # k: 5 16 | learning_rate: 1e-5 17 | max_epochs: 30 18 | batch_size: 8 19 | 20 | # optimizer: 21 | # name: adam 22 | 23 | callbacks: 24 | # callback name 25 | early_stopping: 26 | switch: true 27 | config: 28 | patience: 2 29 | lr_finder: 30 | switch: false 31 | config: 32 | end_lr: 1e-4 33 | 34 | optimizer_wrappers: 35 | swa: 36 | switch: false 37 | config: 38 | start_epoch: 5 39 | 40 | dataset: 41 | name: gov_title/trigger 42 | data_dir: "./data" 43 | data_path: "./data/downloads/extracted/gov_title/gov_title_trigger.txt" 44 | transformer: "gov_title/trigger" 45 | 46 | source: 47 | train: "train[:80%]" 48 | validation: "train[80%:90%]" 49 | test: "train[-10%:]" 50 | 51 | tokenizer: 52 | max_len: 512 53 | 54 | inputs: 55 | - name: input_ids 56 | column: input_ids 57 | type: LIST_OF_INT 58 | max_len: 512 59 | - name: token_type_ids 60 | column: token_type_ids 61 | type: LIST_OF_INT 62 | max_len: 512 63 | - name: attention_mask 64 | column: attention_mask 65 | type: LIST_OF_INT 66 | max_len: 512 67 | 68 | outputs: 69 | - name: output_1 70 | column: label 71 | type: LIST_OF_CLASSLABEL 72 | task: NER 73 | num: 3 74 | labels: ["B-TITLE", 
"I-TITLE", "O"] 75 | loss: 76 | # name: myself_crf_loss 77 | # name: sparse_softmax_focal_crossentropy 78 | name: sparse_categorical_crossentropy 79 | config: 80 | from_logits: True 81 | metrics: 82 | - name: sparse_categorical_accuracy 83 | - name: sparse_f1_score 84 | config: 85 | name: "macro_f1" 86 | num_classes: 3 87 | average: "macro" 88 | 89 | pretrained: 90 | name: ERNIE_1.0_max-len-512 91 | init_from_pretrained: true 92 | # config: 93 | # layers: 94 | # start: 0 95 | # end: 4 96 | # step: 1 97 | 98 | 99 | -------------------------------------------------------------------------------- /aispace/models/classifications/textcnn_for_sequence_classification.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-11-05 20:44 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : bert_for_sequence_classification.py 6 | 7 | 8 | import tensorflow as tf 9 | 10 | from aispace.utils.hparams import Hparams 11 | from aispace.models.base_model import BaseModel 12 | from aispace.layers.encoders import TextcnnBlock 13 | from aispace.utils.tf_utils import get_initializer 14 | from aispace.layers import BaseLayer 15 | 16 | 17 | @BaseModel.register("textcnn_for_classification") 18 | class TextcnnForSeqClassification(BaseModel): 19 | def __init__(self, hparams: Hparams, **kwargs): 20 | super(TextcnnForSeqClassification, self).__init__(hparams, **kwargs) 21 | self.num_lables = hparams.dataset.outputs[0].num 22 | model_hparams = hparams.model_attributes 23 | 24 | self.embeddings = tf.keras.layers.Embedding( 25 | model_hparams.vocab_size, 26 | model_hparams.hidden_size, 27 | embeddings_initializer=get_initializer(model_hparams.initializer_range), 28 | name="embeddings" 29 | ) 30 | 31 | self.encoder = TextcnnBlock(model_hparams.filters, model_hparams.windows, model_hparams.initializer_range, name="textcnn_encoder") 32 | self.dropout = tf.keras.layers.Dropout( 33 | model_hparams.hidden_dropout_prob 34 | ) 35 | self.project = tf.keras.layers.Dense( 36 | model_hparams.hidden_size, 37 | kernel_initializer=get_initializer(model_hparams.initializer_range), 38 | name="project" 39 | ) 40 | self.classifier = tf.keras.layers.Dense( 41 | self.num_lables, 42 | kernel_initializer=get_initializer(model_hparams.initializer_range), 43 | name="classifier" 44 | ) 45 | 46 | def call(self, inputs, **kwargs): 47 | emb = self.embeddings(inputs['input_ids']) 48 | output = self.encoder(emb, **kwargs) 49 | 50 | project = self.project(output) 51 | 52 | project = self.dropout(project, training=kwargs.get('training', False)) 53 | 54 | logits = self.classifier(project) 55 | 56 | outputs = (logits,) 57 | 58 | return outputs # logits, (hidden_states), (attentions) 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /configs/glue_zh/wsc.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - "../pretrain/ernie.yml" 3 | 4 | model_name: bert_for_relation_extract 5 | 6 | model_attributes: 7 | hidden_dropout_prob: 0.5 8 | initializer_range: 0.02 9 | hidden_size: 1024 10 | num_attention_heads: 8 11 | attention_probs_dropout_prob: 0.5 12 | 13 | training: 14 | # policy: 15 | # name: "k-fold" 16 | # config: 17 | # k: 5 18 | learning_rate: 1e-5 19 | max_epochs: 30 20 | batch_size: 32 21 | 22 | # optimizer: 23 | # name: adam 24 | 25 | callbacks: 26 | # callback name 27 | early_stopping: 28 | switch: true 29 | config: 30 | patience: 2 31 | lr_finder: 
32 | switch: false 33 | config: 34 | end_lr: 1e-4 35 | 36 | optimizer_wrappers: 37 | swa: 38 | switch: false 39 | config: 40 | start_epoch: 5 41 | 42 | dataset: 43 | name: glue_zh/wsc 44 | data_dir: "./data" 45 | transformer: "glue_zh/wsc" 46 | 47 | source: 48 | train: "train[:80%]" 49 | validation: "train[-20%:]" 50 | test: "validation" 51 | 52 | tokenizer: 53 | max_len: 100 54 | 55 | inputs: 56 | - name: input_ids 57 | column: input_ids 58 | type: LIST_OF_INT 59 | max_len: 512 60 | - name: token_type_ids 61 | column: token_type_ids 62 | type: LIST_OF_INT 63 | max_len: 512 64 | - name: attention_mask 65 | column: attention_mask 66 | type: LIST_OF_INT 67 | max_len: 512 68 | - name: entity_span_start 69 | column: entity_span_start 70 | type: LIST_OF_INT 71 | max_len: 2 72 | - name: entity_span_end 73 | column: entity_span_end 74 | type: LIST_OF_INT 75 | max_len: 2 76 | 77 | outputs: 78 | - name: output_1 79 | column: label 80 | type: CLASSLABEL 81 | num: 2 82 | labels: ["true", "false"] 83 | loss: 84 | name: sparse_categorical_crossentropy 85 | config: 86 | from_logits: true 87 | metrics: 88 | - name: sparse_categorical_accuracy 89 | - name: sparse_f1_score 90 | config: 91 | name: "macro_f1" 92 | num_classes: 2 93 | average: "macro" 94 | 95 | pretrained: 96 | name: ERNIE_1.0_max-len-512 97 | init_from_pretrained: true 98 | 99 | 100 | -------------------------------------------------------------------------------- /tests/test_glue_zh.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-12-23 15:01 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : test_glue.py 6 | 7 | import os, sys 8 | import tensorflow_datasets as tfds 9 | import unittest 10 | 11 | sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 12 | 13 | from tensorflow_datasets.core.download import DownloadConfig 14 | 15 | from aispace.datasets import * 16 | from aispace.utils.hparams import Hparams 17 | from aispace.utils.builder_utils import load_dataset 18 | 19 | 20 | class TestGlue(unittest.TestCase): 21 | def test_glue_load(self): 22 | hparams = Hparams() 23 | hparams.load_from_config_file("../configs/custom/ewn.yml") 24 | hparams.stand_by() 25 | # checksum_dir = "../aispace/datasets/url_checksums" 26 | # tfds.download.add_checksums_dir(checksum_dir) 27 | # download_config = DownloadConfig(register_checksums=True) 28 | # cmrc2018 = tfds.load("glue_zh/cmrc2018", 29 | # # data_dir="/search/data1/yyk/data/datasets/glue_zh", 30 | # data_dir="../data/glue_zh", 31 | # builder_kwargs={'hparams': hparams}, 32 | # download_and_prepare_kwargs={'download_config': download_config} 33 | # ) 34 | 35 | # train_dataset, dev_dataset, dataset_info = next(load_dataset(hparams, ret_test=False)) 36 | test_dataset = next(load_dataset(hparams, ret_train=False, ret_dev=True, ret_test=False,ret_info=False))[0] 37 | 38 | total, zero = 0, 0 39 | for itm in test_dataset: 40 | print(itm) 41 | break 42 | # tt = itm[0]['start_position'].numpy().tolist() 43 | # print(itm[0]['p_mask'].numpy().tolist()) 44 | # print(itm[0]['start_position'].numpy().tolist()) 45 | # print(itm[0]['end_position'].numpy().tolist()) 46 | # break 47 | # total += len(tt) 48 | # zero += len([t for t in tt if t == 0]) 49 | # print() 50 | # print(f"{zero}, {total}, {zero / float(total)}") 51 | 52 | # python -u aispace/trainer.py \ 53 | # --experiment_name test \ 54 | # --model_name bert_for_classification \ 55 | # --schedule train_and_eval \ 56 | # 
--config_name tnews \ 57 | # --config_dir ./configs/glue_zh \ 58 | # --gpus 0 1 2 3 -------------------------------------------------------------------------------- /configs/pretrain/gpt.yml: -------------------------------------------------------------------------------- 1 | # config for ernie 2 | 3 | includes: 4 | - "../base.yml" 5 | 6 | dataset: 7 | tokenizer: 8 | name: cpm_tokenizer 9 | vocab: 10 | filename: null 11 | special_tokens: 12 | PAD: "" 13 | UNK: "" 14 | SEP: "" 15 | MASK: "" 16 | EOD: "" 17 | BOS: "" 18 | EOS: "" 19 | errors: 'replace' 20 | tokenize_chinese_chars: True 21 | do_lower_case: True 22 | do_basic_tokenize: True 23 | non_split_tokens: null 24 | max_len: 512 25 | 26 | pretrained: 27 | norm_name: gpt2 28 | name: CPM-LM-TF2 29 | adapter: tf_huggingface_gpt2_adapter 30 | force_download: false 31 | init_from_pretrained: true 32 | cache_dir: /search/odin/yyk/data/pretrained/gpt # your path to save models 33 | model_path: null 34 | vocab_path: null 35 | config_path: null 36 | config: 37 | output_attentions: false 38 | output_hidden_states: false 39 | use_cache: true 40 | use_return_dict: false 41 | hidden_size: 2560 42 | layer_norm_eps: 1e-12 43 | 44 | ref: https://github.com/qhduan/CPM-LM-TF2 45 | family: 46 | CPM-LM-TF2: 47 | model: 48 | # your/path/to/gpt 49 | url: /search/odin/yyk/data/pretrained/gpt/cpm-lm-tf2_v2 50 | # suffix: bert_model.ckpt 51 | to_insert_paths: 52 | - pretrained.model_path 53 | vocab: 54 | # your/path/to/ERNIE_stable-1.0.1/vocab.txt 55 | url: /search/odin/yyk/data/pretrained/gpt/cpm-lm-tf2_v2/chinese_vocab.model 56 | to_insert_paths: 57 | - pretrained.vocab_path 58 | - dataset.tokenizer.vocab.filename 59 | config: 60 | # your/path/to/config or url 61 | # url: https://huggingface.co/gpt2/resolve/main/config.json 62 | url: /search/odin/yyk/data/pretrained/gpt/cpm-lm-tf2_v2/config.json 63 | to_insert_paths: # set the pretrained.config_path with saved path of this file. 64 | - pretrained.config_path 65 | to_replaces: # replace pretrained.config with the json content. 
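  # Family entries are not limited to local paths: configs/pretrain/xlnet.yml
  # uses the same model/vocab/config layout with remote (s3) URLs, and the
  # to_insert_paths / to_replaces bookkeeping behaves identically in both cases.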
66 | - pretrained.config -------------------------------------------------------------------------------- /aispace/layers/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-11-14 20:50 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : __init__.py 6 | 7 | import tensorflow as tf 8 | import tensorflow_addons as tfa 9 | from prettytable import PrettyTable 10 | 11 | from .adam_weight_decay_optimizer import create_awdwwu_optimizer 12 | from .lr_multiplier import LRMultiplier 13 | # from keras_lr_multiplier import LRMultiplier 14 | 15 | __all__ = [ 16 | "OPTIMIZERS", 17 | "OPTIMIZER_WRAPPER", 18 | "print_available" 19 | ] 20 | 21 | OPTIMIZERS = { 22 | 'sgd': 23 | lambda training_hparams: tf.keras.optimizers.SGD( 24 | learning_rate=training_hparams.learning_rate 25 | ), 26 | 'adam': 27 | lambda training_hparams: tf.keras.optimizers.Adam( 28 | learning_rate=training_hparams.learning_rate, epsilon=1e-08, clipnorm=1.0), 29 | 'adam_weight_decay_with_warm_up': create_awdwwu_optimizer, 30 | 'radam': 31 | lambda training_hparams: 32 | tfa.optimizers.RectifiedAdam(training_hparams.learning_rate, 33 | total_steps=training_hparams.steps_per_epoch * training_hparams.max_epochs, 34 | warmup_proportion=min(training_hparams.warmup_factor, 35 | float(training_hparams.steps_per_epoch) / ( 36 | training_hparams.steps_per_epoch * training_hparams.max_epochs)), 37 | weight_decay=0.01) 38 | } 39 | 40 | OPTIMIZER_WRAPPER = { 41 | 'swa': 42 | lambda opt, training_hparams: 43 | tfa.optimizers.SWA(opt, start_averaging=training_hparams.steps_per_epoch * 44 | training_hparams.optimizer_wrappers.get('swa').config.start_epoch, 45 | average_period=training_hparams.steps_per_epoch // 10), 46 | # TODO 47 | # 'lr_multiplier': lambda opt, training_hparams: LRMultiplier(opt, multipliers=training_hparams.optimizer_wrappers.get('lr_multiplier').config.multipliers) 48 | } 49 | 50 | 51 | def print_available(): 52 | table = PrettyTable(["NAME"]) 53 | for key in OPTIMIZERS: 54 | table.add_row([key]) 55 | 56 | for key in OPTIMIZER_WRAPPER: 57 | table.add_row([key]) 58 | print() 59 | print(table) 60 | -------------------------------------------------------------------------------- /aispace/utils/file_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-11-04 19:53 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : file_utils.py 6 | 7 | import sys, os 8 | from pathlib import Path 9 | import tensorflow as tf 10 | import tarfile 11 | import zipfile 12 | from six.moves import urllib 13 | import requests 14 | import logging 15 | 16 | __all__ = [ 17 | "default_download_dir", 18 | "set_default_download_dir", 19 | "DEFAULT_AISPACE_DOWNLOAD_DIR", 20 | "maybe_create_dir", 21 | ] 22 | 23 | DEFAULT_AISPACE_DOWNLOAD_DIR = None 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | 28 | def default_download_dir(name): 29 | r"""Return the directory to which packages will be downloaded by default. 30 | """ 31 | global DEFAULT_AISPACE_DOWNLOAD_DIR # pylint: disable=global-statement 32 | if DEFAULT_AISPACE_DOWNLOAD_DIR is None: 33 | if sys.platform == 'win32' and 'AISPACEDATA' in os.environ: 34 | # On Windows, use %APPDATA% 35 | home_dir = Path(os.environ['AISPACEDATA']) 36 | else: 37 | # Otherwise, install in the user's home directory. 
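            # Falls back to $HOME/aispace_data; the chosen directory must be
            # writable (otherwise a ValueError is raised below), and callers can
            # point the cache elsewhere via set_default_download_dir().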
38 | home_dir = Path(os.environ["HOME"]) 39 | 40 | if os.access(str(home_dir), os.W_OK): 41 | DEFAULT_AISPACE_DOWNLOAD_DIR = home_dir / 'aispace_data' 42 | else: 43 | raise ValueError("The path {} is not writable. Please manually " 44 | "specify the download directory".format(home_dir)) 45 | 46 | if not DEFAULT_AISPACE_DOWNLOAD_DIR.exists(): 47 | DEFAULT_AISPACE_DOWNLOAD_DIR.mkdir(parents=True) 48 | 49 | return DEFAULT_AISPACE_DOWNLOAD_DIR / name 50 | 51 | 52 | def set_default_download_dir(path): 53 | if isinstance(path, str): 54 | path = Path(path) 55 | elif not isinstance(path, Path): 56 | raise ValueError("`path` must be a string or a pathlib.Path object") 57 | 58 | if not os.access(str(path), os.W_OK): 59 | raise ValueError( 60 | "The specified download directory {} is not writable".format(path)) 61 | 62 | global DEFAULT_AISPACE_DOWNLOAD_DIR # pylint: disable=global-statement 63 | DEFAULT_AISPACE_DOWNLOAD_DIR = path 64 | 65 | 66 | def maybe_create_dir(dirname): 67 | """Creates directory if doesn't exist 68 | """ 69 | if not tf.io.gfile.isdir(dirname): 70 | tf.io.gfile.makedirs(dirname) 71 | return True 72 | return False 73 | -------------------------------------------------------------------------------- /aispace/layers/decoders/crf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-11-15 11:15 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : crf.py 6 | 7 | 8 | import tensorflow as tf 9 | import tensorflow_addons as tfa 10 | 11 | from aispace.utils.tf_utils import get_initializer, get_shape 12 | 13 | 14 | __all__ = [ 15 | "CRFLayer" 16 | ] 17 | 18 | 19 | class CRFLayer(tf.keras.layers.Layer): 20 | def __init__(self, 21 | num_labels, 22 | initializer_range, 23 | label_mask=None, 24 | **kwargs): 25 | super(CRFLayer, self).__init__(**kwargs) 26 | self.num_labels = num_labels 27 | self.initializer_range = initializer_range 28 | if label_mask is not None: 29 | self.label_mask = tf.constant(label_mask) 30 | else: 31 | self.label_mask = None 32 | 33 | def build(self, input_shape): 34 | self.transition_params = self.add_weight( 35 | "transition_params", 36 | shape=[self.num_labels, self.num_labels], 37 | initializer=get_initializer(self.initializer_range) 38 | ) 39 | if self.label_mask is not None: 40 | label_mask = tf.cast(self.label_mask, tf.float32) 41 | label_mask = (1.0 - label_mask) * -10000.0 42 | self.transition_params += label_mask 43 | 44 | super(CRFLayer, self).build(input_shape) 45 | 46 | def call(self, inputs, **kwargs): 47 | score, self.sequence_length = inputs 48 | viterbi, viterbi_score = tfa.text.crf_decode(score, self.transition_params, self.sequence_length) 49 | return viterbi, viterbi_score 50 | 51 | def loss(self, y_true, y_pred): 52 | """Computes the log-likelihood of tag sequences in a CRF. 53 | Args: 54 | y_true : A (batch_size, n_steps) tensor. 55 | y_pred : A (batch_size, n_steps, n_classes) tensor. 56 | Returns: 57 | loss: A scalar containing the log-likelihood of the given sequence of tag indices. 
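        Note:
            The scalar is actually the mean negative log-likelihood over the
            batch (see the body below), so it can be minimized directly as a
            training loss.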
58 | """ 59 | batch_size, n_steps, _ = get_shape(y_pred) 60 | y_true = tf.cast(tf.reshape(y_true, [batch_size, n_steps]), dtype='int32') 61 | log_likelihood, self.transition_params = \ 62 | tfa.text.crf_log_likelihood(y_pred, y_true, self.sequence_length, self.transition_params) 63 | loss = tf.reduce_mean(-log_likelihood) 64 | return loss -------------------------------------------------------------------------------- /configs/custom/idiom_generator.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - "../pretrain/gpt.yml" 3 | - "../generation/text_generation.yml" 4 | 5 | model_name: bert_for_text_generation 6 | 7 | model_attributes: 8 | hidden_dropout_prob: 0.5 9 | initializer_range: 0.02 10 | hidden_size: 1024 11 | 12 | generation_attributes: 13 | do_sample: true 14 | num_beams: 3 15 | temperature: 0.7 16 | repetition_penalty: 2.0 17 | num_return_sequences: 1 18 | max_length: 300 19 | early_stopping: true 20 | 21 | training: 22 | # policy: 23 | # name: "k-fold" 24 | # config: 25 | # k: 5 26 | learning_rate: 1e-5 27 | max_epochs: 1 28 | batch_size: 4 29 | do_eval: false 30 | # optimizer: 31 | # name: adam 32 | 33 | callbacks: 34 | # callback name 35 | early_stopping: 36 | switch: true 37 | config: 38 | patience: 2 39 | lr_finder: 40 | switch: false 41 | config: 42 | end_lr: 1e-4 43 | 44 | optimizer_wrappers: 45 | swa: 46 | switch: false 47 | config: 48 | start_epoch: 5 49 | 50 | dataset: 51 | name: idiom/idiom_generator 52 | data_dir: "./data" 53 | transformer: "idiom/idiom_generator" 54 | 55 | source: 56 | train: "train[:80%]" 57 | validation: "train[80%:90%]" 58 | test: "train[-10%:]" 59 | 60 | tokenizer: 61 | max_len: 200 62 | 63 | inputs: 64 | - name: input_ids 65 | column: input_ids 66 | type: LIST_OF_INT 67 | max_len: 200 68 | # - name: token_type_ids 69 | # column: token_type_ids 70 | # type: LIST_OF_INT 71 | # max_len: 10 72 | - name: attention_mask 73 | column: attention_mask 74 | type: LIST_OF_INT 75 | max_len: 200 76 | 77 | outputs: 78 | - name: output_1 79 | column: label 80 | type: LIST_OF_CLASSLABEL 81 | num: 30000 82 | labels: vocab 83 | loss: 84 | name: sparse_categorical_crossentropy 85 | config: 86 | from_logits: false 87 | metrics: 88 | - name: sparse_categorical_accuracy 89 | - name: sparse_f1_score 90 | config: 91 | name: "macro_f1" 92 | num_classes: 30000 93 | average: "macro" 94 | 95 | pretrained: 96 | name: CPM-LM-TF2 97 | init_from_pretrained: true 98 | config: 99 | layers: 100 | start: 0 101 | end: 4 102 | step: 1 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('../../')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'AiSpace' 21 | copyright = '2020, yingyuankai@aliyun.com' 22 | author = 'yingyuankai@aliyun.com' 23 | 24 | # The full version, including alpha/beta/rc tags 25 | release = '0.1.0' 26 | 27 | 28 | # -- General configuration --------------------------------------------------- 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = [ 34 | 'sphinx.ext.autodoc', 35 | 'sphinx.ext.coverage', 36 | 'sphinx.ext.napoleon', 37 | 'recommonmark', 38 | 'sphinx.ext.viewcode', 39 | 'sphinx_markdown_tables' 40 | ] 41 | 42 | # Add any paths that contain templates here, relative to this directory. 
43 | templates_path = ['_templates'] 44 | 45 | # The suffix(es) of source filenames. 46 | # You can specify multiple suffix as a list of string: 47 | # 48 | source_suffix = ['.rst', '.md'] 49 | 50 | # The master toctree document. 51 | master_doc = 'index' 52 | 53 | # List of patterns, relative to source directory, that match files and 54 | # directories to ignore when looking for source files. 55 | # This pattern also affects html_static_path and html_extra_path. 56 | exclude_patterns = [] 57 | 58 | 59 | # -- Options for HTML output ------------------------------------------------- 60 | 61 | # The theme to use for HTML and HTML Help pages. See the documentation for 62 | # a list of builtin themes. 63 | # 64 | html_theme = 'sphinx_rtd_theme' 65 | 66 | # Add any paths that contain custom static files (such as style sheets) here, 67 | # relative to this directory. They are copied after the builtin static files, 68 | # so a file named "default.css" will overwrite the builtin "default.css". 69 | html_static_path = ['_static'] -------------------------------------------------------------------------------- /tests/test_gov_title.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-12-23 15:01 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : test_glue.py 6 | 7 | import os, sys 8 | import tensorflow as tf 9 | import tensorflow_datasets as tfds 10 | import unittest 11 | 12 | sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 13 | 14 | from tensorflow_datasets.core.download import DownloadConfig 15 | 16 | from aispace.datasets import * 17 | from aispace.utils.hparams import Hparams 18 | from aispace.utils.builder_utils import load_dataset 19 | 20 | 21 | class TestLSTC(unittest.TestCase): 22 | def test_lstc_load(self): 23 | hparams = Hparams() 24 | hparams.load_from_config_file("../configs/custom/gov_title_role.yml") 25 | hparams.stand_by() 26 | checksum_dir = "../aispace/datasets/url_checksums" 27 | tfds.download.add_checksums_dir(checksum_dir) 28 | # download_config = DownloadConfig(register_checksums=True) 29 | tnews = tfds.load("gov_title/role", 30 | # data_dir="/search/data1/yyk/data/datasets/glue_zh", 31 | data_dir="../data", 32 | builder_kwargs={'hparams': hparams}, 33 | # download_and_prepare_kwargs={'download_config': download_config} 34 | ) 35 | 36 | tokenizer = BertTokenizer(hparams.dataset.tokenizer) 37 | # id_to_label = {v: k for k, v in hparams.duee_event_type_labels.items()} 38 | label_counter = {} 39 | for itm in tnews["train"]: 40 | # for k, v in itm.items(): 41 | # if v.shape[0] == 151: 42 | # print(itm) 43 | # break 44 | print(itm) 45 | print() 46 | print(tokenizer.decode([int(t) for t in itm["input_ids"].numpy().tolist()])) 47 | break 48 | # l = hparams.dataset.outputs[0].labels[tf.argmax(itm["output_1"], -1).numpy().tolist()] 49 | # print(id_to_label[l]) 50 | # if id_to_label[l] not in label_counter: 51 | # label_counter[id_to_label[l]] = 0 52 | # label_counter[id_to_label[l]] += 1 53 | # print(label_counter) 54 | # print(len(label_counter)) 55 | 56 | # python -u aispace/trainer.py \ 57 | # --experiment_name test \ 58 | # --model_name bert_for_classification \ 59 | # --schedule train_and_eval \ 60 | # --config_name tnews \ 61 | # --config_dir ./configs/glue_zh \ 62 | # --gpus 0 1 2 3 -------------------------------------------------------------------------------- /aispace/utils/checkpoint_utils.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-12-18 14:20 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : checkpoint_utils.py 6 | 7 | __all__ = [ 8 | "average_checkpoints" 9 | ] 10 | 11 | import os 12 | import logging 13 | import tensorflow as tf 14 | 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | def average_checkpoints(model, prefix_or_checkpints, num_last_checkpoints=None, ckpt_weights=None): 20 | """average checkpoints 21 | 22 | :param model_variables: 23 | :param checkpints: 24 | :param num_last_checkpoints: 25 | :return: 26 | """ 27 | avg_weights = None 28 | 29 | if isinstance(prefix_or_checkpints, (list, tuple)): 30 | ckpts = prefix_or_checkpints 31 | elif prefix_or_checkpints.find(',') != -1 or not os.path.exists(prefix_or_checkpints): 32 | # checkpoints 33 | ckpts = prefix_or_checkpints.split(",") 34 | elif os.path.isdir(prefix_or_checkpints): 35 | # prefix, i.e., directory of checkpoints 36 | ckpts = tf.train.get_checkpoint_state(prefix_or_checkpints).all_model_checkpoint_paths 37 | if num_last_checkpoints: 38 | ckpts = ckpts[-num_last_checkpoints:] 39 | ckpt_weights = ckpt_weights[-num_last_checkpoints:] 40 | else: 41 | raise ValueError(f"{prefix_or_checkpints} is wrong!") 42 | 43 | if ckpt_weights is None: 44 | ckpt_weights = [1.] * len(ckpts) 45 | 46 | assert len(ckpt_weights) == len(ckpts), \ 47 | ValueError(f"size of ckpt_weights ({len(ckpt_weights)}) must be equal to the size of ckpts ({len(ckpts)}).") 48 | 49 | for idx, ckpt in enumerate(ckpts): 50 | logger.info(f"Merge weights from {ckpt} with ckpt weight {ckpt_weights[idx]}") 51 | model.load_weights(ckpt) 52 | model_weights = model.get_weights() 53 | model_weights = _weights_time(model_weights, ckpt_weights[idx]) 54 | if idx == 0: 55 | avg_weights = model_weights 56 | continue 57 | avg_weights = _weights_add(avg_weights, model_weights) 58 | 59 | avg_weights = _weights_div(avg_weights, sum(ckpt_weights)) 60 | model.set_weights(avg_weights) 61 | 62 | 63 | def _weights_add(weights1, weights2): 64 | weights = [w1 + w2 for w1, w2 in zip(weights1, weights2)] 65 | return weights 66 | 67 | 68 | def _weights_div(weights, num): 69 | weights = [w / num for w in weights] 70 | return weights 71 | 72 | 73 | def _weights_time(weights, ckpt_w): 74 | weights = [w * ckpt_w for w in weights] 75 | return weights -------------------------------------------------------------------------------- /tests/test_dureader.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-12-23 15:01 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : test_glue.py 6 | 7 | import os, sys 8 | import tensorflow_datasets as tfds 9 | import unittest 10 | 11 | sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 12 | 13 | from tensorflow_datasets.core.download import DownloadConfig 14 | from tqdm import tqdm 15 | from aispace.datasets import * 16 | from aispace.utils.hparams import Hparams 17 | from aispace.utils.builder_utils import load_dataset 18 | 19 | 20 | class TestGlue(unittest.TestCase): 21 | def test_glue_load(self): 22 | hparams = Hparams() 23 | hparams.load_from_config_file("../configs/qa/dureader_yesno.yml") 24 | hparams.stand_by() 25 | checksum_dir = "../aispace/datasets/url_checksums" 26 | tfds.download.add_checksums_dir(checksum_dir) 27 | download_config = DownloadConfig(register_checksums=True) 28 
| print(tfds.list_builders()) 29 | dureader = tfds.load("dureader/yesno", 30 | # data_dir="/search/data1/yyk/data/datasets/glue_zh", 31 | data_dir="../data/dureader", 32 | builder_kwargs={'hparams': hparams}, 33 | download_and_prepare_kwargs={'download_config': download_config} 34 | ) 35 | for itm in dureader['train']: 36 | print(itm) 37 | break 38 | print() 39 | 40 | # train_dataset, dev_dataset, dataset_info = next(load_dataset(hparams, ret_test=False)) 41 | # test_dataset = next(load_dataset(hparams, ret_train=True, ret_dev=True, ret_test=True, ret_info=True))[0] 42 | 43 | # total, zero = 0, 0 44 | # for itm in tqdm(test_dataset): 45 | # tt = itm[0]['input_ids'].numpy().tolist() 46 | # print(itm[0]['p_mask'].numpy().tolist()) 47 | # print(itm[0]['start_position'].numpy().tolist()) 48 | # print(itm[0]['end_position'].numpy().tolist()) 49 | # print(tt) 50 | # break 51 | # total += 1 52 | # zero += len([t for t in tt if t == 0]) 53 | # print() 54 | # print(f"{zero}, {total}, {zero / float(total)}") 55 | # print(total) 56 | 57 | 58 | # python -u aispace/trainer.py \ 59 | # --experiment_name test \ 60 | # --model_name bert_for_classification \ 61 | # --schedule train_and_eval \ 62 | # --config_name tnews \ 63 | # --config_dir ./configs/glue_zh \ 64 | # --gpus 0 1 2 3 -------------------------------------------------------------------------------- /configs/custom/gov_title_role.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - "../pretrain/ernie.yml" 3 | 4 | #model_name: bert_for_ner_with_title_status 5 | model_name: bert_for_ner 6 | 7 | model_attributes: 8 | hidden_dropout_prob: 0.5 9 | initializer_range: 0.02 10 | hidden_size: 1024 11 | 12 | training: 13 | # policy: 14 | # name: "k-fold" 15 | # config: 16 | # k: 5 17 | learning_rate: 1e-5 18 | max_epochs: 30 19 | batch_size: 8 20 | 21 | # optimizer: 22 | # name: adam 23 | 24 | callbacks: 25 | # callback name 26 | early_stopping: 27 | switch: true 28 | config: 29 | patience: 2 30 | lr_finder: 31 | switch: false 32 | config: 33 | end_lr: 1e-4 34 | 35 | optimizer_wrappers: 36 | swa: 37 | switch: false 38 | config: 39 | start_epoch: 5 40 | 41 | dataset: 42 | name: gov_title/role 43 | data_dir: "./data" 44 | data_path: "./data/downloads/extracted/gov_title/gov_title_event.txt" 45 | transformer: "gov_title/role" 46 | 47 | source: 48 | train: "train[:80%]" 49 | validation: "train[80%:90%]" 50 | test: "train[-10%:]" 51 | 52 | tokenizer: 53 | max_len: 512 54 | 55 | inputs: 56 | - name: input_ids 57 | column: input_ids 58 | type: LIST_OF_INT 59 | max_len: 512 60 | - name: token_type_ids 61 | column: token_type_ids 62 | type: LIST_OF_INT 63 | max_len: 512 64 | - name: attention_mask 65 | column: attention_mask 66 | type: LIST_OF_INT 67 | max_len: 512 68 | - name: position_ids 69 | column: position_ids 70 | type: LIST_OF_INT 71 | max_len: 512 72 | 73 | outputs: 74 | - name: output_1 75 | column: label 76 | type: LIST_OF_CLASSLABEL 77 | task: NER 78 | num: 11 79 | labels: ["O", "B-PERSON", "I-PERSON", "B-ORGANIZATION", "I-ORGANIZATION", 80 | "B-LOCATION", "I-LOCATION", "B-START_DATE", "I-START_DATE", "B-END_DATE", "I-END_DATE"] 81 | loss: 82 | name: myself_crf_loss 83 | # name: sparse_categorical_crossentropy 84 | # config: 85 | # from_logits: true 86 | metrics: 87 | - name: sparse_categorical_accuracy 88 | - name: sparse_f1_score 89 | config: 90 | name: "macro_f1" 91 | num_classes: 11 92 | average: "macro" 93 | 94 | pretrained: 95 | name: ERNIE_1.0_max-len-512 96 | init_from_pretrained: true 97 | 
# config: 98 | # layers: 99 | # start: 0 100 | # end: 4 101 | # step: 1 102 | 103 | 104 | -------------------------------------------------------------------------------- /tests/test_idiom.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-12-23 15:01 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : test_glue.py 6 | 7 | import os, sys 8 | import tensorflow as tf 9 | import tensorflow_datasets as tfds 10 | import unittest 11 | 12 | sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 13 | 14 | from tensorflow_datasets.core.download import DownloadConfig 15 | 16 | from aispace.datasets import * 17 | from aispace.utils.hparams import Hparams 18 | from aispace.utils.builder_utils import load_dataset 19 | 20 | 21 | class TestLSTC(unittest.TestCase): 22 | def test_lstc_load(self): 23 | hparams = Hparams() 24 | hparams.load_from_config_file("../configs/custom/test_gpt2.yml") 25 | hparams.stand_by() 26 | checksum_dir = "../aispace/datasets/url_checksums" 27 | tfds.download.add_checksums_dir(checksum_dir) 28 | # download_config = DownloadConfig(register_checksums=True) 29 | tnews = tfds.load("idiom/idiom_generator", 30 | # data_dir="/search/data1/yyk/data/datasets/glue_zh", 31 | split="train[90%:]", 32 | data_dir="../data", 33 | builder_kwargs={'hparams': hparams}, 34 | # download_and_prepare_kwargs={'download_config': download_config} 35 | ) 36 | 37 | tokenizer = BertTokenizer(hparams.dataset.tokenizer) 38 | # id_to_label = {v: k for k, v in hparams.duee_event_type_labels.items()} 39 | label_counter = {} 40 | i = 0 41 | for itm in tnews: 42 | # for k, v in itm.items(): 43 | # if v.shape[0] == 151: 44 | # print(itm) 45 | # break 46 | # print(itm) 47 | # print() 48 | # print(tokenizer.decode([int(t) for t in itm["input_ids"].numpy().tolist()])) 49 | # break 50 | i += 1 51 | # l = hparams.dataset.outputs[0].labels[tf.argmax(itm["output_1"], -1).numpy().tolist()] 52 | # print(id_to_label[l]) 53 | # if id_to_label[l] not in label_counter: 54 | # label_counter[id_to_label[l]] = 0 55 | # label_counter[id_to_label[l]] += 1 56 | # print(label_counter) 57 | # print(len(label_counter)) 58 | print(i) 59 | # python -u aispace/trainer.py \ 60 | # --experiment_name test \ 61 | # --model_name bert_for_classification \ 62 | # --schedule train_and_eval \ 63 | # --config_name tnews \ 64 | # --config_dir ./configs/glue_zh \ 65 | # --gpus 0 1 2 3 -------------------------------------------------------------------------------- /tests/test_2020_lstc.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-12-23 15:01 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : test_glue.py 6 | 7 | import os, sys 8 | import tensorflow as tf 9 | import tensorflow_datasets as tfds 10 | import unittest 11 | 12 | sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 13 | 14 | from tensorflow_datasets.core.download import DownloadConfig 15 | 16 | from aispace.datasets import * 17 | from aispace.utils.hparams import Hparams 18 | from aispace.utils.builder_utils import load_dataset 19 | 20 | 21 | class TestLSTC(unittest.TestCase): 22 | def test_lstc_load(self): 23 | hparams = Hparams() 24 | hparams.load_from_config_file("../configs/2020_LSTC/DuEE_keyphrase.yml") 25 | hparams.stand_by() 26 | checksum_dir = "../aispace/datasets/url_checksums" 27 | 
tfds.download.add_checksums_dir(checksum_dir) 28 | # download_config = DownloadConfig(register_checksums=True) 29 | tnews = tfds.load("lstc_2020/DuEE_role", 30 | # data_dir="/search/data1/yyk/data/datasets/glue_zh", 31 | data_dir="../data", 32 | builder_kwargs={'hparams': hparams}, 33 | # download_and_prepare_kwargs={'download_config': download_config} 34 | ) 35 | 36 | # tokenizer = BertTokenizer(hparams.dataset.tokenizer) 37 | # s = "BCI下架新疆棉花产品" 38 | # res = tokenizer.tokenize(s, True) 39 | # print(res) 40 | # id_to_label = {v: k for k, v in hparams.duee_event_type_labels.items()} 41 | label_counter = {} 42 | for itm in tnews["train"]: 43 | # for k, v in itm.items(): 44 | # if v.shape[0] == 151: 45 | # print(itm) 46 | # break 47 | print(itm) 48 | print() 49 | # print(tokenizer.decode([int(t) for t in itm["input_ids"].numpy().tolist()])) 50 | break 51 | # l = hparams.dataset.outputs[0].labels[tf.argmax(itm["output_1"], -1).numpy().tolist()] 52 | # print(id_to_label[l]) 53 | # if id_to_label[l] not in label_counter: 54 | # label_counter[id_to_label[l]] = 0 55 | # label_counter[id_to_label[l]] += 1 56 | print(label_counter) 57 | print(len(label_counter)) 58 | 59 | # python -u aispace/trainer.py \ 60 | # --experiment_name test \ 61 | # --model_name bert_for_classification \ 62 | # --schedule train_and_eval \ 63 | # --config_name tnews \ 64 | # --config_dir ./configs/glue_zh \ 65 | # --gpus 0 1 2 3 -------------------------------------------------------------------------------- /configs/generation/text_generation.yml: -------------------------------------------------------------------------------- 1 | generation_attributes: 2 | # The maximum length of the sequence to be generated. 3 | max_length: 20 4 | # The minimum length of the sequence to be generated. 5 | min_length: 10 6 | # Whether or not to use sampling ; use greedy decoding otherwise. 7 | do_sample: false 8 | # Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not. 9 | early_stopping: false 10 | # Number of beams for beam search. 1 means no beam search. 11 | num_beams: 1 12 | # The value used to module the next token probabilities. 13 | temperature: 1.0 14 | # The number of highest probability vocabulary tokens to keep for top-k-filtering. 15 | top_k: 50 16 | # If set to float < 1, only the most probable tokens with probabilities that add up to ``top_p`` or 17 | # higher are kept for generation. 18 | top_p: 1.0 19 | # The parameter for repetition penalty. 1.0 means no penalty. See `this paper 20 | # `__ for more details. 21 | repetition_penalty: 1.0 22 | # # The id of the `padding` token. 23 | # pad_token_id: None 24 | # # The id of the `beginning-of-sequence` token. 25 | # bos_token_id: None 26 | # # The id of the `end-of-sequence` token. 27 | # eos_token_id: None 28 | # Exponential penalty to the length. 1.0 means no penalty. 29 | # 30 | # Set to values < 1.0 in order to encourage the model to generate shorter sequences, to a value > 1.0 in 31 | # order to encourage the model to produce longer sequences. 32 | length_penalty: 1.0 33 | # If set to int > 0, all ngrams of that size can only occur once. 34 | no_repeat_ngram_size: 0 35 | # # List of token ids that are not allowed to be generated. In order to get the tokens of the words that 36 | # # should not appear in the generated text, use :obj:`tokenizer.encode(bad_word, add_prefix_space=True)`. 37 | # bad_words_ids: [] 38 | # The number of independently computed returned sequences for each element in the batch. 
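# These values are only defaults: task configs that include this file override
# them as needed. configs/custom/idiom_generator.yml, for example, raises
# max_length to 300, turns on sampling (do_sample: true) with num_beams: 3,
# temperature: 0.7 and repetition_penalty: 2.0, and enables early_stopping.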
39 | num_return_sequences: 1 40 | # Mask to avoid performing attention on padding token indices. Mask values are in ``[0, 1]``, 1 for 41 | # tokens that are not masked, and 0 for masked tokens. 42 | # 43 | # If not provided, will default to a tensor the same shape as :obj:`input_ids` that masks the pad token. 44 | # 45 | # `What are attention masks? <../glossary.html#attention-mask>`__ 46 | attention_mask: None 47 | # If an encoder-decoder model starts decoding with a different token than `bos`, the id of that token. 48 | decoder_start_token_id: None 49 | # Whether or not the model should use the past last key/values attentions (if applicable to the model) to 50 | # speed up decoding. 51 | use_cache: true 52 | -------------------------------------------------------------------------------- /aispace/datasets/data_transformers/ewn_transformer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-01-10 15:38 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : tnew_transformer.py 6 | 7 | 8 | import os 9 | import logging 10 | from tqdm import tqdm 11 | import json 12 | import numpy as np 13 | import traceback 14 | from .base_transformer import BaseTransformer 15 | from aispace.datasets import BaseTokenizer 16 | from aispace.utils.io_utils import json_dumps 17 | from aispace.utils.str_utils import preprocess_text 18 | 19 | __all__ = [ 20 | "EntityWithNationalityTransformer", 21 | ] 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | 26 | @BaseTransformer.register("entity_with_nationality") 27 | class EntityWithNationalityTransformer(BaseTransformer): 28 | def __init__(self, hparams, **kwargs): 29 | super(EntityWithNationalityTransformer, self).__init__(hparams, **kwargs) 30 | 31 | # tokenizer 32 | self.tokenizer = \ 33 | BaseTokenizer. 
\ 34 | by_name(self._hparams.dataset.tokenizer.name) \ 35 | (self._hparams.dataset.tokenizer) 36 | 37 | # json dir 38 | self.json_dir = os.path.join(kwargs.get("data_dir", self._hparams.dataset.data_dir), "json") 39 | 40 | def transform(self, data_path, split="train"): 41 | # output_path_base = os.path.join(os.path.dirname(data_path), "json") 42 | # if not os.path.exists(output_path_base): 43 | # os.makedirs(output_path_base) 44 | # output_path = os.path.join(output_path_base, f"{split}.json") 45 | with open(data_path, "r", encoding="utf8") as inf: 46 | for line in inf: 47 | if not line: continue 48 | line = line.strip() 49 | if len(line) == 0: continue 50 | # line = str(line).strip("'<>() ").replace('\'', '\"') 51 | try: 52 | line_json = json.loads(line) 53 | except Exception as e: 54 | print(line) 55 | traceback.print_exc() 56 | continue 57 | sentence = line_json.get("entity", "").strip() 58 | if len(sentence) == 0: continue 59 | encode_info = self.tokenizer.encode(sentence) 60 | input_ids, token_type_ids, attention_mask = \ 61 | encode_info['input_ids'], encode_info['segment_ids'], encode_info['input_mask'] 62 | label = line_json.get("label") 63 | item = { 64 | "input_ids": input_ids, 65 | "token_type_ids": token_type_ids, 66 | "attention_mask": attention_mask, 67 | "label": label 68 | } 69 | yield item 70 | -------------------------------------------------------------------------------- /aispace/layers/metrics/precision.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-05-20 12:40 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : f1_score.py 6 | 7 | import tensorflow as tf 8 | import tensorflow_addons as tfa 9 | 10 | from aispace.utils.tf_utils import get_shape 11 | 12 | __all__ = [ 13 | "SparsePrecision" 14 | ] 15 | 16 | 17 | class SparsePrecision(tfa.metrics.FBetaScore): 18 | def __init__( 19 | self, 20 | num_classes, 21 | average: str = None, 22 | threshold=None, 23 | name: str = "sparse_precision", 24 | dtype=None, 25 | **kwargs 26 | ): 27 | super().__init__(num_classes, average, 1.0, threshold, name=name, dtype=dtype) 28 | 29 | def get_config(self): 30 | base_config = super().get_config() 31 | del base_config["beta"] 32 | return base_config 33 | 34 | def update_state(self, y_true, y_pred, sample_weight=None): 35 | if self.threshold is None: 36 | threshold = tf.reduce_max(y_pred, axis=-1, keepdims=True) 37 | # make sure [0, 0, 0] doesn't become [1, 1, 1] 38 | # Use abs(x) > eps, instead of x != 0 to check for zero 39 | y_pred = tf.logical_and(y_pred >= threshold, tf.abs(y_pred) > 1e-12) 40 | else: 41 | y_pred = y_pred > self.threshold 42 | 43 | y_true = tf.cast(y_true, tf.int32) 44 | y_true = tf.one_hot(y_true, self.num_classes) 45 | y_true = tf.reshape(y_true, [-1, self.num_classes]) 46 | y_pred = tf.reshape(y_pred, [-1, self.num_classes]) 47 | 48 | y_true = tf.cast(y_true, tf.int32) 49 | y_pred = tf.cast(y_pred, tf.int32) 50 | 51 | def _count_non_zero(val): 52 | non_zeros = tf.math.count_nonzero(val, axis=self.axis) 53 | return tf.cast(non_zeros, self.dtype) 54 | 55 | self.true_positives.assign_add(_count_non_zero(y_pred * y_true)) 56 | self.false_positives.assign_add(_count_non_zero(y_pred * (y_true - 1))) 57 | self.false_negatives.assign_add(_count_non_zero((y_pred - 1) * y_true)) 58 | self.weights_intermediate.assign_add(_count_non_zero(y_true)) 59 | 60 | # @tf.function(experimental_relax_shapes=True) 61 | def result(self): 62 | precision = tf.math.divide_no_nan( 63 
| self.true_positives, self.true_positives + self.false_positives 64 | ) 65 | if self.average == "weighted": 66 | weights = tf.math.divide_no_nan( 67 | self.weights_intermediate, tf.reduce_sum(self.weights_intermediate) 68 | ) 69 | precision = tf.reduce_sum(precision * weights) 70 | 71 | elif self.average is not None: # [micro, macro] 72 | precision = tf.reduce_mean(precision) 73 | 74 | return precision -------------------------------------------------------------------------------- /configs/glue_zh/tnews_k_fold.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - "../base.yml" 3 | 4 | model_name: textcnn_for_classification 5 | 6 | model_attributes: 7 | hidden_dropout_prob: 0.5 8 | initializer_range: 0.02 9 | hidden_size: 1024 10 | vocab_size: 21128 11 | filters: 12 | - 2 13 | - 3 14 | - 4 15 | windows: 16 | - 100 17 | - 100 18 | - 100 19 | 20 | training: 21 | policy: 22 | name: "k-fold" 23 | config: 24 | k: 5 25 | learning_rate: 1e-3 26 | max_epochs: 3 27 | batch_size: 128 28 | 29 | callbacks: 30 | # callback name 31 | early_stopping: 32 | switch: false 33 | config: 34 | patience: 3 35 | checkpoint: 36 | switch: false 37 | config: 38 | monitor: val_macro_f1 39 | save_best_only: true 40 | lr_finder: 41 | switch: false 42 | # config: 43 | # max_steps: 100 44 | # smoothing: 0.6 45 | 46 | 47 | dataset: 48 | name: glue_zh/tnews 49 | data_dir: "./data" 50 | transformer: "glue_zh/tnews" 51 | 52 | source: 53 | train: "train[:80%]" 54 | validation: "train[-20%:]" 55 | test: "validation" 56 | 57 | tokenizer: 58 | name: bert_tokenizer 59 | vocab: 60 | filename: "/search/data1/yyk/data/pretrained/nezha/NEZHA-Base/vocab.txt" 61 | special_tokens: 62 | PAD: "[PAD]" 63 | UNK: "[UNK]" 64 | SEP: "[SEP]" 65 | CLS: "[CLS]" 66 | MASK: "[MASK]" 67 | tokenize_chinese_chars: True 68 | do_lower_case: True 69 | do_basic_tokenize: True 70 | non_split_tokens: null 71 | max_len: 100 72 | 73 | inputs: 74 | - name: input_ids 75 | column: input_ids 76 | type: LIST_OF_INT 77 | max_len: 100 78 | - name: token_type_ids 79 | column: token_type_ids 80 | type: LIST_OF_INT 81 | max_len: 100 82 | - name: attention_mask 83 | column: attention_mask 84 | type: LIST_OF_INT 85 | max_len: 100 86 | 87 | outputs: 88 | - name: output_1 89 | column: label 90 | type: CLASSLABEL 91 | num: 15 92 | labels: ["news_story", "news_culture", "news_entertainment", "news_sports", "news_finance", 93 | "news_house", "news_car", "news_edu", "news_tech", "news_military", "news_travel", 94 | "news_world", "news_stock", "news_agriculture", "news_game"] 95 | loss: 96 | name: sparse_categorical_crossentropy 97 | config: 98 | from_logits: true 99 | metrics: 100 | - name: sparse_categorical_accuracy 101 | - name: sparse_f1_score 102 | config: 103 | name: "macro_f1" 104 | num_classes: 15 105 | average: "macro" 106 | 107 | 108 | -------------------------------------------------------------------------------- /aispace/models/classifications/bert_for_sequence_classification.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-11-05 20:44 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : bert_for_sequence_classification.py 6 | 7 | 8 | import tensorflow as tf 9 | 10 | from aispace.utils.hparams import Hparams 11 | from aispace.models.base_model import BaseModel 12 | from aispace.layers.pretrained.bert import Bert 13 | from aispace.utils.tf_utils import get_initializer 14 | from aispace.layers import 
BaseLayer 15 | 16 | 17 | @BaseModel.register("bert_for_classification") 18 | class BertForSeqClassification(BaseModel): 19 | def __init__(self, hparams: Hparams, **kwargs): 20 | super(BertForSeqClassification, self).__init__(hparams, **kwargs) 21 | self.num_lables = hparams.dataset.outputs[0].num 22 | pretrained_hparams = hparams.pretrained 23 | model_hparams = hparams.model_attributes 24 | 25 | # self.bert = Bert(pretrained_hparams, name='bert') 26 | assert pretrained_hparams.norm_name in ['bert', 'albert', 'albert_brightmart', "ernie", "xlnet", "electra"], \ 27 | ValueError(f"{pretrained_hparams.norm_name} not be supported.") 28 | self.encoder = BaseLayer.by_name(pretrained_hparams.norm_name)(pretrained_hparams) 29 | self.dropout = tf.keras.layers.Dropout( 30 | model_hparams.hidden_dropout_prob 31 | ) 32 | self.project = tf.keras.layers.Dense( 33 | model_hparams.hidden_size, 34 | kernel_initializer=get_initializer(model_hparams.initializer_range), 35 | name="project" 36 | ) 37 | self.classifier = tf.keras.layers.Dense( 38 | self.num_lables, 39 | kernel_initializer=get_initializer(model_hparams.initializer_range), 40 | name="classifier" 41 | ) 42 | 43 | def call(self, inputs, **kwargs): 44 | outputs = self.encoder(inputs, **kwargs) 45 | 46 | pooled_output = outputs[1] 47 | project = self.project(pooled_output) 48 | 49 | project = self.dropout(project, training=kwargs.get('training', False)) 50 | 51 | logits = self.classifier(project) 52 | 53 | outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here 54 | 55 | return outputs # logits, (hidden_states), (attentions) 56 | 57 | def deploy(self): 58 | from aispace.datasets.tokenizer import BaseTokenizer 59 | from .bento_services import BertTextClassificationService 60 | tokenizer = BaseTokenizer.by_name(self._hparams.dataset.tokenizer.name)(self._hparams.dataset.tokenizer) 61 | bento_service = BertTextClassificationService() 62 | bento_service.pack("model", self) 63 | bento_service.pack("tokenizer", tokenizer) 64 | bento_service.pack("hparams", self._hparams) 65 | saved_path = bento_service.save(self._hparams.get_deploy_dir()) 66 | return saved_path 67 | 68 | 69 | -------------------------------------------------------------------------------- /aispace/layers/callbacks/lr_finder.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-06-19 17:29 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : lr_finder.py 6 | 7 | import tensorflow as tf 8 | 9 | import matplotlib.pyplot as plt 10 | 11 | 12 | __all__ = [ 13 | "LRFinder" 14 | ] 15 | 16 | 17 | class LRFinder(tf.keras.callbacks.Callback): 18 | """`Callback` that exponentially adjusts the learning rate after each training batch between `start_lr` and 19 | `end_lr` for a maximum number of batches: `max_step`. The loss and learning rate are recorded at each step allowing 20 | visually finding a good learning rate as per https://sgugger.github.io/how-do-you-find-a-good-learning-rate.html via 21 | the `plot` method. 
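    A minimal usage sketch (assuming a compiled tf.keras model and a tf.data
    training set; the save path is illustrative):

        finder = LRFinder(start_lr=1e-7, end_lr=10, max_steps=100)
        model.fit(train_dataset, epochs=1, callbacks=[finder])
        finder.lr_finder_plot("./lr_find.jpg")

    The callback stops training on its own once max_steps batches have run or
    the smoothed loss diverges, so a single epoch is usually enough.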
22 | """ 23 | 24 | def __init__(self, start_lr: float = 1e-7, end_lr: float = 10, max_steps: int = 100, smoothing=0.9): 25 | super(LRFinder, self).__init__() 26 | self.start_lr, self.end_lr = start_lr, end_lr 27 | self.max_steps = max_steps 28 | self.smoothing = smoothing 29 | self.step, self.best_loss, self.avg_loss, self.lr = 0, 0, 0, 0 30 | self.lrs, self.losses = [], [] 31 | 32 | def on_train_begin(self, logs=None): 33 | self.step, self.best_loss, self.avg_loss, self.lr = 0, 0, 0, 0 34 | self.lrs, self.losses = [], [] 35 | 36 | def on_train_batch_begin(self, batch, logs=None): 37 | self.lr = self.exp_annealing(self.step) 38 | tf.keras.backend.set_value(self.model.optimizer.lr, self.lr) 39 | 40 | def on_train_batch_end(self, batch, logs=None): 41 | logs = logs or {} 42 | loss = logs.get('loss') 43 | step = self.step 44 | if loss: 45 | self.avg_loss = self.smoothing * self.avg_loss + (1 - self.smoothing) * loss 46 | smooth_loss = self.avg_loss / (1 - self.smoothing ** (self.step + 1)) 47 | self.losses.append(smooth_loss) 48 | self.lrs.append(self.lr) 49 | 50 | if step == 0 or loss < self.best_loss: 51 | self.best_loss = loss 52 | 53 | if smooth_loss > 4 * self.best_loss or tf.math.is_nan(smooth_loss): 54 | self.model.stop_training = True 55 | 56 | if step == self.max_steps: 57 | self.model.stop_training = True 58 | 59 | self.step += 1 60 | 61 | def exp_annealing(self, step): 62 | return self.start_lr * (self.end_lr / self.start_lr) ** (step * 1. / self.max_steps) 63 | 64 | def lr_finder_plot(self, save_path): 65 | plt.switch_backend("Agg") 66 | fig, ax = plt.subplots(1, 1) 67 | ax.set_ylabel('Loss') 68 | ax.set_xlabel('Learning Rate (log scale)') 69 | ax.set_xscale('log') 70 | ax.xaxis.set_major_formatter(plt.FormatStrFormatter('%.0e')) 71 | ax.plot(self.lrs, self.losses) 72 | plt.savefig(save_path) 73 | 74 | # print(self.lrs) 75 | # print() 76 | # print(self.losses) -------------------------------------------------------------------------------- /configs/pretrain/xlnet.yml: -------------------------------------------------------------------------------- 1 | # config for xlnet chinese 2 | includes: 3 | - ../base.yml 4 | 5 | dataset: 6 | tokenizer: 7 | name: bert_tokenizer 8 | vocab: 9 | filename: null 10 | special_tokens: 11 | PAD: [PAD] 12 | UNK: [UNK] 13 | SEP: [SEP] 14 | CLS: [CLS] 15 | MASK: [MASK] 16 | tokenize_chinese_chars: True 17 | do_lower_case: True 18 | do_basic_tokenize: True 19 | non_split_tokens: null 20 | max_len: null 21 | 22 | pretrained: 23 | norm_name: xlnet 24 | name: xlnet-base-cased 25 | adapter: tf_huggingface_bert_adapter 26 | force_download: false 27 | init_from_pretrained: true 28 | cache_dir: /search/data1/yyk/data/pretrained/xlnet # your path to save models 29 | model_path: null 30 | vocab_path: null 31 | config_path: null 32 | config: 33 | output_attentions: false 34 | output_hidden_states: false 35 | layer_norm_eps: 1e-12 36 | hidden_size: 1024 37 | 38 | ref: https://github.com/ymcui/Chinese-BERT-wwm 39 | family: 40 | xlnet-base-cased: 41 | model: 42 | url: https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-tf_model.h5 43 | to_insert_paths: 44 | - pretrained.model_path 45 | vocab: 46 | url: https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model 47 | to_insert_paths: 48 | - pretrained.vocab_path 49 | - dataset.tokenizer.vocab.filename 50 | config: 51 | url: https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json 52 | to_insert_paths: # set the pretrained.config_path with saved path of this 
file. 53 | - pretrained.config_path 54 | to_replaces: # replace pretrained.config with the json content. 55 | - pretrained.config 56 | xlnet-large-cased: 57 | model: 58 | url: https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-tf_model.h5 59 | to_insert_paths: 60 | - pretrained.model_path 61 | vocab: 62 | url: https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-spiece.model 63 | to_insert_paths: 64 | - pretrained.vocab_path 65 | - dataset.tokenizer.vocab.filename 66 | config: 67 | url: https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json 68 | to_insert_paths: # set the pretrained.config_path with saved path of this file. 69 | - pretrained.config_path 70 | to_replaces: # replace pretrained.config with the json content. 71 | - pretrained.config 72 | 73 | -------------------------------------------------------------------------------- /aispace/layers/metrics/recall.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-05-20 12:40 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : f1_score.py 6 | 7 | import tensorflow as tf 8 | import tensorflow_addons as tfa 9 | 10 | from aispace.utils.tf_utils import get_shape 11 | 12 | __all__ = [ 13 | "SparseRecall" 14 | ] 15 | 16 | 17 | class SparseRecall(tfa.metrics.FBetaScore): 18 | def __init__( 19 | self, 20 | num_classes, 21 | average: str = None, 22 | threshold=None, 23 | name: str = "sparse_recall", 24 | dtype=None, 25 | **kwargs 26 | ): 27 | if tf.version.VERSION > '2.0.0': 28 | super().__init__(num_classes, average, 1.0, threshold, name=name, dtype=dtype) 29 | else: 30 | super().__init__(num_classes, average, 1.0, name=name, dtype=dtype) 31 | self.threshold = threshold 32 | 33 | def get_config(self): 34 | base_config = super().get_config() 35 | del base_config["beta"] 36 | return base_config 37 | 38 | def update_state(self, y_true, y_pred, sample_weight=None): 39 | if self.threshold is None: 40 | threshold = tf.reduce_max(y_pred, axis=-1, keepdims=True) 41 | # make sure [0, 0, 0] doesn't become [1, 1, 1] 42 | # Use abs(x) > eps, instead of x != 0 to check for zero 43 | y_pred = tf.logical_and(y_pred >= threshold, tf.abs(y_pred) > 1e-12) 44 | else: 45 | y_pred = y_pred > self.threshold 46 | 47 | y_true = tf.cast(y_true, tf.int32) 48 | y_true = tf.one_hot(y_true, self.num_classes) 49 | y_true = tf.reshape(y_true, [-1, self.num_classes]) 50 | y_pred = tf.reshape(y_pred, [-1, self.num_classes]) 51 | 52 | y_true = tf.cast(y_true, tf.int32) 53 | y_pred = tf.cast(y_pred, tf.int32) 54 | 55 | def _count_non_zero(val): 56 | non_zeros = tf.math.count_nonzero(val, axis=self.axis) 57 | return tf.cast(non_zeros, self.dtype) 58 | 59 | self.true_positives.assign_add(_count_non_zero(y_pred * y_true)) 60 | self.false_positives.assign_add(_count_non_zero(y_pred * (y_true - 1))) 61 | self.false_negatives.assign_add(_count_non_zero((y_pred - 1) * y_true)) 62 | self.weights_intermediate.assign_add(_count_non_zero(y_true)) 63 | 64 | # @tf.function(experimental_relax_shapes=True) 65 | def result(self): 66 | recall = tf.math.divide_no_nan( 67 | self.true_positives, self.true_positives + self.false_negatives 68 | ) 69 | if self.average == "weighted": 70 | weights = tf.math.divide_no_nan( 71 | self.weights_intermediate, tf.reduce_sum(self.weights_intermediate) 72 | ) 73 | recall = tf.reduce_sum(recall * weights) 74 | 75 | elif self.average is not None: # [micro, macro] 76 | recall = 
tf.reduce_mean(recall) 77 | 78 | return recall -------------------------------------------------------------------------------- /aispace/layers/optimizers/optimizer_wrapper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-06-24 20:10 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : optimizer_wrapper.py 6 | 7 | import tensorflow as tf 8 | from typeguard import typechecked 9 | from typing import Union 10 | 11 | 12 | class OptimizerWrapper(tf.keras.optimizers.Optimizer): 13 | @typechecked 14 | def __init__( 15 | self, 16 | optimizer: Union[tf.keras.optimizers.Optimizer, str], 17 | name: str = "OptimizerWrapper", 18 | **kwargs 19 | ): 20 | super().__init__(name, **kwargs) 21 | 22 | if isinstance(optimizer, str): 23 | optimizer = tf.keras.optimizers.get(optimizer) 24 | 25 | if not isinstance(optimizer, tf.keras.optimizers.Optimizer): 26 | raise TypeError( 27 | "optimizer is not an object of tf.keras.optimizers.Optimizer" 28 | ) 29 | 30 | self._optimizer = optimizer 31 | 32 | def _create_slots(self, var_list): 33 | self._optimizer._create_slots(var_list=var_list) 34 | 35 | def _create_hypers(self): 36 | self._optimizer._create_hypers() 37 | 38 | def _prepare(self, var_list): 39 | return self._optimizer._prepare(var_list=var_list) 40 | 41 | def apply_gradients(self, grads_and_vars, name=None): 42 | self._optimizer._iterations = self.iterations 43 | return super().apply_gradients(grads_and_vars, name) 44 | 45 | def _resource_apply_dense(self, grad, var): 46 | train_op = self._optimizer._resource_apply_dense(grad, var) 47 | return train_op 48 | 49 | def _resource_apply_sparse(self, grad, var, indices): 50 | train_op = self._optimizer._resource_apply_sparse(grad, var, indices) 51 | return train_op 52 | 53 | def _resource_apply_sparse_duplicate_indices(self, grad, var, indices): 54 | train_op = self._optimizer._resource_apply_sparse_duplicate_indices( 55 | grad, var, indices 56 | ) 57 | return train_op 58 | 59 | def get_config(self): 60 | config = { 61 | "optimizer": tf.keras.optimizers.serialize(self._optimizer), 62 | } 63 | base_config = super().get_config() 64 | return {**base_config, **config} 65 | 66 | @classmethod 67 | def from_config(cls, config, custom_objects=None): 68 | optimizer = tf.keras.optimizers.deserialize( 69 | config.pop("optimizer"), custom_objects=custom_objects, 70 | ) 71 | return cls(optimizer, **config) 72 | 73 | @property 74 | def lr(self): 75 | return self._optimizer._get_hyper("learning_rate") 76 | 77 | @lr.setter 78 | def lr(self, lr): 79 | self._optimizer._set_hyper("learning_rate", lr) # 80 | 81 | @property 82 | def learning_rate(self): 83 | return self._optimizer._get_hyper("learning_rate") 84 | 85 | @learning_rate.setter 86 | def learning_rate(self, learning_rate): 87 | self._optimizer._set_hyper("learning_rate", learning_rate) 88 | -------------------------------------------------------------------------------- /aispace/datasets/tokenizer/tokenizer_base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-10-31 10:58 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : tokenizer_base.py 6 | 7 | from abc import ABCMeta, abstractmethod 8 | import logging 9 | from typing import Dict, Optional, List 10 | 11 | from aispace.utils.hparams import Hparams 12 | from aispace.utils.registry import Registry 13 | 14 | __all__ = [ 15 | "BaseTokenizer" 16 | ] 17 | 18 
| 19 | SPECIAL_TOKENS_MAP_FILE = 'special_tokens_map.json' 20 | ADDED_TOKENS_FILE = 'added_tokens.json' 21 | CONFIG_FILE = 'config.json' 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | 26 | class BaseTokenizer(Registry): 27 | __metaclass__ = ABCMeta 28 | 29 | _MAX_INPUT_SIZE: Dict[str, Optional[int]] 30 | _VOCAB_FILE_NAMES: Dict[str, str] 31 | 32 | def __init__(self, hparams: Hparams): 33 | self._hparams = hparams 34 | self._vocab = None 35 | 36 | @abstractmethod 37 | def tokenize(self, input): 38 | """text to tokens 39 | 40 | :param input: 41 | :return: 42 | """ 43 | raise NotImplementedError 44 | 45 | @abstractmethod 46 | def detokenizer(self, tokens: List[str]) -> str: 47 | r""" de + tokenizer, antonym of tokenizer. 48 | Maps a sequence of tokens (string) in a single string. 49 | The most simple way to do it is :python:`' '.join(tokens)`, but we 50 | often want to remove sub-word tokenization artifacts at the same time. 51 | """ 52 | raise NotImplementedError 53 | 54 | @abstractmethod 55 | def encode(self, 56 | text_a, 57 | text_b=None, 58 | max_seq_length: Optional[int] = None): 59 | """text to idx 60 | 61 | :param text_a: 62 | :param text_b: 63 | :param max_seq_length: 64 | :return: 65 | """ 66 | raise NotImplementedError 67 | 68 | @abstractmethod 69 | def decode(self, idx): 70 | """idx to text 71 | 72 | :param idx: 73 | :return: 74 | """ 75 | raise NotImplementedError 76 | 77 | @property 78 | def vocab(self): 79 | return self._vocab 80 | 81 | @property 82 | def max_len(self): 83 | if not self._hparams.has_key("max_len"): 84 | logger.error("Must specify the max_len in tokenizer") 85 | return None 86 | return self._hparams.get("max_len") 87 | 88 | @staticmethod 89 | def clean_up_tokenization(out_string: str) -> str: 90 | r"""Clean up a list of simple English tokenization artifacts like 91 | spaces before punctuations and abbreviated forms. 92 | """ 93 | out_string = out_string.replace(' .', '.').replace(' ?', '?'). \ 94 | replace(' !', '!').replace(' ,', ',').replace(" ' ", "'"). \ 95 | replace(" n't", "n't").replace(" 'm", "'m"). \ 96 | replace(" do not", " don't").replace(" 's", "'s"). 
\ 97 | replace(" 've", "'ve").replace(" 're", "'re") 98 | return out_string -------------------------------------------------------------------------------- /aispace/models/question_answer/bert_for_qa.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding=utf-8 3 | # @Time : 2020/4/25 18:07 4 | # @Author : yingyuankai@aliyun.com 5 | # @File : bert_for_qa.py 6 | 7 | 8 | import tensorflow as tf 9 | 10 | from aispace.utils.hparams import Hparams 11 | from aispace.models.base_model import BaseModel 12 | from aispace.layers import BaseLayer 13 | from aispace.layers.activations import ACT2FN 14 | from aispace.utils.tf_utils import get_initializer 15 | 16 | __all__ = [ 17 | "BertForQA" 18 | ] 19 | 20 | 21 | @BaseModel.register("bert_for_qa") 22 | class BertForQA(BaseModel): 23 | def __init__(self, hparams: Hparams, **kwargs): 24 | super(BertForQA, self).__init__(hparams, **kwargs) 25 | pretrained_hparams = hparams.pretrained 26 | model_hparams = hparams.model_attributes 27 | self.start_n_top = model_hparams.start_n_top 28 | self.seq_len = hparams.dataset.tokenizer.max_len 29 | 30 | assert pretrained_hparams.norm_name not in ["xlnet_chinese"], \ 31 | ValueError(f"{pretrained_hparams.norm_name} not be supported.") 32 | self.encode_pretrained = BaseLayer.by_name(pretrained_hparams.norm_name)(pretrained_hparams) 33 | 34 | self.qa_layer = BaseLayer.by_name(model_hparams.qa_layer_name)( 35 | model_hparams.hidden_size, 36 | self.seq_len, 37 | self.start_n_top, 38 | self.start_n_top, 39 | get_initializer(model_hparams.initializer_range), 40 | model_hparams.hidden_dropout_prob) 41 | 42 | def call(self, inputs, **kwargs): 43 | is_training = kwargs.get("training", False) 44 | new_inputs = { 45 | "input_ids": tf.cast(inputs['input_ids'], tf.int32), 46 | 'token_type_ids': inputs['token_type_ids'], 47 | "attention_mask": inputs['attention_mask'] 48 | } 49 | encode_repr = self.encode_pretrained(new_inputs, **kwargs) 50 | seq_output = encode_repr[0] # [b, l, h] 51 | cls_output = encode_repr[1] # [b, h] 52 | passage_mask = inputs['p_mask'] 53 | 54 | if is_training: 55 | start_position = inputs['start_position'] 56 | else: 57 | start_position = tf.zeros_like(tf.reshape(tf.slice(inputs['input_ids'], [0, 0], [-1, 1]), [-1])) 58 | 59 | outputs = self.qa_layer([seq_output, cls_output, passage_mask, start_position], training=is_training) 60 | 61 | return outputs + (inputs['unique_id'], ) 62 | 63 | def deploy(self): 64 | from aispace.datasets.tokenizer import BaseTokenizer 65 | from .bento_services import BertQAWithImpossibleService as BertQAService 66 | # tokenizer = BertTokenizer(self._hparams.dataset.tokenizer) 67 | tokenizer = BaseTokenizer.by_name(self._hparams.dataset.tokenizer.name)(self._hparams.dataset.tokenizer) 68 | bento_service = BertQAService() 69 | bento_service.pack("model", self) 70 | bento_service.pack("tokenizer", tokenizer) 71 | bento_service.pack("hparams", self._hparams) 72 | saved_path = bento_service.save(self._hparams.get_deploy_dir()) 73 | return saved_path 74 | -------------------------------------------------------------------------------- /aispace/utils/math_utils.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding=utf-8 3 | # @Time : 2019-07-02 21:14 4 | # @Author : yingyuankai@aliyun.com 5 | # @File : math_utils.py 6 | 7 | import math 8 | import numpy as np 9 | 10 | 11 | def jaccard(sorted_list_1, sorted_list_2): 12 | max_jaccard_score = 0 13 | for 
path1 in sorted_list_1: 14 | for path2 in sorted_list_2: 15 | size_set_1 = len(path1) 16 | size_set_2 = len(path2) 17 | 18 | intersection = 0 19 | for i in range(min(size_set_1, size_set_2)): 20 | last_p1 = path1[-(i + 1)] 21 | last_p2 = path2[-(i + 1)] 22 | if last_p1 == last_p2: 23 | intersection += 1 24 | else: 25 | break 26 | 27 | jaccard_score = intersection / ( 28 | size_set_1 + size_set_2 - intersection) 29 | if jaccard_score > max_jaccard_score: 30 | max_jaccard_score = jaccard_score 31 | 32 | return max_jaccard_score 33 | 34 | 35 | def softmax(x, temperature=1.0): 36 | e_x = np.exp((x - np.max(x)) / temperature) 37 | return e_x / e_x.sum() 38 | 39 | 40 | def int_type(num_distinct): 41 | if num_distinct < 128: 42 | return np.int8 43 | elif num_distinct < 32768: 44 | return np.int16 45 | elif num_distinct < 2147483648: 46 | return np.int32 47 | else: 48 | return np.int64 49 | 50 | 51 | def convert_size(size_bytes): 52 | if size_bytes == 0: 53 | return '0B' 54 | size_name = ('B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB') 55 | i = int(math.floor(math.log(size_bytes, 1024))) 56 | p = math.pow(1024, i) 57 | s = round(size_bytes / p, 2) 58 | return '{} {}'.format(s, size_name[i]) 59 | 60 | 61 | def learning_rate_warmup(learning_rate, epoch, warmup_epochs, num_workers, 62 | steps_per_epoch): 63 | """Implements gradual learning rate warmup: 64 | `lr = initial_lr / hvd.size()` ---> `lr = initial_lr` 65 | `initial_lr` is the learning rate of the model optimizer at the start 66 | of the training. This technique was described in the paper 67 | "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour". 68 | See https://arxiv.org/pdf/1706.02677.pdf for details. 69 | 70 | Inspired by Horovod's implementation: 71 | https://github.com/uber/horovod/blob/master/horovod/keras/callbacks.py#L202 72 | Math recap: 73 | batch 74 | epoch = full_epochs + --------------- 75 | steps_per_epoch 76 | lr size - 1 77 | lr'(epoch) = ---- * (-------- * epoch + 1) 78 | size warmup 79 | lr 80 | lr'(epoch = 0) = ---- 81 | size 82 | lr'(epoch = warmup) = lr 83 | """ 84 | epoch_adjusted = float(epoch) + (1. / steps_per_epoch) 85 | return learning_rate / num_workers * \ 86 | (epoch_adjusted * (num_workers - 1) / warmup_epochs + 1) 87 | -------------------------------------------------------------------------------- /aispace/layers/embeddings/sharded_embedding.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-12-01 17:23 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : sharded_embedding.py 6 | 7 | __all__ = [ 8 | "SharedEmbeddings" 9 | ] 10 | 11 | import tensorflow as tf 12 | 13 | from aispace.utils.tf_utils import get_initializer, get_shape 14 | 15 | 16 | class SharedEmbeddings(tf.keras.layers.Layer): 17 | """Construct shared token embeddings. 
18 | """ 19 | 20 | def __init__(self, vocab_size, hidden_size, initializer_range=None, **kwargs): 21 | super(SharedEmbeddings, self).__init__(**kwargs) 22 | self.vocab_size = vocab_size 23 | self.hidden_size = hidden_size 24 | self.initializer_range = self.hidden_size ** -0.5 if initializer_range is None else initializer_range 25 | 26 | def build(self, input_shape): 27 | """Build shared word embedding layer 28 | Shared weights logic adapted from 29 | https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 30 | """ 31 | self.weight = self.add_weight( 32 | "weight", 33 | shape=[self.vocab_size, self.hidden_size], 34 | initializer=get_initializer(self.initializer_range)) 35 | super(SharedEmbeddings, self).build(input_shape) 36 | 37 | def call(self, inputs, mode="embedding"): 38 | """Get token embeddings of inputs. 39 | Args: 40 | inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) 41 | mode: string, a valid value is one of "embedding" and "linear". 42 | Returns: 43 | outputs: (1) If mode == "embedding", output embedding tensor, float32 with 44 | shape [batch_size, length, embedding_size]; (2) mode == "linear", output 45 | linear tensor, float32 with shape [batch_size, length, vocab_size]. 46 | Raises: 47 | ValueError: if mode is not valid. 48 | 49 | Shared weights logic adapted from 50 | https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 51 | """ 52 | if mode == "embedding": 53 | return self._embedding(inputs) 54 | elif mode == "linear": 55 | return self._linear(inputs) 56 | else: 57 | raise ValueError("mode {} is not valid.".format(mode)) 58 | 59 | def _embedding(self, input_ids): 60 | """Applies embedding based on inputs tensor.""" 61 | return tf.gather(self.weight, input_ids) 62 | 63 | def _linear(self, inputs): 64 | """Computes logits by running inputs through a linear layer. 65 | Args: 66 | inputs: A float32 tensor with shape [..., hidden_size] 67 | Returns: 68 | float32 tensor with shape [..., vocab_size]. 
69 | """ 70 | first_dims = get_shape(inputs)[:-1] 71 | 72 | x = tf.reshape(inputs, [-1, self.hidden_size]) 73 | logits = tf.matmul(x, self.weight, transpose_b=True) 74 | 75 | return tf.reshape(logits, first_dims + [self.vocab_size]) -------------------------------------------------------------------------------- /configs/pretrain/distilbert_huggingface.yml: -------------------------------------------------------------------------------- 1 | # config for huggingface distilbert 2 | 3 | includes: 4 | - "../base.yml" 5 | 6 | dataset: 7 | tokenizer: 8 | name: bert_tokenizer 9 | vocab: 10 | filename: null 11 | special_tokens: 12 | PAD: "[PAD]" 13 | UNK: "[UNK]" 14 | SEP: "[SEP]" 15 | CLS: "[CLS]" 16 | MASK: "[MASK]" 17 | tokenize_chinese_chars: false 18 | do_lower_case: True 19 | do_basic_tokenize: True 20 | non_split_tokens: null 21 | max_len: 512 22 | 23 | pretrained: 24 | name: bert-base-chinese-huggingface 25 | force_download: false 26 | init_from_pretrained: true 27 | cache_dir: /search/data1/yyk/data/pretrained/bert # your path to save models 28 | model_path: null 29 | vocab_path: null 30 | config_path: null 31 | config: 32 | output_attentions: false 33 | output_hidden_states: false 34 | layer_norm_eps: 1e-12 35 | 36 | family: 37 | distilbert-base-uncased: 38 | model: 39 | url: "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-tf_model.h5" 40 | to_insert_paths: 41 | - pretrained.model_path 42 | vocab: 43 | url: "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt" 44 | to_insert_paths: 45 | - pretrained.vocab_path 46 | - dataset.tokenizer.vocab.filename 47 | config: 48 | url: "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json" 49 | to_insert_paths: # set the pretrained.config_path with saved path of this file. 50 | - pretrained.config_path 51 | to_replaces: # replace pretrained.config with the json content. 52 | - pretrained.config 53 | others: 54 | max_len: 55 | to_replaces: 56 | - dataset.tokenizer.max_len 57 | value: 512 58 | distilbert-base-uncased-distilled-squad: 59 | model: 60 | url: "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-tf_model.h5" 61 | to_insert_paths: 62 | - pretrained.model_path 63 | vocab: 64 | url: "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt" 65 | to_insert_paths: 66 | - pretrained.vocab_path 67 | - dataset.tokenizer.vocab.filename 68 | config: 69 | url: "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json" 70 | to_insert_paths: # set the pretrained.config_path with saved path of this file. 71 | - pretrained.config_path 72 | to_replaces: # replace pretrained.config with the json content. 
73 | - pretrained.config 74 | others: 75 | max_len: 76 | to_replaces: 77 | - dataset.tokenizer.max_len 78 | value: 512 -------------------------------------------------------------------------------- /configs/2020_LSTC/DuEE_trigger.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | # - "../pretrain/bert_wwm.yml" 3 | - "../pretrain/ernie.yml" 4 | 5 | model_name: bert_dgcnn_for_ner 6 | 7 | model_attributes: 8 | hidden_dropout_prob: 0.5 9 | initializer_range: 0.02 10 | hidden_size: 1024 11 | 12 | training: 13 | learning_rate: 3e-5 14 | max_epochs: 10 15 | batch_size: 32 16 | callbacks: 17 | early_stopping: 18 | switch: true 19 | config: 20 | patience: 2 21 | checkpoint: 22 | switch: true 23 | config: 24 | save_best_only: true 25 | verbose: 1 26 | 27 | dataset: 28 | name: lstc_2020/DuEE_trigger 29 | data_dir: "./data" 30 | transformer: "lstc_2020/DuEE_trigger" 31 | 32 | source: 33 | train: "train" 34 | validation: "validation[:50%]" 35 | test: "validation[-50%:]" 36 | 37 | tokenizer: 38 | max_len: 256 39 | 40 | inputs: 41 | - name: input_ids 42 | column: input_ids 43 | type: LIST_OF_INT 44 | max_len: 256 45 | - name: token_type_ids 46 | column: token_type_ids 47 | type: LIST_OF_INT 48 | max_len: 256 49 | - name: attention_mask 50 | column: attention_mask 51 | type: LIST_OF_INT 52 | max_len: 256 53 | # - name: pos 54 | # column: pos 55 | # type: LIST_OF_CLASSLABEL 56 | # labels: 57 | # url: "" 58 | # name: "hanlp_pos_labels" 59 | # max_len: 256 60 | 61 | outputs: 62 | - name: output_1 63 | column: ner_labels 64 | type: LIST_OF_CLASSLABEL 65 | task: NER 66 | num: 0 67 | labels: 68 | url: "https://ai.baidu.com/file/9C92719AF96D4DDB96477BFBE1435262" 69 | name: "duee_trigger_ner_labels" 70 | loss: 71 | # name: sparse_categorical_crossentropy 72 | name: myself_crf_loss 73 | # config: 74 | # from_logits: true 75 | # reduction: "sum" 76 | metrics: 77 | # - name: sparse_categorical_accuracy 78 | - name: sparse_f1_score 79 | config: 80 | num_classes: 131 81 | average: "macro" 82 | name: "macro_f1" 83 | # - name: sparse_f1_score 84 | # config: 85 | # num_classes: 131 86 | # average: "micro" 87 | # name: "micro_f1" 88 | # - name: sparse_precision 89 | # config: 90 | # num_classes: 131 91 | # average: "macro" 92 | # name: "macro_precision" 93 | # - name: sparse_recall 94 | # config: 95 | # num_classes: 131 96 | # average: "macro" 97 | # name: "macro_recall" 98 | # - name: output_2 99 | # column: event_labels 100 | # type: LIST_OF_INT 101 | # num: 0 102 | # weight: 0.5 103 | # labels: 104 | # url: "https://ai.baidu.com/file/9C92719AF96D4DDB96477BFBE1435262" 105 | # name: "duee_event_type_labels" 106 | # loss: 107 | # name: sigmoid_focal_crossentropy 108 | # config: 109 | # from_logits: true 110 | # reduction: "sum" 111 | # metrics: 112 | # - name: categorical_accuracy 113 | 114 | pretrained: 115 | name: ERNIE_1.0_max-len-512 116 | # name: chinese_roberta_wwm_ext 117 | # name: bert-base-chinese-huggingface 118 | init_from_pretrained: true 119 | 120 | 121 | -------------------------------------------------------------------------------- /aispace/layers/qa_layers/qa_simple.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-07-09 10:16 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : qa_with_impossible.py 6 | 7 | import tensorflow as tf 8 | 9 | from aispace.layers.base_layer import BaseLayer 10 | from aispace.utils.tf_utils import masked_softmax, 
generate_onehot_label, mask_logits 11 | from aispace.layers.activations import ACT2FN 12 | 13 | __all__ = [ 14 | 'QALayerSimple' 15 | ] 16 | 17 | 18 | @BaseLayer.register("qa_simple") 19 | class QALayerSimple(tf.keras.layers.Layer): 20 | """ 21 | QA Layer just simple! 22 | """ 23 | def __init__(self, hidden_size, seq_len, start_n_top, end_n_top, initializer, dropout, layer_norm_eps=1e-12, **kwargs): 24 | super(QALayerSimple, self).__init__(**kwargs) 25 | self.seq_len = seq_len 26 | self.start_n_top = start_n_top 27 | self.end_n_top = end_n_top 28 | self.initializer = initializer 29 | self.hidden_size = hidden_size 30 | self.dropout = dropout 31 | self.layer_norm_eps = layer_norm_eps 32 | 33 | def build(self, unused_input_shapes): 34 | # for start 35 | self.start_project = tf.keras.layers.Dense( 36 | 1, 37 | kernel_initializer=self.initializer, 38 | name="start_project" 39 | ) 40 | 41 | # for end 42 | self.end_project = tf.keras.layers.Dense( 43 | 1, 44 | kernel_initializer=self.initializer, 45 | name="end_project" 46 | ) 47 | super(QALayerSimple, self).build(unused_input_shapes) 48 | 49 | def call(self, inputs, **kwargs): 50 | seq_output, cls_output, passage_mask, start_position = inputs 51 | is_training = kwargs.get("training", False) 52 | 53 | start_feature = self.start_project(seq_output) # [b, l, h] --> [b, l, 1] 54 | start_feature = tf.squeeze(start_feature, axis=-1) # [b, l, 1] --> [b, l] 55 | start_log_probs = masked_softmax(start_feature, passage_mask, is_training) # [b, l] 56 | 57 | end_feature = self.end_project(seq_output) # [b, l, h] --> [b, l, 1] 58 | end_feature = tf.squeeze(end_feature, axis=-1) # [b, l, 1] --> [b, l] 59 | end_log_probs = masked_softmax(end_feature, passage_mask, is_training) # [b, l] 60 | 61 | if is_training: 62 | output = (start_log_probs, end_log_probs) 63 | else: 64 | start_top_log_prob, start_top_index = tf.nn.top_k(start_log_probs, 65 | self.start_n_top) # [b, l] --> [b, k], [b, k] 66 | end_top_log_prob, end_top_index = tf.nn.top_k(end_log_probs, 67 | self.start_n_top) # [b, k, l] --> [b, k], [b, k] 68 | 69 | start_top_log_prob = tf.expand_dims(start_top_log_prob, axis=-1) 70 | start_top_index = tf.expand_dims(tf.cast(start_top_index, dtype=tf.float32), axis=-1) 71 | start_top_res = tf.concat([start_top_log_prob, start_top_index], axis=-1) 72 | 73 | end_top_log_prob = tf.expand_dims(end_top_log_prob, axis=-1) 74 | end_top_index = tf.expand_dims(tf.cast(end_top_index, dtype=tf.float32), axis=-1) 75 | end_top_res = tf.concat([end_top_log_prob, end_top_index], axis=-1) 76 | 77 | output = (start_top_res, end_top_res) 78 | 79 | return output -------------------------------------------------------------------------------- /aispace/layers/optimizers/lr_multiplier.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-06-24 14:52 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : lr_multiplier.py 6 | 7 | 8 | import tensorflow as tf 9 | 10 | from aispace.layers.optimizers.optimizer_wrapper import OptimizerWrapper 11 | 12 | __all__ = [ 13 | "LRMultiplier" 14 | ] 15 | 16 | 17 | class LRMultiplier(OptimizerWrapper): 18 | 19 | def __init__(self, 20 | optimizer, 21 | multipliers, 22 | name='lr_multiplier', 23 | **kwargs): 24 | """Initialize the optimizer wrapper. 25 | Learning rate multiplier wrapper for optimizers. 26 | ref: "https://pypi.org/project/keras-lr-multiplier/" 27 | 28 | :param optimizer: The original optimizer. 
29 | :param multipliers: A dict representing the multipliers. 30 | The key is the prefix of the weight to be multiplied. 31 | :param kwargs: Arguments for parent class. 32 | """ 33 | super(LRMultiplier, self).__init__(optimizer, name, **kwargs) 34 | self.multipliers = multipliers 35 | if hasattr(self._optimizer, 'learning_rate'): 36 | self.lr_attr = 'learning_rate' 37 | else: 38 | self.lr_attr = 'lr' 39 | 40 | def _get_multiplier(self, name): 41 | multiplier, prefix_len = 1.0, 0 42 | for key, val in self.multipliers.items(): 43 | if name.startswith(key): 44 | if len(key) > prefix_len: 45 | prefix_len = len(key) 46 | multiplier = val 47 | return multiplier 48 | 49 | def get_updates(self, loss, params): 50 | if len(self.updates) > 0: 51 | return self.updates 52 | multiplies = {} 53 | for param in params: 54 | multiplier = self._get_multiplier(param.name) 55 | if multiplier not in multiplies: 56 | multiplies[multiplier] = [] 57 | multiplies[multiplier].append(param) 58 | 59 | self.updates, self.weights = [], [] 60 | origin_lr = getattr(self, self.lr_attr) 61 | for i, (multiplier, params) in enumerate(multiplies.items()): 62 | lr = origin_lr 63 | if callable(multiplier): 64 | lr = lr * multiplier(tf.keras.backend.cast(self._optimizer.iterations, tf.keras.backend.floatx())) 65 | elif multiplier != 1.0: 66 | lr = lr * multiplier 67 | setattr(self, self.lr_attr, lr) 68 | with tf.keras.backend.name_scope('Group_{}'.format(i)): 69 | self.updates += self._optimizer.get_updates(loss, params) 70 | print(self.multipliers, i, self._optimizer.weights) 71 | for w in self._optimizer.weights: 72 | if w not in self.weights: 73 | self.weights.append(w) 74 | setattr(self, self.lr_attr, origin_lr) 75 | 76 | return self.updates 77 | 78 | def apply_gradients(self, grads_and_vars, name=None): 79 | 80 | self._optimizer._iterations = self.iterations 81 | return super().apply_gradients(grads_and_vars, name) 82 | 83 | def get_config(self): 84 | config = { 85 | 'optimizer': tf.keras.optimizers.serialize(self._optimizer), 86 | 'multipliers': self.multipliers 87 | } 88 | base_config = super(LRMultiplier, self).get_config() 89 | return dict(list(base_config.items()) + list(config.items())) 90 | 91 | 92 | -------------------------------------------------------------------------------- /configs/glue_zh/tnews.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | # - "../base.yml" 3 | - "../pretrain/bert_huggingface.yml" 4 | # - "../pretrain/nezha.yml" 5 | # - "../pretrain/albert_chinese.yml" 6 | # - "../pretrain/electra_chinese.yml" 7 | # - "../pretrain/ernie.yml" 8 | # - "../pretrain/xlnet_chinese.yml" 9 | 10 | model_name: bert_for_classification 11 | 12 | model_attributes: 13 | hidden_dropout_prob: 0.5 14 | initializer_range: 0.02 15 | hidden_size: 1024 16 | # vocab_size: 21128 17 | # filters: 18 | # - 2 19 | # - 3 20 | # - 4 21 | # windows: 22 | # - 100 23 | # - 100 24 | # - 100 25 | 26 | training: 27 | # policy: 28 | # name: "k-fold" 29 | # config: 30 | # k: 5 31 | learning_rate: 1e-5 32 | max_epochs: 30 33 | batch_size: 32 34 | 35 | optimizer: 36 | name: adam 37 | 38 | callbacks: 39 | # callback name 40 | early_stopping: 41 | switch: true 42 | config: 43 | patience: 2 44 | lr_finder: 45 | switch: true 46 | config: 47 | end_lr: 1e-3 48 | 49 | optimizer_wrappers: 50 | swa: 51 | switch: false 52 | config: 53 | start_epoch: 5 54 | lr_multiplier: 55 | switch: false 56 | config: 57 | multipliers: 58 | bert_for_seq_classification/bert: 0.1 59 | 60 | 61 | dataset: 62 | name: 
glue_zh/tnews 63 | data_dir: "./data" 64 | transformer: "glue_zh/tnews" 65 | 66 | source: 67 | train: "train[:80%]" 68 | validation: "train[-20%:]" 69 | test: "validation" 70 | 71 | tokenizer: 72 | # name: bert_tokenizer 73 | # vocab: 74 | # filename: "/search/data1/yyk/data/pretrained/nezha/NEZHA-Base/vocab.txt" 75 | # special_tokens: 76 | # PAD: "[PAD]" 77 | # UNK: "[UNK]" 78 | # SEP: "[SEP]" 79 | # CLS: "[CLS]" 80 | # MASK: "[MASK]" 81 | # tokenize_chinese_chars: True 82 | # do_lower_case: True 83 | # do_basic_tokenize: True 84 | # non_split_tokens: null 85 | max_len: 100 86 | 87 | inputs: 88 | - name: input_ids 89 | column: input_ids 90 | type: LIST_OF_INT 91 | max_len: 100 92 | - name: token_type_ids 93 | column: token_type_ids 94 | type: LIST_OF_INT 95 | max_len: 100 96 | - name: attention_mask 97 | column: attention_mask 98 | type: LIST_OF_INT 99 | max_len: 100 100 | 101 | outputs: 102 | - name: output_1 103 | column: label 104 | type: CLASSLABEL 105 | num: 15 106 | labels: ["news_story", "news_culture", "news_entertainment", "news_sports", "news_finance", 107 | "news_house", "news_car", "news_edu", "news_tech", "news_military", "news_travel", 108 | "news_world", "news_stock", "news_agriculture", "news_game"] 109 | loss: 110 | name: sparse_categorical_crossentropy 111 | config: 112 | from_logits: true 113 | metrics: 114 | - name: sparse_categorical_accuracy 115 | - name: sparse_f1_score 116 | config: 117 | name: "macro_f1" 118 | num_classes: 15 119 | average: "macro" 120 | 121 | pretrained: 122 | # name: ERNIE_1.0_max-len-512 123 | # name: chinese_xlnet_mid 124 | # name: chinese_electra_base 125 | # name: NEZHA-Base 126 | name: bert-base-chinese-huggingface 127 | # name: albert_xlarge_zh_google 128 | init_from_pretrained: true 129 | # config: 130 | # use_task_id: false 131 | 132 | 133 | -------------------------------------------------------------------------------- /aispace/datasets/data_transformers/idiom_transformer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-01-10 15:38 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : tnew_transformer.py 6 | 7 | 8 | import os 9 | from tqdm import tqdm 10 | import json 11 | import logging 12 | import numpy as np 13 | # import hanlp 14 | import pickle 15 | from random import random, randrange 16 | from pathlib import Path 17 | from .base_transformer import BaseTransformer 18 | from aispace.datasets import BaseTokenizer 19 | from aispace.utils.io_utils import json_dumps 20 | from aispace.utils.file_utils import default_download_dir, maybe_create_dir 21 | from aispace.utils.io_utils import maybe_download, load_from_file 22 | 23 | __all__ = [ 24 | "IdiomGeneratorTransformer", 25 | ] 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | @BaseTransformer.register("idiom/idiom_generator") 31 | class IdiomGeneratorTransformer(BaseTransformer): 32 | def __init__(self, hparams, **kwargs): 33 | super(IdiomGeneratorTransformer, self).__init__(hparams, **kwargs) 34 | 35 | # tokenizer 36 | self.tokenizer = \ 37 | BaseTokenizer. 
\ 38 | by_name(self._hparams.dataset.tokenizer.name) \ 39 | (self._hparams.dataset.tokenizer) 40 | 41 | def transform(self, data_path, split="train"): 42 | with open(data_path, "r") as inf: 43 | for idx, line in enumerate(inf): 44 | json_obj = json.loads(line) 45 | gushi = json_obj['gushi'] 46 | chenyu = json_obj['chenyu'] 47 | 48 | gushi = gushi.replace(chenyu, "") 49 | 50 | gushi_tokens = self.tokenizer.tokenize(gushi) 51 | chenyu_tokens = self.tokenizer.tokenize(chenyu) 52 | 53 | gushi_tokens = gushi_tokens[: self.tokenizer.max_len - len(chenyu_tokens) - 1] 54 | 55 | tokens = gushi_tokens + [self.tokenizer.vocab.sep_token] + chenyu_tokens + [self.tokenizer.vocab.eod_token] 56 | 57 | input_tokens = tokens[:-1] 58 | label_tokens = [] 59 | for token in tokens[1:]: 60 | if token in self.tokenizer.vocab.token_to_id: 61 | label_tokens.append(token) 62 | else: 63 | label_tokens.append(self.tokenizer.vocab.unk_token) 64 | 65 | attention_mask = [1] * len(input_tokens) + [0] * (self.tokenizer.max_len - len(input_tokens)) 66 | 67 | assert len(attention_mask) == self.tokenizer.max_len, ValueError(f"attention_mask's length is {len(attention_mask)}") 68 | 69 | input_tokens += [self.tokenizer.vocab.pad_token] * (self.tokenizer.max_len - len(input_tokens)) 70 | label_tokens += [self.tokenizer.vocab.pad_token] * (self.tokenizer.max_len - len(label_tokens)) 71 | 72 | assert len(input_tokens) == self.tokenizer.max_len, ValueError(f"input_tokens's length is {len(input_tokens)}") 73 | assert len(label_tokens) == self.tokenizer.max_len, ValueError(f"label_tokens's length is {len(label_tokens)}") 74 | 75 | encode_output = self.tokenizer.encode(input_tokens) 76 | 77 | feature = { 78 | "input_ids": encode_output["input_ids"], 79 | "attention_mask": attention_mask, 80 | "label": label_tokens, 81 | } 82 | 83 | if idx == 0: 84 | print(feature) 85 | print(len(feature['label'])) 86 | yield feature -------------------------------------------------------------------------------- /aispace/models/base_pretrained.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-11-04 19:35 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : pretrained_base.py 6 | 7 | import logging 8 | from pathlib import Path 9 | from abc import ABCMeta, abstractmethod 10 | import tensorflow as tf 11 | 12 | from aispace.utils.hparams import Hparams 13 | from aispace.utils.file_utils import * 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | class BasePretrained(BaseModel): 19 | __metaclass__ = ABCMeta 20 | 21 | def __init__(self, hparams: Hparams, **kwargs): 22 | super(BasePretrained, self).__init__(hparams, **kwargs) 23 | self._MODEL2URL = hparams.family 24 | self._MODEL_NAME = hparams.name 25 | self.cache_dir = hparams.cache_dir 26 | # self.pretrained_model_path = self.download_checkpoint(self._MODEL_NAME, self.cache_dir) 27 | 28 | def download_checkpoint(self, pretrained_model_name, cache_dir=None): 29 | r"""Download the specified pre-trained checkpoint, and return the 30 | directory in which the checkpoint is cached. 31 | 32 | Args: 33 | pretrained_model_name (str): Name of the model checkpoint. 34 | cache_dir (str, optional): Path to the cache directory. If `None`, 35 | uses the default directory (user's home directory). 36 | 37 | Returns: 38 | Path to the cache directory. 
39 | """ 40 | if pretrained_model_name in self._MODEL2URL: 41 | download_path = self._MODEL2URL[pretrained_model_name] 42 | else: 43 | raise ValueError( 44 | "Pre-trained model not found: {}".format(pretrained_model_name)) 45 | 46 | if cache_dir is None: 47 | cache_path = default_download_dir(self._MODEL_NAME) 48 | else: 49 | cache_path = Path(cache_dir) 50 | cache_path = cache_path / pretrained_model_name 51 | 52 | if not cache_path.exists(): 53 | if isinstance(download_path, list): 54 | for path in download_path: 55 | maybe_download(path, str(cache_path)) 56 | else: 57 | filename = download_path.split('/')[-1] 58 | maybe_download(download_path, str(cache_path), extract=True) 59 | folder = None 60 | for file in cache_path.iterdir(): 61 | if file.is_dir(): 62 | folder = file 63 | assert folder is not None 64 | (cache_path / filename).unlink() 65 | for file in folder.iterdir(): 66 | file.rename(file.parents[1] / file.name) 67 | folder.rmdir() 68 | print("Pre-trained {} checkpoint {} cached to {}".format( 69 | self._MODEL_NAME, pretrained_model_name, cache_path)) 70 | else: 71 | print("Using cached pre-trained {} checkpoint from {}.".format( 72 | self._MODEL_NAME, cache_path)) 73 | 74 | return str(cache_path) 75 | 76 | 77 | @abstractmethod 78 | def _init_from_checkpoint(self, pretrained_model_name, cache_dir, scope_name, **kwargs): 79 | r"""Initialize model parameters from weights stored in the pre-trained 80 | checkpoint. 81 | 82 | Args: 83 | pretrained_model_name (str): Name of the pre-trained model. 84 | cache_dir (str): Path to the cache directory. 85 | scope_name: Variable scope. 86 | **kwargs: Additional arguments for specific models. 87 | """ 88 | raise NotImplementedError -------------------------------------------------------------------------------- /aispace/layers/losses/focal_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020-05-07 17:34 3 | # @Author : yingyuankai 4 | # @Email : yingyuankai@aliyun.com 5 | # @File : focal_loss.py 6 | 7 | 8 | import tensorflow as tf 9 | import tensorflow.keras.backend as K 10 | # from tensorflow_addons import FloatTensorLike, TensorLike 11 | import tensorflow_addons as tfa 12 | from aispace.utils.tf_utils import get_shape 13 | 14 | __all__ = [ 15 | "SparseSoftmaxFocalCrossEntropy" 16 | ] 17 | 18 | 19 | class SparseSoftmaxFocalCrossEntropy(tf.keras.losses.Loss): 20 | """Implements the sparse focal loss function. 
21 | """ 22 | 23 | def __init__( 24 | self, 25 | from_logits: bool = False, 26 | alpha=0.25, 27 | gamma=2.0, 28 | reduction: str = tf.keras.losses.Reduction.NONE, 29 | name: str = "sparse_softmax_focal_crossentropy", 30 | ): 31 | super().__init__(name=name, reduction=reduction) 32 | 33 | self.from_logits = from_logits 34 | self.alpha = alpha 35 | self.gamma = gamma 36 | 37 | def call(self, y_true, y_pred): 38 | y_pred_shape = get_shape(y_pred) 39 | y_true = tf.cast(y_true, tf.int32) 40 | y_true = tf.one_hot(y_true, y_pred_shape[-1]) 41 | y_true = tf.reshape(y_true, y_pred_shape) 42 | 43 | loss = softmax_focal_crossentropy( 44 | y_true, 45 | y_pred, 46 | alpha=self.alpha, 47 | gamma=self.gamma, 48 | from_logits=self.from_logits, 49 | ) 50 | 51 | # to be checked todo 52 | # return tf.reduce_mean(loss) 53 | return tf.reduce_mean(loss) 54 | 55 | def get_config(self): 56 | config = { 57 | "from_logits": self.from_logits, 58 | "alpha": self.alpha, 59 | "gamma": self.gamma, 60 | } 61 | base_config = super().get_config() 62 | return {**base_config, **config} 63 | 64 | 65 | def softmax_focal_crossentropy( 66 | y_true, 67 | y_pred, 68 | alpha=0.25, 69 | gamma=2.0, 70 | from_logits: bool = False, 71 | ) -> tf.Tensor: 72 | """ 73 | Args 74 | y_true: true targets tensor. 75 | y_pred: predictions tensor. 76 | alpha: balancing factor. 77 | gamma: modulating factor. 78 | 79 | Returns: 80 | Weighted loss float `Tensor`. If `reduction` is `NONE`,this has the 81 | same shape as `y_true`; otherwise, it is scalar. 82 | """ 83 | if gamma and gamma < 0: 84 | raise ValueError("Value of gamma should be greater than or equal to zero") 85 | 86 | y_pred = tf.convert_to_tensor(y_pred) 87 | y_true = tf.convert_to_tensor(y_true, dtype=y_pred.dtype) 88 | 89 | # Get the cross_entropy for each entry 90 | ce = K.binary_crossentropy(y_true, y_pred, from_logits=from_logits) 91 | 92 | # If logits are provided then convert the predictions into probabilities 93 | if from_logits: 94 | pred_prob = tf.nn.softmax(y_pred) 95 | else: 96 | pred_prob = y_pred 97 | 98 | p_t = (y_true * pred_prob) + ((1 - y_true) * (1 - pred_prob)) 99 | alpha_factor = 1.0 100 | modulating_factor = 1.0 101 | 102 | if alpha: 103 | alpha = tf.convert_to_tensor(alpha, dtype=K.floatx()) 104 | alpha_factor = y_true * alpha + (1 - y_true) * (1 - alpha) 105 | 106 | if gamma: 107 | gamma = tf.convert_to_tensor(gamma, dtype=K.floatx()) 108 | modulating_factor = tf.pow((1.0 - p_t), gamma) 109 | 110 | # compute the final loss and return 111 | return tf.reduce_sum(alpha_factor * modulating_factor * ce, axis=-1) -------------------------------------------------------------------------------- /configs/pretrain/xlnet_chinese.yml: -------------------------------------------------------------------------------- 1 | # config for xlnet chinese 2 | includes: 3 | - "../base.yml" 4 | 5 | dataset: 6 | tokenizer: 7 | name: xlnet_tokenizer 8 | vocab: 9 | filename: null 10 | special_tokens: 11 | PAD: "" 12 | UNK: "" 13 | SEP: "" 14 | CLS: "" 15 | MASK: "" 16 | EOD: "" 17 | EOP: "" 18 | tokenize_chinese_chars: True 19 | do_lower_case: False 20 | remove_space: True 21 | keep_accents: False 22 | max_len: 512 23 | 24 | pretrained: 25 | norm_name: xlnet 26 | name: chinese_xlnet_base 27 | adapter: tf_huggingface_xlnet_adapter 28 | force_download: false 29 | init_from_pretrained: true 30 | cache_dir: /search/data1/yyk/data/pretrained/xlnet # your path to save models 31 | model_path: null 32 | vocab_path: null 33 | config_path: null 34 | config: 35 | output_attentions: false 36 | 
output_hidden_states: false 37 | layer_norm_eps: 1e-12 38 | hidden_size: 1024 39 | attn_type: bi 40 | bi_data: false 41 | clamp_len: -1 42 | dropout: 0.1 43 | end_n_top: 5 44 | finetuning_task: null 45 | initializer_range: 0.02 46 | mem_len: null 47 | num_labels: 2 48 | reuse_len: null 49 | ref: https://github.com/ymcui/Chinese-PreTrained-XLNet 50 | family: 51 | chinese_xlnet_base: 52 | model: 53 | # your/path/to/chinese_xlnet_base 54 | url: /search/data1/yyk/data/pretrained/xlnet/chinese_xlnet_base 55 | suffix: xlnet_model.ckpt 56 | to_insert_paths: 57 | - pretrained.model_path 58 | vocab: 59 | # your/path/to/chinese_xlnet_base/spiece.model 60 | url: /search/data1/yyk/data/pretrained/xlnet/chinese_xlnet_base/spiece.model 61 | to_insert_paths: 62 | - pretrained.vocab_path 63 | - dataset.tokenizer.vocab.filename 64 | config: 65 | # your/path/to/chinese_xlnet_base/xlnet_config.json 66 | url: /search/data1/yyk/data/pretrained/xlnet/chinese_xlnet_base/xlnet_config.json 67 | to_insert_paths: # set the pretrained.config_path with saved path of this file. 68 | - pretrained.config_path 69 | to_replaces: # replace pretrained.config with the json content. 70 | - pretrained.config 71 | chinese_xlnet_mid: 72 | model: 73 | # your/path/to/chinese_xlnet_mid 74 | url: /search/data1/yyk/data/pretrained/xlnet/chinese_xlnet_mid 75 | suffix: xlnet_model.ckpt 76 | to_insert_paths: 77 | - pretrained.model_path 78 | vocab: 79 | # your/path/to/chinese_xlnet_mid/spiece.model 80 | url: /search/data1/yyk/data/pretrained/xlnet/chinese_xlnet_mid/spiece.model 81 | to_insert_paths: 82 | - pretrained.vocab_path 83 | - dataset.tokenizer.vocab.filename 84 | config: 85 | # your/path/to/chinese_xlnet_mid/spiece.model/xlnet_config.json 86 | url: /search/data1/yyk/data/pretrained/xlnet/chinese_xlnet_mid/xlnet_config.json 87 | to_insert_paths: # set the pretrained.config_path with saved path of this file. 88 | - pretrained.config_path 89 | to_replaces: # replace pretrained.config with the json content. 90 | - pretrained.config 91 | 92 | -------------------------------------------------------------------------------- /docs/source/quickstart.md: -------------------------------------------------------------------------------- 1 | # Quickstart 2 | 3 | ## Training 4 | 5 | ``` 6 | python -u aispace/trainer.py \ 7 | --schedule train_and_eval \ 8 | --config_name CONFIG_NAME \ 9 | --config_dir CONFIG_DIR \ 10 | [--experiment_name EXPERIMENT_NAME] \ 11 | [--model_name MODEL_NAME] \ 12 | [--gpus GPUS] 13 | ``` 14 | 15 | ## Training with resumed model 16 | 17 | ``` 18 | python -u aispace/trainer.py \ 19 | --schedule train_and_eval \ 20 | --config_name CONFIG_NAME \ 21 | --config_dir CONFIG_DIR \ 22 | --model_resume_path MODEL_RESUME_PATH \ 23 | [--experiment_name EXPERIMENT_NAME] \ 24 | [--model_name MODEL_NAME] \ 25 | [--gpus GPUS] 26 | ``` 27 | 28 | --model_resume_path is a path to initialization model. 29 | 30 | ## Average checkpoints 31 | 32 | ``` 33 | python -u aispace/trainer.py \ 34 | --schedule avg_checkpoints \ 35 | --config_name CONFIG_NAME \ 36 | --config_dir CONFIG_DIR \ 37 | --prefix_or_checkpoints PREFIX_OR_CHECKPOINGS \ 38 | [--ckpt_weights CKPT_WEIGHTS] \ 39 | [--experiment_name EXPERIMENT_NAME] \ 40 | [--model_name MODEL_NAME] \ 41 | [--gpus GPUS] 42 | ``` 43 | 44 | --prefix_or_checkpoints is paths to multiple checkpoints separated by comma. 45 | 46 | --ckpt_weights is weights same order as the prefix_or_checkpoints. 
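For example, to average three checkpoints with weights 0.2, 0.3 and 0.5 (the configuration name and checkpoint prefixes below are illustrative — use the `ckpt_*` prefixes found in your own `checkpoint` directory):

```
python -u aispace/trainer.py \
    --schedule avg_checkpoints \
    --config_name tnews \
    --config_dir ./configs/glue_zh \
    --prefix_or_checkpoints save/test_bert_for_classification_119_0/checkpoint/ckpt_1,save/test_bert_for_classification_119_0/checkpoint/ckpt_2,save/test_bert_for_classification_119_0/checkpoint/ckpt_3 \
    --ckpt_weights 0.2,0.3,0.5
```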
47 | 48 | ## Deployment 49 | 50 | Before deployment, generate the deployment files by specifying the path of the model to be deployed (--model_resume_path), as follows. 51 | 52 | ``` 53 | python -u aispace/trainer.py \ 54 | --schedule deploy \ 55 | --config_name CONFIG_NAME \ 56 | --config_dir CONFIG_DIR \ 57 | --model_resume_path MODEL_RESUME_PATH \ 58 | [--experiment_name EXPERIMENT_NAME] \ 59 | [--model_name MODEL_NAME] \ 60 | [--gpus GPUS] 61 | ``` 62 | 63 | We use [BentoML](https://github.com/bentoml/BentoML) as the deployment tool, so you must implement the ***deploy*** function in your model class. 64 | 65 | ## Output file structure 66 | 67 | The default output path is ***save***, which may contain multiple output directories named as: 68 | 69 | ```text 70 | {experiment_name}_{model_name}_{dataset_name}_{random_seed}_{id} 71 | ``` 72 | 73 | Where ***id*** indicates the sequence number of the experiment for the same task, increasing from 0. 74 | 75 | Taking the text classification task as an example, the output file structure is similar to the following: 76 | 77 | ``` 78 | test_bert_for_classification_119_0 79 | ├── checkpoint # 1. checkpoints 80 | │   ├── checkpoint 81 | │   ├── ckpt_1.data-00000-of-00002 82 | │   ├── ckpt_1.data-00001-of-00002 83 | │   ├── ckpt_1.index 84 | | ... 85 | ├── deploy # 2. BentoML deployment directory 86 | │   └── BertTextClassificationService 87 | │   └── 20191208180211_B6FC81 88 | ├── hparams.json # 3. JSON file of all hyperparameters 89 | ├── logs # 4. general or TensorBoard log directory 90 | │   ├── errors.log # error log file 91 | │   ├── info.log # info log file 92 | │   ├── train 93 | │   │   ├── events.out.tfevents.1574839601.jshd-60-31.179552.14276.v2 94 | │   │   ├── events.out.tfevents.1574839753.jshd-60-31.profile-empty 95 | │   └── validation 96 | │   └── events.out.tfevents.1574839787.jshd-60-31.179552.151385.v2 97 | ├── model_saved # 5. last saved model 98 | │   ├── checkpoint 99 | │   ├── model.data-00000-of-00002 100 | │   ├── model.data-00001-of-00002 101 | │   └── model.index 102 | └── reports # 6. Eval reports for every output or task 103 | └── output_1_classlabel # For example, text classification task 104 | ├── confusion_matrix.txt 105 | ├── per_class_stats.json 106 | └── stats.json 107 | ``` --------------------------------------------------------------------------------
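As a companion to the ***deploy*** requirement in the quickstart above, here is a minimal sketch of a `deploy` method to define on a model class, modeled on the QA model in aispace/models/question_answer/bert_for_qa.py. The imported BentoML service class is an assumption for illustration; substitute the service defined in your own model's bento_services package.

```
# Minimal sketch of a deploy() method on a model class, assuming a classification service.
def deploy(self):
    from aispace.datasets.tokenizer import BaseTokenizer
    # Hypothetical service class; use the BentoML service defined next to your model.
    from .bento_services import BertTextClassificationService

    # Rebuild the tokenizer from the hyperparameters so the service can preprocess raw text.
    tokenizer = BaseTokenizer.by_name(self._hparams.dataset.tokenizer.name)(self._hparams.dataset.tokenizer)

    # Pack the model, tokenizer, and hyperparameters into the service and save the bundle.
    bento_service = BertTextClassificationService()
    bento_service.pack("model", self)
    bento_service.pack("tokenizer", tokenizer)
    bento_service.pack("hparams", self._hparams)
    return bento_service.save(self._hparams.get_deploy_dir())
```

The returned path corresponds to the ***deploy*** directory shown in the output file structure above, and is what BentoML serves from.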