├── .DS_Store ├── .gitignore ├── README.md ├── conf ├── config.yml ├── lora_config_bloom.json └── lora_config_llama.json ├── datas ├── README.md ├── cluener │ ├── README.md │ ├── dev.json │ ├── dev.txt │ ├── json_to_text.py │ ├── test.json │ ├── train.json │ └── train.txt ├── convert_dataset.py ├── process_datas.py ├── spnre_business_chance │ ├── dev.json │ └── train.json └── text_to_json.py ├── experiments ├── .DS_Store ├── cartography │ └── BQ_roberta-wwm.pdf ├── configs │ ├── tplinker │ │ ├── README.md │ │ ├── build_datasets.py │ │ ├── data_config.yaml │ │ ├── tplinker_config.py │ │ ├── tplinker_eval.yaml │ │ ├── tplinker_plus_config.py │ │ └── tplinker_train.yaml │ └── tplinker_plus │ │ ├── config.py │ │ ├── eval_config.yaml │ │ └── train_config.yaml ├── event_extraction │ ├── sujianlin │ │ ├── eval_model.py │ │ ├── predict.py │ │ └── train_model.py │ └── tplinker │ │ └── train_and_valid.py ├── ner │ ├── cluener_globalpointer.py │ ├── evaluate_tplinker_ner.py │ ├── lear_for_ner.py │ ├── mrc_for_ner.py │ ├── run_ner_crf.py │ ├── run_ner_softmax.py │ ├── run_ner_span.py │ ├── run_ner_tplinker.py │ ├── run_ner_tplinker_v2.py │ └── train_globalpointer.py ├── others │ └── child_tuning_on_ner.py ├── qa_and_text_generation │ ├── finetune_cdail_gpt.py │ ├── finetune_cdail_gpt_2.py │ ├── finetune_cpm_large_2.py │ ├── finetune_cpm_large_accelerate.py │ ├── finetune_llm_chat.py │ ├── finetune_unilm_for_seq2seq_liadrinz.py │ └── finetune_unilm_for_seq2seq_yunwen.py ├── relation_extraction │ ├── casel_train.py │ ├── data_loader_casrel.py │ ├── framework.py │ ├── kg_globalpointer.py │ ├── train_ee.py │ ├── train_relation_extraction_by_globalpointer.py │ ├── train_spn4re.py │ ├── train_tplinker.py │ └── train_tplinker_plus.py ├── scripts │ ├── decode_yunwen_unilm_for_se2seq.sh │ ├── finetune_cpm_large_2.sh │ ├── finetune_cpm_large_accelerate.sh │ ├── finetune_liadrinz_unilm.sh │ ├── finetune_qwen_7b_qlora.sh │ ├── finetune_yunwen_unilm.sh │ ├── run_child_tuning_on_ner.sh │ ├── run_duee1_sujianlin.sh │ ├── run_finetune_cdail_gpt.sh │ ├── run_finetune_cdail_gpt_2.sh │ ├── run_global_pointer_for_ner.sh │ ├── run_ner_crf.sh │ ├── run_ner_softmax.sh │ ├── run_ner_span.sh │ ├── run_pair_sup_con.sh │ ├── run_sbert_training_dynamics.sh │ ├── run_spn4re.sh │ └── run_unsup_vascl.sh ├── sentence_embedding │ ├── run_bert_whitening.py │ ├── run_cross_encoder.py │ ├── run_cross_encoder_2.py │ ├── run_pair_sup_con.py │ ├── run_sentence_bert.py │ ├── run_sentence_bert_2.py │ ├── run_sentence_bert_accuracy.py │ ├── run_sentence_bert_pairscl.py │ ├── run_simces_2.py │ ├── run_sup_cosent.py │ ├── run_sup_cosent_accuracy.py │ ├── run_sup_simcse.py │ ├── run_unsup_consert.py │ ├── run_unsup_consert_2.py │ ├── run_unsup_consert_3.py │ ├── run_unsup_simcse.py │ ├── run_unsup_vascl.py │ └── train_sentence_bert_training_dynamics.py └── single_test │ ├── argument_test.py │ ├── bart_test.py │ ├── bert4keras_ee.py │ ├── bert4keras_model.py │ ├── bert_crop_model_test.py │ ├── chatglm_test.py │ ├── data_selection_for_training_dynamics.py │ ├── decode_yunwen_unilm_for_seq2seq.py │ ├── globalpointer_test_sujianlin.py │ ├── imagen_test.py │ ├── inference_cdail_gpt.py │ ├── inference_cpm.py │ ├── inference_liadrinz_unilm_for_seq2seq.py │ ├── inference_yunwen_unilm_for_seq2seq.py │ ├── paddlenlp_test.py │ ├── position_embedding_test.py │ ├── relation_extraction_globalpointer.py │ ├── roformer_sim_test.py │ ├── roformer_test.py │ ├── simcse_tool.py │ ├── task_conditional_language_model.py │ ├── task_seq2seq_autotitle.py │ ├── 
train_dynamics_filtering.py │ ├── unlim_test.py │ └── wobert_test.py ├── license.txt ├── nlp ├── __init__.py ├── arguments │ ├── __init__.py │ ├── data_arguments.py │ ├── model_arguments.py │ └── train_arguments.py ├── callback │ ├── __init__.py │ ├── adversarial.py │ ├── child_tuning_fisher.py │ ├── lr_finder.py │ ├── lr_scheduler.py │ ├── modelcheckpoint.py │ ├── optimizers │ │ ├── __init__.py │ │ ├── adabound.py │ │ ├── adafactor.py │ │ ├── adamw.py │ │ ├── child_tuning_optimizer.py │ │ ├── ema.py │ │ ├── lamb.py │ │ ├── lars.py │ │ ├── lookahead.py │ │ ├── nadam.py │ │ ├── novograd.py │ │ ├── planradam.py │ │ ├── radam.py │ │ ├── ralamb.py │ │ ├── ralars.py │ │ └── sgdw.py │ ├── progressbar.py │ └── trainingmonitor.py ├── event_extractor │ ├── __init__.py │ └── event_extractor.py ├── layers │ ├── __init__.py │ ├── cnn.py │ ├── crf.py │ ├── global_pointer.py │ ├── layer.py │ ├── linears.py │ └── position_embeddings.py ├── losses │ ├── __init__.py │ └── loss.py ├── metrics │ ├── __init__.py │ ├── bleu_metric.py │ ├── metric.py │ ├── sematic_match_metric.py │ ├── spn4re_metric.py │ ├── tplinker_metric.py │ └── triplet_distance_metric.py ├── models │ ├── __init__.py │ ├── bert_for_ee.py │ ├── bert_for_ee_tplinker.py │ ├── bert_for_ner.py │ ├── bert_for_relation_extraction.py │ ├── bert_model.py │ ├── bert_spn4re.py │ ├── bertcrop.py │ ├── distill_model.py │ ├── generate_model.py │ ├── idcnn_for_crf.py │ ├── model_util.py │ ├── nezha.py │ ├── sentence_embedding_models.py │ ├── tplinker_plus_for_ner.py │ ├── transformer.py │ ├── unilm_model_liadrinz.py │ └── unilm_model_yunwen.py ├── processors │ ├── __init__.py │ ├── dataset.py │ ├── ee_seq.py │ ├── ee_span.py │ ├── global_pointer_processor.py │ ├── ner_seq.py │ ├── ner_span.py │ ├── predict_process.py │ ├── preprocess.py │ ├── semantic_match_preprocessor.py │ ├── spn4ner_processor.py │ ├── unilm_liadrinz_processor.py │ ├── unlim_yunwen_preprocessor.py │ ├── utils_ee.py │ └── utils_ner.py ├── sentence_transformers │ ├── LoggingHandler.py │ ├── SentenceTransformer.py │ ├── __init__.py │ ├── cross_encoder │ │ ├── CrossEncoder.py │ │ ├── __init__.py │ │ └── evaluation │ │ │ ├── CEBinaryClassificationEvaluator.py │ │ │ ├── CECorrelationEvaluator.py │ │ │ ├── CESoftmaxAccuracyEvaluator.py │ │ │ └── __init__.py │ ├── datasets │ │ ├── EncodeDataset.py │ │ ├── ParallelSentencesDataset.py │ │ ├── SentenceLabelDataset.py │ │ ├── SentencesDataset.py │ │ ├── __init__.py │ │ └── sampler │ │ │ ├── LabelSampler.py │ │ │ └── __init__.py │ ├── evaluation │ │ ├── BinaryClassificationEvaluator.py │ │ ├── EmbeddingSimilarityEvaluator.py │ │ ├── InformationRetrievalEvaluator.py │ │ ├── LabelAccuracyEvaluator.py │ │ ├── MSEEvaluator.py │ │ ├── MSEEvaluatorFromDataFrame.py │ │ ├── ParaphraseMiningEvaluator.py │ │ ├── SentenceEvaluator.py │ │ ├── SequentialEvaluator.py │ │ ├── SimilarityFunction.py │ │ ├── TranslationEvaluator.py │ │ ├── TripletEvaluator.py │ │ └── __init__.py │ ├── losses │ │ ├── AdvCLSoftmaxLoss.py │ │ ├── AdvCLSoftmaxLoss_refactoring.py │ │ ├── AdvCLSoftmaxLoss_single_stream_backup.py │ │ ├── AdvSimSiamLoss.py │ │ ├── BatchAllTripletLoss.py │ │ ├── BatchHardSoftMarginTripletLoss.py │ │ ├── BatchHardTripletLoss.py │ │ ├── BatchSemiHardTripletLoss.py │ │ ├── ContrastiveLoss.py │ │ ├── CosineSimilarityLoss.py │ │ ├── MSELoss.py │ │ ├── MegaBatchMarginLoss.py │ │ ├── MultipleNegativesRankingLoss.py │ │ ├── OnlineContrastiveLoss.py │ │ ├── SimCLRLoss.py │ │ ├── SimSiamLoss.py │ │ ├── SoftmaxLoss.py │ │ ├── TripletLoss.py │ │ └── __init__.py │ ├── 
models │ │ ├── ALBERT.py │ │ ├── BERT.py │ │ ├── BoW.py │ │ ├── CNN.py │ │ ├── CamemBERT.py │ │ ├── Dense.py │ │ ├── DistilBERT.py │ │ ├── LSTM.py │ │ ├── MLP3.py │ │ ├── Normalize.py │ │ ├── Pooling.py │ │ ├── RoBERTa.py │ │ ├── T5.py │ │ ├── Transformer.py │ │ ├── WKPooling.py │ │ ├── WeightedLayerPooling.py │ │ ├── WordEmbeddings.py │ │ ├── WordWeights.py │ │ ├── XLMRoBERTa.py │ │ ├── XLNet.py │ │ ├── __init__.py │ │ └── tokenizer │ │ │ ├── PhraseTokenizer.py │ │ │ ├── WhitespaceTokenizer.py │ │ │ ├── WordTokenizer.py │ │ │ └── __init__.py │ ├── readers │ │ ├── InputExample.py │ │ ├── LabelSentenceReader.py │ │ ├── NLIDataReader.py │ │ ├── PairedFilesReader.py │ │ ├── STSDataReader.py │ │ ├── TripletReader.py │ │ └── __init__.py │ └── util.py ├── tools │ ├── __init__.py │ ├── accelerate_tracker.py │ ├── common.py │ ├── convert_nezha_original_tf_checkpoint_to_pytorch.py │ ├── convert_tf_to_pytorch.py │ ├── dataloader.py │ ├── file_util.py │ ├── format_conv.py │ ├── path.py │ ├── plot.py │ ├── pytorch_optimization.py │ └── utils.py ├── trainers │ ├── ChildTuningD.py │ ├── ChildTuningF.py │ └── __init__.py └── utils │ ├── __init__.py │ ├── bert_or_thesues_repalcement_scheduler.py │ ├── configuration_unilm.py │ ├── ee_arguments.py │ ├── enums.py │ ├── errors.py │ ├── factory.py │ ├── functions.py │ ├── generate_util.py │ ├── log_handler.py │ ├── official_tokenization.py │ ├── optimization.py │ ├── selection_utils.py │ ├── taggers.py │ ├── tokenization_unilm.py │ ├── tokenizers.py │ ├── tplinker_plus_ner_util.py │ ├── tplinker_plus_utils.py │ ├── tplinker_utils.py │ ├── util.py │ ├── vat_utils.py │ ├── whitening_utils.py │ └── wobert_tokenization.py └── requirements.txt /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhihao-chen/NLP-experiments/24265c645fcadbc8a3117391823f8cf3b88128c8/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/** 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # test module 个人习惯,测试目录去掉 10 | /test/ 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | *.manifest 37 | *.spec 38 | .idea/ 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # celery beat schedule file 90 | celerybeat-schedule 91 | workspace.xml 92 | 93 | # SageMath parsed files 94 | *.sage.py 95 | .idea/workspace.xml 96 | 97 | # Environments 98 | .env 99 | .venv 100 | env/ 101 | venv/ 102 | ENV/ 103 | env.bak/ 104 | venv.bak/ 105 | 106 | # Spyder project settings 107 | .spyderproject 108 | .spyproject 109 | 110 | # Rope project settings 111 | .ropeproject 112 | 113 | # mkdocs documentation 114 | /site 115 | 116 | # mypy 117 | .mypy_cache/ 118 | .dmypy.json 119 | dmypy.json 120 | 121 | # Pyre type checker 122 | .pyre/ 123 | datas/kg/ 124 | datas/news2/ 125 | datas/ner/ 126 | datas/spn4re/ 127 | datas/tplinker/ 128 | .DS_Store/ --------------------------------------------------------------------------------
/README.md: -------------------------------------------------------------------------------- 1 | ## NLP Experiments 2 | This project contains my own NLP-related experiments, together with methods I have collected and organized. 3 | It currently covers named entity recognition, entity relation extraction, event extraction and semantic matching; other experiments such as classification, text generation and question answering will be added later. 4 | 5 | ## Named Entity Recognition 6 | Experiments are in `experiments/ner`; run scripts are in `experiments/scripts` 7 | 8 | ## Relation Extraction 9 | Experiments are in `experiments/relation_extraction`; run scripts are in `experiments/scripts` 10 | 11 | ## Event Extraction 12 | Experiments are in `experiments/event_extraction` 13 | 14 | ## Semantic Matching 15 | Experiments are in `experiments/sentence_embedding` 16 | 17 | ## Text Generation and LLM Fine-tuning 18 | Experiments are in `experiments/qa_and_text_generation` 19 | --------------------------------------------------------------------------------
/conf/config.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhihao-chen/NLP-experiments/24265c645fcadbc8a3117391823f8cf3b88128c8/conf/config.yml --------------------------------------------------------------------------------
/conf/lora_config_bloom.json: -------------------------------------------------------------------------------- 1 | { 2 | "lora_r": 16, 3 | "lora_alpha": 32, 4 | "lora_dropout": 0.05, 5 | "lora_target_modules": [ 6 | "query_key_value", 7 | "mlp" 8 | ] 9 | } --------------------------------------------------------------------------------
/conf/lora_config_llama.json: -------------------------------------------------------------------------------- 1 | { 2 | "lora_r": 16, 3 | "lora_alpha": 32, 4 | "lora_dropout": 0.05, 5 | "lora_target_modules": [ 6 | "q_proj", 7 | "k_proj", 8 | "v_proj", 9 | "o_proj", 10 | "down_proj", 11 | "gate_proj", 12 | "up_proj" 13 | ] 14 | } --------------------------------------------------------------------------------
/datas/cluener/README.md: -------------------------------------------------------------------------------- 1 | CLUENER 细粒度命名实体识别 2 | 3 | 数据分为10个标签类别,分别为: 4 | 地址(address), 5 | 书名(book), 6 | 公司(company), 7 | 游戏(game), 8 | 政府(goverment), 9 | 电影(movie), 10 | 姓名(name), 11 | 组织机构(organization), 12 | 职位(position), 13 | 
景点(scene) 14 | 15 | 数据详细介绍、基线模型和效果测评,见 https://github.com/CLUEbenchmark/CLUENER 16 | 17 | 技术讨论或问题,请项目中提issue或PR,或发送电子邮件到 ChineseGLUE@163.com 18 | 19 | 测试集上SOTA效果见榜单:www.CLUEbenchmark.com -------------------------------------------------------------------------------- /datas/cluener/json_to_text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: BERT-NER-Pytorch 5 | File Name: json_to_text 6 | Author: czh 7 | Create Date: 2021/6/24 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | import json 13 | 14 | 15 | file_names = ['train.json', 'dev.json'] 16 | for name in file_names: 17 | print("processing the file {}".format(name)) 18 | prefix = name.split('.')[0] 19 | with open(name, encoding="utf-8") as fr, open(prefix+'.txt', 'w', encoding='utf8') as fw: 20 | for line in fr: 21 | line = line.strip() 22 | if not line: 23 | continue 24 | data = json.loads(line) 25 | text = data['text'] 26 | tokens = list(text) 27 | labels = ['O' for _ in tokens] 28 | try: 29 | if data.get('label'): 30 | for label, dic in data['label'].items(): 31 | for token, lsts in dic.items(): 32 | for lst in lsts: 33 | labels[lst[0]] = "B-"+label 34 | labels[lst[0]+1:lst[-1]+1] = ["I-"+label]*(lst[-1]-lst[0]) 35 | except Exception as e: 36 | print(data) 37 | raise e 38 | for j,t in enumerate(tokens): 39 | fw.write(t+' '+labels[j]+'\n') 40 | fw.write('\n') 41 | print("has processed the file {}".format(name)) 42 | -------------------------------------------------------------------------------- /datas/text_to_json.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: text_to_json 6 | Author: czh 7 | Create Date: 2021/8/19 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | # 格式转换为{"text": "郑阿姨就赶到文汇路排队拿钱,希望能将缴纳的一万余元学费拿回来,顺便找校方或者教委要个说法。", "label": {"address": {"文汇路": [[6, 8]]}}} 13 | import codecs 14 | import json 15 | import numpy as np 16 | from nlp.processors import get_entities 17 | 18 | data_file = "ner/" 19 | 20 | 21 | def trans_data(data_type): 22 | file_name = data_file + f"{data_type}.txt" 23 | lines = [] 24 | with codecs.open(file_name, encoding='utf8') as fr: 25 | words = [] 26 | labels = [] 27 | for line in fr: 28 | if line.startswith("-DOCSTART-") or line == "" or line == "\n": 29 | if words: 30 | lines.append({"text": "".join(words), "labels": labels}) 31 | words = [] 32 | labels = [] 33 | else: 34 | splits = line.split(" ") 35 | words.append(splits[0]) 36 | if len(splits) > 1: 37 | labels.append(splits[-1].replace("\n", "")) 38 | else: 39 | # Examples could have no label for mode = "test" 40 | labels.append("O") 41 | if words: 42 | lines.append({"text": "".join(words), "labels": labels}) 43 | results = [] 44 | for item in lines: 45 | text = item["text"] 46 | labels = item["labels"] 47 | subjects = get_entities(labels, id2label=None, markup='bios') 48 | label_dict = {} 49 | for subject in subjects: 50 | label = subject[0] 51 | start = subject[1] 52 | end = subject[2] 53 | word = text[start: end+1] 54 | if label not in label_dict: 55 | label_dict[label] = {} 56 | if word not in label_dict[label]: 57 | label_dict[label][word] = [] 58 | label_dict[label][word].append([start, end]) 59 | results.append({"text": 
text, "label": label_dict}) 60 | 61 | return results 62 | 63 | 64 | def save_datas(datas, data_type): 65 | file_name = data_file + f"{data_type}.json" 66 | with codecs.open(file_name, 'w', encoding='utf8') as fw: 67 | for line in datas: 68 | line = json.dumps(line, ensure_ascii=False) 69 | fw.write(line + '\n') 70 | 71 | 72 | def main(): 73 | train_datas = trans_data("train") 74 | dev_datas = trans_data("dev") 75 | test_datas = trans_data("test") 76 | 77 | all_datas = train_datas + dev_datas + test_datas 78 | np.random.shuffle(all_datas) 79 | 80 | num = len(all_datas) 81 | print("total num: ", num) 82 | train_num = int(num * 0.8) 83 | train_datas = all_datas[:train_num] 84 | dev_datas = all_datas[train_num: train_num+int(num*0.1)] 85 | test_datas = all_datas[train_num+int(num*0.1):] 86 | 87 | print("train num: ", len(train_datas)) 88 | print("dev num: ", len(dev_datas)) 89 | print("test num: ", len(test_datas)) 90 | 91 | save_datas(train_datas, "train") 92 | save_datas(dev_datas, "dev") 93 | save_datas(test_datas, "test") 94 | 95 | 96 | if __name__ == "__main__": 97 | main() 98 | -------------------------------------------------------------------------------- /experiments/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhihao-chen/NLP-experiments/24265c645fcadbc8a3117391823f8cf3b88128c8/experiments/.DS_Store -------------------------------------------------------------------------------- /experiments/cartography/BQ_roberta-wwm.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhihao-chen/NLP-experiments/24265c645fcadbc8a3117391823f8cf3b88128c8/experiments/cartography/BQ_roberta-wwm.pdf -------------------------------------------------------------------------------- /experiments/configs/tplinker/README.md: -------------------------------------------------------------------------------- 1 | 参考:https://github.com/131250208/TPlinker-joint-extraction -------------------------------------------------------------------------------- /experiments/configs/tplinker/data_config.yaml: -------------------------------------------------------------------------------- 1 | exp_name: NEWS # nyt_star, nyt, webnlg_star, webnlg, ace05_lu 2 | data_in_dir: ../datas 3 | ori_data_format: casrel # casrel (webnlg_star, nyt_star), etl_span (webnlg), raw_nyt (nyt), tplinker (see readme) 4 | 5 | # if build data for BERT, use these 3 lines and comment the following 2 lines. 6 | encoder: BERT 7 | bert_path: hfl/chinese-bert-wwm-ext # chinese-bert-wwm-ext-hit, bert-base-cased 8 | data_out_dir: ../datas/data4bert 9 | 10 | # # if build data for BiLSTM, use these 2 lines and comment above 3 lines. 11 | # encoder: BiLSTM 12 | # data_out_dir: ../data4bilstm 13 | 14 | # if only reproduce the results, no need to change the args below. 15 | # separate_char_by_white: e.g. 
"$%sdkn839," -> "$% sdkn839 ," , will make original char spans invalid 16 | # add_char_span: set add_char_span to false if it already exists 17 | # ignore_subword: when adding character level spans, match words with whitespace around: " word ", to avoid subword match, set false for chinese 18 | # check_tok_span: check whether there is any error with token spans, if there is, print the unmatch info 19 | add_char_span: true 20 | ignore_subword: false 21 | separate_char_by_white: false 22 | check_tok_span: true -------------------------------------------------------------------------------- /experiments/configs/tplinker/tplinker_eval.yaml: -------------------------------------------------------------------------------- 1 | exp_name: webnlg_single 2 | model_state_dict_dir: ./wandb 3 | run_ids: 4 | - 3k0y4z53 5 | 6 | last_k_model: 1 7 | 8 | 9 | test_data: "*test*.json" 10 | rel2id: rel2id.json 11 | 12 | device_num: 1 13 | 14 | # encoder: BERT 15 | # data_home: ../data4bert 16 | # bert_path: /home/wangyucheng/opt/transformers_models_h5/bert-base-cased 17 | 18 | # for BiLSTM 19 | encoder: BiLSTM 20 | enc_hidden_size: 300 21 | dec_hidden_size: 600 22 | emb_dropout: 0.1 23 | rnn_dropout: 0.1 24 | word_embedding_dim: 300 25 | token2idx: token2idx.json 26 | data_home: ../data4bilstm 27 | 28 | batch_size: 32 29 | force_split: false 30 | max_test_seq_len: 512 31 | sliding_len: 20 32 | match_pattern: only_head_text 33 | shaking_type: cat 34 | # distance emb, ent_add_dist and rel_add_dist are valid only if dist_emb_size != -1 35 | dist_emb_size: 512 36 | ent_add_dist: true 37 | rel_add_dist: true 38 | 39 | # results 40 | save_res: false 41 | save_res_dir: ../results 42 | # score: set true only when test set tagged 43 | score: true -------------------------------------------------------------------------------- /experiments/configs/tplinker/tplinker_train.yaml: -------------------------------------------------------------------------------- 1 | exp_name: nyt_single 2 | run_name: TP1+Cat+BE 3 | 4 | train_data: train_data.json 5 | valid_data: valid_data.json 6 | rel2id: rel2id.json 7 | 8 | device_num: 1 9 | 10 | # set logger 11 | # if use default logger, must provide a log path and a path to save model, if use wandb, model state will be upload to the cloud 12 | # logger: wandb # wandb, default 13 | 14 | logger: default 15 | log_path: ./default.log 16 | path_to_save_model: ./model_state 17 | 18 | encoder: BERT 19 | data_home: ../data4bert 20 | bert_path: /home/wangyucheng/opt/transformers_models_h5/bert-base-cased 21 | 22 | # encoder: BiLSTM 23 | # token2idx: token2idx.json 24 | # data_home: ../data4bilstm 25 | # pretrained_word_embedding_path: ../pretrained_word_emb/glove_300_nyt.emb 26 | 27 | hyper_parameters: 28 | batch_size: 24 29 | epochs: 200 30 | lr: 5e-5 31 | seed: 2333 32 | log_interval: 10 33 | max_seq_len: 100 34 | sliding_len: 20 35 | loss_weight_recover_steps: 10000 36 | shaking_type: cat 37 | # distance emb, ent_add_dist and rel_add_dist are valid only if dist_emb_size != -1 38 | dist_emb_size: -1 39 | ent_add_dist: false 40 | rel_add_dist: false 41 | match_pattern: only_head_text 42 | 43 | # CosineAnnealingWarmRestarts 44 | scheduler: CAWR # Step 45 | T_mult: 1 46 | rewarm_epoch_num: 2 47 | 48 | # # StepLR 49 | # scheduler: Step 50 | # decay_rate: 0.99 51 | # decay_steps: 100 52 | 53 | # # for BiLSTM 54 | # enc_hidden_size: 300 55 | # dec_hidden_size: 600 56 | # emb_dropout: 0.1 57 | # rnn_dropout: 0.1 58 | # word_embedding_dim: 300 59 | 60 | # when to save the model state dict 61 | f1_2_save: 
0 62 | # whether train from scratch 63 | fr_scratch: true 64 | # note 65 | note: start from scratch 66 | # if not fr scratch, set a model_state_dict 67 | model_state_dict_path: stake -------------------------------------------------------------------------------- /experiments/configs/tplinker_plus/eval_config.yaml: -------------------------------------------------------------------------------- 1 | exp_name: covid19_rel_lianxiangjia 2 | model_state_dict_dir: ./wandb 3 | run_ids: 4 | # no dist 5 | # - 1w4mk6mr 6 | # - 3pdl0yv9 7 | # - e48u3t8g 8 | # - 14ooykvx 9 | # - 3tasch7j 10 | # + dist emb 11 | - 2s5jq1ho 12 | - llvea66v 13 | - 3mjun6d5 14 | - 31hmg1nh 15 | - 3r96671x 16 | 17 | last_k_model: 1 18 | 19 | data_home: ../data4bert 20 | test_data: "*test*.json" 21 | rel2id: rel2id.json 22 | 23 | device_num: 1 24 | 25 | encoder: BERT 26 | bert_path: /home/wangyucheng/opt/transformers_models_h5/bert-base-cased 27 | 28 | # # for BiLSTM 29 | # enc_hidden_size: 128 30 | # dec_hidden_size: 256 31 | # emb_dropout: 0.1 32 | # rnn_dropout: 0.1 33 | # word_embedding_dim: 100 34 | # token2idx: token2idx.json 35 | # pretrained_word_embedding_path: ../pretrained_word_emb/glove_100_webnlg.emb 36 | 37 | batch_size: 16 38 | force_split: false 39 | max_test_seq_len: 200 40 | sliding_len: 50 41 | shaking_type: cln 42 | dist_emb_size: 512 43 | tok_pair_sample_rate: 1 44 | correct: whole_text 45 | 46 | # results 47 | save_res: true 48 | save_res_dir: ../results 49 | # score: set true only when test set tagged 50 | score: false 51 | -------------------------------------------------------------------------------- /experiments/configs/tplinker_plus/train_config.yaml: -------------------------------------------------------------------------------- 1 | exp_name: nyt 2 | run_name: TP2+Cat+Dist+BiLSTM 3 | 4 | train_data: train_data.json 5 | valid_data: valid_data.json 6 | rel2id: rel2id.json 7 | 8 | device_num: 0 9 | 10 | # set logger 11 | # if use default logger, must provide a log path and a path to save model, if use wandb, model state will be upload to the cloud 12 | logger: wandb # wandb, default 13 | 14 | # logger: default 15 | # log_path: ./default.log 16 | # path_to_save_model: ./model_state 17 | 18 | # encoder: BERT 19 | # data_home: ../data4bert 20 | # bert_path: /home/wangyucheng/opt/transformers_models_h5/bert-base-cased 21 | 22 | encoder: BiLSTM 23 | token2idx: token2idx.json 24 | data_home: ../data4bilstm 25 | pretrained_word_embedding_path: ../pretrained_word_emb/glove_300_nyt.emb 26 | 27 | hyper_parameters: 28 | batch_size: 24 29 | epochs: 200 30 | lr: 1e-3 31 | seed: 2333 32 | log_interval: 10 33 | max_seq_len: 100 34 | sliding_len: 20 35 | shaking_type: cat 36 | dist_emb_size: 512 37 | tok_pair_sample_rate: 1 38 | 39 | # CosineAnnealingWarmRestarts 40 | scheduler: CAWR # Step 41 | T_mult: 1 42 | rewarm_epoch_num: 2 43 | 44 | # # StepLR 45 | # scheduler: Step 46 | # decay_rate: 0.99 47 | # decay_steps: 100 48 | 49 | # scheduler: ReduceLROnPlateau 50 | 51 | # # for BiLSTM 52 | # enc_hidden_size: 128 53 | # dec_hidden_size: 256 54 | # emb_dropout: 0.1 55 | # rnn_dropout: 0.1 56 | # word_embedding_dim: 100 57 | 58 | # whether train from scratch 59 | fr_scratch: true 60 | note: start from scratch 61 | # when to save the model state dict 62 | f1_2_save: 0.0 63 | 64 | # if not, give a model_state_dict 65 | model_state_dict_path: stake -------------------------------------------------------------------------------- /experiments/event_extraction/sujianlin/eval_model.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: EventExtraction 5 | File Name: eval_model 6 | Author: czh 7 | Create Date: 2021/9/16 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | from nlp.event_extractor.event_extractor import EventExtractor 13 | from nlp.utils.ee_arguments import DataAndTrainArguments 14 | 15 | config = { 16 | 'task_name': 'ee', # ee 17 | 'data_dir': '../data/normal_data/news2', 18 | 'model_type': 'bert', # bert, nezha 19 | 'model_name_or_path': 'hfl/chinese-roberta-wwm-ext', # nezha-base-wwm 20 | 'output_dir': '../data/output/', # 模型训练中保存的中间结果,模型,日志等文件的主目录 21 | 'do_lower_case': False, # 主要是tokenize时是否将大写转为小写 22 | 'use_lstm': False, # 默认为False, 表示模型结构为bert_crf 23 | 'no_cuda': False, # 是否使用GPU。默认为False, 表示只使用CPU 24 | 'eval_max_seq_length': 128, # 默认为512 25 | 'per_gpu_eval_batch_size': 8, 26 | 'cuda_number': '0', # '0,1,2,3' 使用GPU时需指定GPU卡号 27 | } 28 | 29 | args = DataAndTrainArguments(**config) # noqa 30 | extractor = EventExtractor(args) 31 | 32 | # evaluate all checkpoints file for the dev datasets 33 | # extractor.evaluate(eval_all_checkpoints=True) 34 | 35 | # only evaluate best model for the dev datasets 36 | # extractor.evaluate() 37 | 38 | # evaluate all checkpoints file for the test datasets, and the test datasets sample must labeled 39 | # extractor.evaluate(data_type='test', eval_all_checkpoints=True) 40 | 41 | # only evaluate best model for the test datasets, and the test datasets sample must labeled 42 | extractor.evaluate(data_type='test') 43 | -------------------------------------------------------------------------------- /experiments/event_extraction/sujianlin/predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: EventExtraction 5 | File Name: predict_raw_text 6 | Author: czh 7 | Create Date: 2021/9/16 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | import json 13 | 14 | from nlp.event_extractor.event_extractor import EventExtractor 15 | from nlp.utils.ee_arguments import DataAndTrainArguments 16 | 17 | config = { 18 | 'task_name': 'ee', 19 | 'model_type': 'bert', 20 | 'use_lstm': True, # 默认是False 21 | 'eval_max_seq_length': 512, 22 | } 23 | 24 | args = DataAndTrainArguments(**config) # noqa 25 | extractor = EventExtractor(args, state='pred', model_path='../data/model') 26 | 27 | # data_type: 只能是'test',或者None。若为test则表示在测试数据集上预测 28 | # input_texts: 若不为空,则表示是预测新的数据 29 | # pred_output_dir: 若不为空,则表示将预测结果写入指定位置保存,可以是目录,也可以是文件 30 | 31 | # 表示在测试数据集上预测, 不保存预测结果 32 | # for res in extractor.predict(data_type='test'): 33 | # print(res) 34 | 35 | # 表示在测试数据集上预测, 保存预测结果 36 | # for res in extractor.predict(data_type='test', pred_output_dir="../data/output/bert"): 37 | # print(res) 38 | 39 | # 表示预测raw text, raw text可以是str, List[str] 40 | # texts = "博盛医疗完成Pre-A轮融资澳银资本重点参与" 41 | texts = ["博盛医疗完成Pre-A轮融资澳银资本重点参与", 42 | "百炼智能完成A轮一亿元融资,由今日头条领投"] 43 | for res in extractor.predict(input_texts=texts): 44 | print(json.dumps(res, ensure_ascii=False, indent=2)) 45 | -------------------------------------------------------------------------------- /experiments/event_extraction/sujianlin/train_model.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: EventExtraction 5 | File Name: train_model 6 | Author: czh 7 | Create Date: 2021/9/15 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | from nlp.event_extractor.event_extractor import EventExtractor 13 | from nlp.utils.ee_arguments import DataAndTrainArguments 14 | 15 | config = { 16 | 'task_name': 'ner', # ner 17 | 'data_dir': '../data/normal_data/ner', 18 | 'model_type': 'bert', # bert, nezha 19 | 'model_name_or_path': 'hfl/chinese-roberta-wwm-ext', # '/data/chenzhihao/nezha-base-www' 20 | 'model_sate_dict_path': '../data/output/bert/best_model', # 保存的checkpoint文件地址用于继续训练 21 | 'output_dir': '../data/output/', # 模型训练中保存的中间结果,模型,日志等文件的主目录False 22 | 'do_lower_case': False, # 主要是tokenize时是否将大写转为小写 23 | 'cache_dir': '', # 指定下载的预训练模型保存地址 24 | 'evaluate_during_training': True, # 是否在训练过程中验证模型, 默认为True 25 | 'use_lstm': False, # 默认为False, 表示模型结构为bert_crf 26 | 'from_scratch': True, # 是否从头开始训练,默认为True 27 | 'from_last_checkpoint': False, # 是否从最新的checkpoint模型继续训练,默认为False 28 | 'early_stop': False, 29 | 'overwrite_output_dir': True, 30 | 'overwrite_cache': True, # 是否重写特征,默认为True,若为False表示从特征文件中加载特征 31 | 'no_cuda': False, # 是否使用GPU。默认为False, 表示只使用CPU 32 | 'fp16': True, 33 | 'train_max_seq_length': 32, # 默认为512 34 | 'eval_max_seq_length': 32, # 默认为512 35 | 'per_gpu_train_batch_size': 16, 36 | 'per_gpu_eval_batch_size': 16, 37 | 'gradient_accumulation_steps': 1, 38 | 'learning_rate': 5e-05, # bert和lstm的学习率 39 | 'crf_learning_rate': 5e-05, 40 | 'weight_decay': 0.01, 41 | 'adam_epsilon': 1e-08, 42 | 'warmup_proportion': 0.1, 43 | 'num_train_epochs': 3.0, 44 | 'max_steps': -1, # 当指定了该字段值后,'num_train_epochs'就不起作用了 45 | 'tolerance': 5, # 指定early stop容忍的epoch数量 46 | 'logging_steps': 500, # 指定tensorboard日志在哪个阶段记录 47 | 'save_steps': 500, # 指定哪些步骤保存中间训练结果 48 | # ["linear","cosine","cosine_with_restarts","polynomial","constant","constant_with_warmup"] 49 | 'scheduler_type': 'linear', 50 | 'cuda_number': '3', # '0,1,2,3' 使用GPU时需指定GPU卡号 51 | 'seed': 2333, 52 | 'dropout_rate': 0.3 53 | } 54 | 55 | args = DataAndTrainArguments(**config) # noqa 56 | extractor = EventExtractor(args) 57 | 58 | # training from scratch, set config['from_scratch'] = True 59 | extractor.train_and_valid() 60 | 61 | # continue train from 'model_sate_dict_path', set config['from_scratch'] = False 62 | # extractor.train_and_valid() 63 | 64 | # continue train from last checkpoint file, set config['from_scratch'] = False, config['from_last_checkpoint']=True. 
65 | # And should rise the 'num_train_epochs' 66 | # extractor.train_and_valid() 67 | -------------------------------------------------------------------------------- /experiments/ner/lear_for_ner.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: lear_for_ner 6 | Author: czh 7 | Create Date: 2022/3/11 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | import os 13 | import sys 14 | import json 15 | from tqdm import tqdm 16 | import codecs 17 | from typing import List, Tuple 18 | sys.path.append("/data/chenzhihao/NLP") 19 | 20 | import numpy as np 21 | import torch 22 | import torch.nn as nn 23 | from torch.optim import AdamW 24 | from torch.utils.data import DataLoader, Dataset 25 | from transformers import BertConfig, BertTokenizerFast 26 | 27 | from nlp.models.bert_for_ner import LearForNer 28 | from nlp.tools.path import project_root_path 29 | -------------------------------------------------------------------------------- /experiments/ner/run_ner_tplinker_v2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: run_ner_tplinker_v2 6 | Author: czh 7 | Create Date: 2021/8/24 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | import time 13 | 14 | from nlp.utils.tplinker_plus_utils import DataAndTrainArguments 15 | from nlp.models.tplinker_plus_for_ner import TPLinkerPlusForNER 16 | 17 | root_dir = "/data/chenzhihao/NLP/experiments" 18 | config = { 19 | "bert_name_or_path": "hfl/chinese-roberta-wwm-ext", 20 | "data_dir": root_dir + "/datas/tplinker", 21 | "task_name": "ner", 22 | "model_type": "BERT", 23 | "train_data_name": "train_data.json", 24 | "valid_data_name": "valid_data.json", 25 | "ent2id": "ent2id.json", 26 | "output_dir": root_dir + "/output_file_dir/tplinker_plus_ner_bert/train_results", 27 | "log_dir": root_dir + "/logs/tplinker_plus_ner.log", 28 | "tensorboard_log_dir": root_dir + "/tensorboard/tplinker_plus_ner/", 29 | "path_to_save_model": root_dir + "/output_file_dir/tplinker_plus_ner_bert/train_results/best_model", 30 | "model_state_dict_path": root_dir + "/output_file_dir/tplinker_plus_ner_bert/train_results/best_model", 31 | "save_res_dir": root_dir + "/output_file_dir/tplinker_plus_ner_bert/eval_results", 32 | "score": True, # set true only if test set is tagged 33 | "n_gpu": "0", 34 | "num_workers": 4, 35 | "logger": "default", 36 | "train_batch_size": 16, 37 | "eval_batch_size": 16, 38 | "epochs": 4, 39 | "fp16": True, 40 | "gradient_accumulation_steps": 1, 41 | "shaking_type": "cln_plus", 42 | "match_pattern": "whole_text", 43 | "inner_enc_type": "lstm", 44 | "f1_2_save": 0, 45 | "fr_scratch": True, 46 | "fr_last_checkpoint": False, 47 | "note": "start from scratch", 48 | "log_interval": 10, 49 | "max_seq_len": 512, 50 | "sliding_len": 20, 51 | "last_k_model": 1, 52 | "scheduler": "CAWR", # Step 53 | "ghm": False, 54 | "tok_pair_sample_rate": 1, 55 | "force_split": False, 56 | "lr": 5e-5, 57 | "T_mult": 1, 58 | "rewarm_epoch_num": 2, 59 | "save_steps": 500, 60 | "logging_steps": 500 61 | } 62 | 63 | start = time.time() 64 | args = DataAndTrainArguments(**config) 65 | # print(args.__dict__) 66 | trainer = TPLinkerPlusForNER(args) 67 | 
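# The TPLinkerPlusForNER wrapper drives the whole pipeline below: init_env() prepares the run
# environment, train_and_valid() fine-tunes on train_data.json/valid_data.json, evaluate() scores
# the saved model, and init_others()/init_model()/restore() reload the checkpoint before predict()
# tags raw text. (Step descriptions are inferred from the calls made in this script; the
# implementation lives in nlp/models/tplinker_plus_for_ner.py.)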
trainer.init_env() 68 | 69 | # training 70 | trainer.train_and_valid() 71 | print(time.time()-start) 72 | 73 | start = time.time() 74 | # evaluating 75 | trainer.evaluate() 76 | print(time.time()-start) 77 | 78 | # predicting 79 | start = time.time() 80 | text = "百炼智能是一家人工智能科技公司,公司CEO是冯是聪" 81 | trainer.init_others(len(text)+2) 82 | model = trainer.init_model(16) 83 | trainer.restore(model) 84 | test_data, ori_test_data, max_seq_len = trainer.process_predict_data(text, max_seq_len=len(text)+2) 85 | result = trainer.predict(test_data=test_data, 86 | ori_test_data=ori_test_data, 87 | model=model, 88 | max_seq_len=max_seq_len, 89 | batch_size=1) 90 | print(result) 91 | print(time.time()-start) 92 | -------------------------------------------------------------------------------- /experiments/relation_extraction/train_relation_extraction_by_globalpointer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: train_relation_extraction_by_globalpointer 6 | Author: czh 7 | Create Date: 2022/2/9 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | import os 13 | import regex 14 | import glob 15 | import json 16 | import time 17 | from tqdm import tqdm 18 | import typing 19 | import codecs 20 | from collections import defaultdict 21 | 22 | import torch 23 | import torch.nn as nn 24 | from torch.optim import AdamW 25 | from torch.utils.data import DataLoader 26 | from transformers import BertConfig, BertTokenizerFast, HfArgumentParser, get_scheduler 27 | 28 | from nlp.models.bert_for_relation_extraction import GlobalPointerForRel 29 | 30 | -------------------------------------------------------------------------------- /experiments/scripts/decode_yunwen_unilm_for_se2seq.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/root/work2/work2/chenzhihao/NLP 2 | DATA_PATH=/root/work2/work2/chenzhihao/datasets/腾讯对话NaturalConv_Release_20210318 3 | MODEL_PATH=/root/work2/work2/chenzhihao/pretrained_models/torch_unilm_model 4 | MODEL_RECOVER_PATH=$PROJECT_PATH/datas/output_dir/unilm/yunwen_unilm/seq2seq_on_natural_conv/ 5 | OUTPUT_FILE=$PROJECT_PATH/datas/output_dir/unilm/yunwen_unilm/seq2seq_on_natural_conv/predict_.json 6 | 7 | MODEL_TYPE="unilm" 8 | 9 | export CUDA_VISIBLE_DEVICES=7 10 | python $PROJECT_PATH/experiments/single_test/decode_yunwen_unilm_for_seq2seq.py \ 11 | --model_type=$MODEL_TYPE \ 12 | --model_name_or_path $MODEL_PATH \ 13 | --model_recover_path=$MODEL_RECOVER_PATH \ 14 | --input_file=$DATA_PATH/test.json \ 15 | --split="test" \ 16 | --max_seq_length=512 \ 17 | --do_lower_case \ 18 | --batch_size=32 \ 19 | --beam_size=5 \ 20 | --max_tgt_length=128 -------------------------------------------------------------------------------- /experiments/scripts/finetune_cpm_large_2.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/root/work2/work2/chenzhihao/NLP 2 | DATA_PATH=/root/work2/work2/chenzhihao/datasets/腾讯对话NaturalConv_Release_20210318 3 | OUTPUT_DIR=$PROJECT_PATH/datas/output_dir/CPM-natural_conv/ 4 | LOGGING_DIR=$OUTPUT_DIR/logs 5 | MODEL_PATH="/root/work2/work2/chenzhihao/pretrained_models/CPM-generate" 6 | 7 | PROJECT_NAME='nlp' 8 | EXPERIMENT_NAME='cpm1-natural_conv' 9 | GROUP_NAME='cpm1_generate' 10 | SPEAKER1="用户:" 11 | SPEAKER2="\n机器人:" 12 | 13 | #export 
CUDA_VISIBLE_DEVICES=2,3 14 | #accelerate config 15 | accelerate launch $PROJECT_PATH/experiments/qa_and_text_generation/finetune_cpm_large_2.py \ 16 | --pretrained \ 17 | --model_checkpoint=$MODEL_PATH \ 18 | --config_path=$MODEL_PATH/config.json \ 19 | --tokenizer_path=$MODEL_PATH \ 20 | --data_path=$DATA_PATH \ 21 | --data_type="natural_conv" \ 22 | --output_dir=$OUTPUT_DIR \ 23 | --logging_dir=$LOGGING_DIR \ 24 | --project_name=$PROJECT_NAME \ 25 | --experiment_name=$EXPERIMENT_NAME \ 26 | --group_name=$GROUP_NAME \ 27 | --speaker1=$SPEAKER1 \ 28 | --speaker2=$SPEAKER2 \ 29 | --scheduler="linear" \ 30 | --num_epochs=15 \ 31 | --do_train \ 32 | --do_valid \ 33 | --do_test \ 34 | --train_path="train.txt" \ 35 | --valid_path="dev.txt" \ 36 | --test_path="test.txt" \ 37 | --train_batch_size=4 \ 38 | --valid_batch_size=4 \ 39 | --lr=2e-5 \ 40 | --warmup_steps=2000 \ 41 | --valid_steps=500 \ 42 | --gradient_accumulation_steps=32 \ 43 | --local_rank=0 \ 44 | --mixed_precision='fp16' \ 45 | --seed=2333 \ 46 | --with_tracking \ 47 | --max_seq_length=512 \ 48 | --max_history=10 \ 49 | --do_sample \ 50 | --top_k=0 \ 51 | --top_p=0.9 \ 52 | --temperature=0.75 \ 53 | --output_max_length=256 \ 54 | --output_min_length=2 -------------------------------------------------------------------------------- /experiments/scripts/finetune_cpm_large_accelerate.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/data2/work2/chenzhihao/NLP 2 | DATA_PATH=$PROJECT_PATH/datas/raw_datas/ 3 | OUTPUT_DIR=$PROJECT_PATH/datas/output_dir/CPM-large2/ 4 | LOGGING_DIR=$OUTPUT_DIR/logs 5 | MODEL_PATH="/data2/work2/chenzhihao/pretrained_models/CPM-generate" 6 | 7 | #export CUDA_VISIBLE_DEVICES=2,3 8 | #accelerate configs 9 | accelerate launch $PROJECT_PATH/examples/qa_and_text_generation/finetune_cpm_large_accelerate.py \ 10 | --pretrained \ 11 | --model_checkpoint=$MODEL_PATH \ 12 | --config_path=$MODEL_PATH/config.json \ 13 | --tokenizer_path=$MODEL_PATH \ 14 | --data_path=$DATA_PATH \ 15 | --output_dir=$OUTPUT_DIR \ 16 | --logging_dir=$LOGGING_DIR \ 17 | --scheduler="linear" \ 18 | --num_epochs=15 \ 19 | --train_batch_size=4 \ 20 | --valid_batch_size=4 \ 21 | --lr=2e-5 \ 22 | --warmup_steps=2000 \ 23 | --valid_steps=2000 \ 24 | --gradient_accumulation_steps=32 \ 25 | --local_rank=0 \ 26 | --mixed_precision='fp16' \ 27 | --seed=2333 \ 28 | --do_train \ 29 | --do_valid \ 30 | --do_test \ 31 | --with_tracking \ 32 | --max_seq_length=512 \ 33 | --do_sample \ 34 | --top_k=0 \ 35 | --top_p=0.0 \ 36 | --temperature=1.0 \ 37 | --output_max_length=256 \ 38 | --output_min_length=5 -------------------------------------------------------------------------------- /experiments/scripts/finetune_liadrinz_unilm.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/root/work2/work2/chenzhihao/NLP 2 | DATA_PATH=/root/work2/work2/chenzhihao/datasets/腾讯对话NaturalConv_Release_20210318/corpus.txt 3 | MODEL_TYPE=unilm 4 | MODEL_NAME=/root/work2/work2/chenzhihao/pretrained_models/unilm-chinese-base 5 | #MODEL_NAME=peterchou/unilm-chinese-base 6 | OUTPUT_DIR=$PROJECT_PATH/datas/output_dir/unilm/liadrinz_unilm/seq2seq_on_natural_conv 7 | 8 | export WANDB_DISABLED=true 9 | export CUDA_VISIBLE_DEVICES=7 10 | export OMP_NUM_THREADS=1 11 | python3 -u $PROJECT_PATH/experiments/qa_and_text_generation/finetune_unilm_for_seq2seq_liadrinz.py \ 12 | train \ 13 | --model_type ${MODEL_TYPE} \ 14 | --model_name_or_path ${MODEL_NAME} \ 15 | --batch_size 32 \ 16 | 
--corpus_file $DATA_PATH \ 17 | --max_seq_len 512 \ 18 | --seed 42 \ 19 | --output_dir ${OUTPUT_DIR} \ 20 | --gradient_accumulation_steps 2 \ 21 | --lr=2e-5 \ 22 | --num_train_epochs 5 \ 23 | --mask_prob 0.2 \ 24 | --local_rank=-1 \ 25 | --fp16 -------------------------------------------------------------------------------- /experiments/scripts/finetune_qwen_7b_qlora.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | ROOT_PATH=/home/aiteam/work2/NLP 4 | MODEL_NAME_OR_PATH=/home/aiteam/work/pretrained_models/Qwen-7B-Chat 5 | MODEL_TYPE=qwen-7b-chat #bloom,llama 6 | 7 | DATA_DIR=${ROOT_PATH}/datas/firefly 8 | OUTPUT_DIR=${ROOT_PATH}/datas/output_dir/${MODEL_TYPE}/instruct_qlora 9 | mkdir -p ${OUTPUT_DIR} 10 | 11 | CACHE_DIR=${ROOT_PATH}/datas/hf_cache_dir/${MODEL_TYPE}/instruct_qlora 12 | mkdir -p ${CACHE_DIR} 13 | 14 | CUTOFF_LEN=1024 15 | SYSTEM_PROMPT="" 16 | 17 | # V100不支持lora+fp16 18 | CUDA_VISIBLE_DEVICES="6" torchrun --nproc_per_node 1 ${ROOT_PATH}/experiments/qa_and_txt_generation/finetune_llm_chat.py \ 19 | --model_name_or_path ${MODEL_NAME_OR_PATH} \ 20 | --model_type ${MODEL_TYPE} \ 21 | --use_qlora True \ 22 | --bits 4 \ 23 | --lora_config ${ROOT_PATH}/config/lora_config_llama.json \ 24 | --train_file ${DATA_DIR}/train.json \ 25 | --validation_file ${DATA_DIR}/dev.json \ 26 | --chat_format 'chatml' \ 27 | --source_prefix "human" \ 28 | --target_prefix "assistant" \ 29 | --system_prompt $SYSTEM_PROMPT \ 30 | --per_device_train_batch_size 2 \ 31 | --per_device_eval_batch_size 2 \ 32 | --gradient_accumulation_steps 8 \ 33 | --num_train_epochs 3 \ 34 | --model_max_length ${CUTOFF_LEN} \ 35 | --save_strategy "steps" \ 36 | --save_steps 100 \ 37 | --learning_rate 8e-6 \ 38 | --weight_decay 0.00001 \ 39 | --warmup_ratio 0.05 \ 40 | --lr_scheduler_type "cosine" \ 41 | --logging_steps 50 \ 42 | --logging_strategy "steps" \ 43 | --evaluation_strategy "steps" \ 44 | --eval_steps 100 \ 45 | --fp16 True \ 46 | --seed 1234 \ 47 | --gradient_checkpointing True \ 48 | --cache_dir ${CACHE_DIR} \ 49 | --report_to "all" \ 50 | --output_dir ${OUTPUT_DIR} 51 | # --save_total_limit 5 \ 52 | # --metric_for_best_model "rouge-l" \ 53 | # --predict_with_generate True 54 | # --optim paged_adamw_32bit 55 | -------------------------------------------------------------------------------- /experiments/scripts/finetune_yunwen_unilm.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/root/work2/work2/chenzhihao/NLP 2 | DATA_PATH=/root/work2/work2/chenzhihao/datasets/腾讯对话NaturalConv_Release_20210318 3 | MODEL_PATH=/root/work2/work2/chenzhihao/pretrained_models/torch_unilm_model 4 | OUTPUT_DIR=$PROJECT_PATH/datas/output_dir/unilm/yunwen_unilm/seq2seq_on_natural_conv 5 | LOGGING_DIR=$OUTPUT_DIR/logs 6 | 7 | MODEL_TYPE="unilm" 8 | SOURCE_NAME="source" 9 | TARGET_NAME="target" 10 | 11 | export CUDA_VISIBLE_DEVICES=7 12 | python $PROJECT_PATH/experiments/qa_and_text_generation/finetune_unilm_for_seq2seq_yunwen.py \ 13 | --data_dir $DATA_PATH \ 14 | --model_type=$MODEL_TYPE \ 15 | --model_name_or_path $MODEL_PATH \ 16 | --output_dir $OUTPUT_DIR \ 17 | --log_dir $LOGGING_DIR \ 18 | --src_file="source.json" \ 19 | --source=$SOURCE_NAME \ 20 | --target=$TARGET_NAME \ 21 | --max_seq_length=512 \ 22 | --max_position_embeddings=512 \ 23 | --do_train \ 24 | --do_lower_case \ 25 | --train_batch_size=32 \ 26 | --learning_rate=1e-5 \ 27 | --num_train_epochs=10 \ 28 | --scheduler="linear" \ 29 | --local_rank=-1 \ 30 | 
--gradient_accumulation_steps=1 \ 31 | --seed=2333 \ 32 | --fp16 \ 33 | --fp16_opt_level='O1' 34 | 35 | -------------------------------------------------------------------------------- /experiments/scripts/run_child_tuning_on_ner.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/data/chenzhihao/NLP 2 | export PYTHONPATH=$PYTHONPATH:$PROJECT_PATH/ 3 | CURRENT_DIR=$PROJECT_PATH/experiments 4 | DATA_DIR=$PROJECT_PATH/datas 5 | OUTPUR_DIR=$CURRENT_DIR/output_file_dir 6 | LOG_DIR=$CURRENT_DIR/logs 7 | BERT_BASE_DIR=/data/chenzhihao/chinese-roberta-ext 8 | TASK_NAME="cluener" 9 | MODEL_TYPE="bert" 10 | 11 | python /data/chenzhihao/NLP/experiments/others/child_tuning_on_ner.py \ 12 | --model_name_or_path=$BERT_BASE_DIR \ 13 | --model_type=$MODEL_TYPE \ 14 | --task_name=$TASK_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict_no_tag \ 18 | --data_dir=$DATA_DIR/${TASK_NAME}/ \ 19 | --max_seq_length=256 \ 20 | --per_gpu_train_batch_size=32 \ 21 | --per_gpu_eval_batch_size=32 \ 22 | --learning_rate=3e-5 \ 23 | --crf_learning_rate=3e-3 \ 24 | --num_train_epochs=30.0 \ 25 | --fp16 \ 26 | --fp16_backend=amp \ 27 | --gradient_accumulation_steps=1 \ 28 | --warmup_ratio=0.1 \ 29 | --logging_steps=500 \ 30 | --save_steps=500 \ 31 | --eval_steps=1000 \ 32 | --save_total_limit=10 \ 33 | --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ 34 | --logging_dir=$LOG_DIR/ \ 35 | --overwrite_output_dir \ 36 | --overwrite_cache \ 37 | --seed=42 \ 38 | --cuda_number=0 \ 39 | --markup=bios \ 40 | --reserve_p=0.3 \ 41 | --mode=ChildTuning-D \ 42 | --metric_for_best_model=f1 \ 43 | --greater_is_better -------------------------------------------------------------------------------- /experiments/scripts/run_duee1_sujianlin.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/data/chenzhihao/NLP 2 | export PYTHONPATH=$PYTHONPATH:$PROJECT_PATH/ 3 | CURRENT_DIR=$PROJECT_PATH/experiments 4 | DATA_DIR=$PROJECT_PATH/datas 5 | OUTPUR_DIR=$CURRENT_DIR/output_file_dir 6 | LOG_DIR=$CURRENT_DIR/logs 7 | BERT_BASE_DIR=/data/chenzhihao/chinese-roberta-ext 8 | DATA_FORMAT='duee1' 9 | TASK_NAME="news2" 10 | MODEL_TYPE="bert" # bert, nezha, roformer 11 | CUDA_NUMBERS='1' # '0,1,2,3' 12 | SCHEDULER_TYPE='linear' # ["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"] 13 | 14 | python $CURRENT_DIR/relation_extraction/train_ee.py \ 15 | --model_type=$MODEL_TYPE \ 16 | --model_name_or_path=$BERT_BASE_DIR \ 17 | --data_format=$DATA_FORMAT \ 18 | --task_name=$TASK_NAME \ 19 | --do_train \ 20 | --do_eval \ 21 | --evaluate_during_training \ 22 | --do_eval_per_epoch \ 23 | --do_predict_tag \ 24 | --do_eval_per_epoch \ 25 | --use_lstm \ 26 | --data_dir=$DATA_DIR/${TASK_NAME}/ \ 27 | --train_max_seq_length=128 \ 28 | --eval_max_seq_length=128 \ 29 | --per_gpu_train_batch_size=32 \ 30 | --per_gpu_eval_batch_size=32 \ 31 | --learning_rate=3e-5 \ 32 | --crf_learning_rate=2e-3 \ 33 | --num_train_epochs=30.0 \ 34 | --fp16 \ 35 | --local_rank -1 \ 36 | --gradient_accumulation_steps=1 \ 37 | --logging_steps=500 \ 38 | --save_steps=500 \ 39 | --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ 40 | --logging_dir=$LOG_DIR/ \ 41 | --overwrite_output_dir \ 42 | --overwrite_cache \ 43 | --seed=2333 \ 44 | --cuda_number=$CUDA_NUMBERS \ 45 | --scheduler_type=$SCHEDULER_TYPE -------------------------------------------------------------------------------- /experiments/scripts/run_finetune_cdail_gpt.sh: 
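Before the next script listing, a note on the LoRA JSON files shown near the top of this dump: conf/lora_config_bloom.json and conf/lora_config_llama.json are the kind of file passed via --lora_config in finetune_qwen_7b_qlora.sh above (that script points at config/lora_config_llama.json, while the tree keeps them under conf/). How experiments/qa_and_text_generation/finetune_llm_chat.py actually consumes those keys is not shown here, so the key-to-argument mapping below is an assumption; only the peft API calls themselves are standard.

import json
from peft import LoraConfig, TaskType

# Hypothetical mapping of the repo's lora_config_*.json keys onto a peft LoraConfig.
with open("conf/lora_config_llama.json", encoding="utf-8") as f:
    cfg = json.load(f)

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=cfg["lora_r"],                            # LoRA rank (assumed mapping)
    lora_alpha=cfg["lora_alpha"],               # scaling factor
    lora_dropout=cfg["lora_dropout"],
    target_modules=cfg["lora_target_modules"],  # q_proj/k_proj/... for LLaMA-style models
    bias="none",
)
# A base causal LM would then be wrapped with peft.get_peft_model(base_model, lora_config).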
-------------------------------------------------------------------------------- 1 | PROJECT_PATH=/root/work2/work2/chenzhihao/NLP 2 | DATA_PATH=$PROJECT_PATH/datas/raw_datas/ 3 | OUTPUT_DIR=$PROJECT_PATH/datas/output_dir/CDail-GPT-QA 4 | MODEL_CHECKPOINT=$PROJECT_PATH/datas/output_dir/CDail-GPT-QA 5 | 6 | python $PROJECT_PATH/experiments/qa_and_text_generation/finetune_cdail_gpt.py \ 7 | --pretrained \ 8 | --model_checkpoint=$MODEL_CHECKPOINT \ 9 | --data_path=$DATA_PATH \ 10 | --output_dir=$OUTPUT_DIR \ 11 | --scheduler="linear" \ 12 | --n_epochs=30 \ 13 | --train_batch_size=12 \ 14 | --valid_batch_size=12 \ 15 | --lr=5e-5 \ 16 | --warmup_steps=5000 \ 17 | --valid_steps=5000 \ 18 | --gradient_accumulation_steps=64 \ 19 | --local_rank=-1 \ 20 | --fp16='01' \ 21 | --fp16_backend='amp' \ 22 | --device='cuda:0' \ 23 | --do_train \ 24 | --do_valid -------------------------------------------------------------------------------- /experiments/scripts/run_finetune_cdail_gpt_2.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/root/work2/work2/chenzhihao/NLP 2 | DATA_PATH=/root/work2/work2/chenzhihao/datasets/腾讯对话NaturalConv_Release_20210318 3 | OUTPUT_DIR=$PROJECT_PATH/datas/output_dir/CDail-GPT 4 | MODEL_CHECKPOINT=/root/work2/work2/chenzhihao/pretrained_models/CDial-GPT_LCCC-large 5 | 6 | python $PROJECT_PATH/experiments/qa_and_text_generation/finetune_cdail_gpt_2.py \ 7 | --pretrained \ 8 | --model_checkpoint=$MODEL_CHECKPOINT \ 9 | --data_path=$DATA_PATH \ 10 | --data_type="natural_conv" \ 11 | --output_dir=$OUTPUT_DIR \ 12 | --scheduler="linear" \ 13 | --n_epochs=30 \ 14 | --max_history=10 \ 15 | --do_train \ 16 | --do_valid \ 17 | --do_test \ 18 | --train_path="train.txt" \ 19 | --valid_path="dev.txt" \ 20 | --test_path="test.txt" \ 21 | --train_batch_size=16 \ 22 | --valid_batch_size=16 \ 23 | --lr=5e-5 \ 24 | --warmup_steps=5000 \ 25 | --valid_steps=500 \ 26 | --gradient_accumulation_steps=1 \ 27 | --local_rank=-1 \ 28 | --fp16='01' \ 29 | --fp16_backend='amp' \ 30 | --device='cuda:1' -------------------------------------------------------------------------------- /experiments/scripts/run_global_pointer_for_ner.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/data/chenzhihao/NLP 2 | export PYTHONPATH=$PYTHONPATH:$PROJECT_PATH/ 3 | CURRENT_DIR=$PROJECT_PATH/experiments 4 | DATA_DIR=$PROJECT_PATH/datas 5 | OUTPUR_DIR=$DATA_DIR/output_file_dir 6 | LOG_DIR=$DATA_DIR/logs 7 | BERT_BASE_DIR=/data/chenzhihao/chinese-roberta-ext 8 | TASK_NAME="cluener" 9 | MODEL_TYPE="bert" 10 | 11 | python $CURRENT_DIR/ner/train_globalpointer.py \ 12 | --model_type=$MODEL_TYPE \ 13 | --model_name_or_path=$BERT_BASE_DIR \ 14 | --task_name=$TASK_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict_no_tag \ 18 | --do_lower_case \ 19 | --data_dir=$DATA_DIR/${TASK_NAME}/ \ 20 | --max_seq_length=256 \ 21 | --sliding_len=100 \ 22 | --per_gpu_train_batch_size=64 \ 23 | --per_gpu_eval_batch_size=64 \ 24 | --learning_rate=2e-5 \ 25 | --num_train_epochs=100 \ 26 | --fp16 \ 27 | --fp16_backend=amp \ 28 | --warmup_ratio=0.1 \ 29 | --local_rank -1 \ 30 | --gradient_accumulation_steps=1 \ 31 | --logging_steps=500 \ 32 | --save_steps=500 \ 33 | --eval_steps=500 \ 34 | --save_total_limit=5 \ 35 | --output_dir=$OUTPUR_DIR/${TASK_NAME}/ \ 36 | --logging_dir=$LOG_DIR/ \ 37 | --overwrite_output_dir \ 38 | --overwrite_cache \ 39 | --seed=2333 \ 40 | --cuda_number=0 \ 41 | --dataloader_num_workers=2 \ 42 | 
--scheduler_type=linear \ 43 | --metric_for_best_model=f1 \ 44 | --greater_is_better \ 45 | --rope \ 46 | --reserve_p=0.3 \ 47 | --mode=ChildTuning-D -------------------------------------------------------------------------------- /experiments/scripts/run_ner_crf.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/data/chenzhihao/NLP 2 | export PYTHONPATH=$PYTHONPATH:$PROJECT_PATH/ 3 | CURRENT_DIR=$PROJECT_PATH/experiments 4 | DATA_DIR=$PROJECT_PATH/datas 5 | OUTPUR_DIR=$CURRENT_DIR/output_file_dir 6 | LOG_DIR=$CURRENT_DIR/logs 7 | BERT_BASE_DIR=/data/chenzhihao/chinese-roberta-ext 8 | TASK_NAME="cluener" 9 | MODEL_TYPE="bert" # bert,nezha,albert,roformer 10 | 11 | python /data/chenzhihao/NLP/experiments/ner/run_ner_crf.py \ 12 | --model_type=$MODEL_TYPE \ 13 | --model_name_or_path=$BERT_BASE_DIR \ 14 | --task_name=$TASK_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict_no_tag \ 18 | --do_adv \ 19 | --data_dir=$DATA_DIR/${TASK_NAME}/ \ 20 | --max_seq_length=256 \ 21 | --per_gpu_train_batch_size=32 \ 22 | --per_gpu_eval_batch_size=32 \ 23 | --learning_rate=3e-5 \ 24 | --crf_learning_rate=2e-3 \ 25 | --num_train_epochs=30.0 \ 26 | --fp16 \ 27 | --fp16_backend=amp \ 28 | --warmup_ratio=0.1 \ 29 | --local_rank -1 \ 30 | --gradient_accumulation_steps=1 \ 31 | --logging_steps=500 \ 32 | --save_steps=500 \ 33 | --eval_steps=1000 \ 34 | --output_dir=$OUTPUR_DIR/${TASK_NAME}/ \ 35 | --logging_dir=$LOG_DIR/ \ 36 | --overwrite_output_dir \ 37 | --overwrite_cache \ 38 | --seed=42 \ 39 | --cuda_number=0 \ 40 | --markup=bios \ 41 | --metric_for_best_model=f1 \ 42 | --greater_is_better -------------------------------------------------------------------------------- /experiments/scripts/run_ner_softmax.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/data/chenzhihao/NLP 2 | export PYTHONPATH=$PYTHONPATH:$PROJECT_PATH/ 3 | CURRENT_DIR=$PROJECT_PATH/experiments 4 | DATA_DIR=$PROJECT_PATH/datas 5 | OUTPUR_DIR=$CURRENT_DIR/output_file_dir 6 | LOG_DIR=$CURRENT_DIR/logs 7 | BERT_BASE_DIR=/data/chenzhihao/chinese-roberta-ext 8 | TASK_NAME="cluener" 9 | MODEL_TYPE="bert" # bert,nezha,albert 10 | 11 | python /data/chenzhihao/NLP/experiments/ner/run_ner_softmax.py \ 12 | --model_type=$MODEL_TYPE \ 13 | --model_name_or_path=$BERT_BASE_DIR \ 14 | --task_name=$TASK_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict_no_tag \ 18 | --data_dir=$DATA_DIR/${TASK_NAME}/ \ 19 | --max_seq_length=256 \ 20 | --per_gpu_train_batch_size=32 \ 21 | --per_gpu_eval_batch_size=32 \ 22 | --learning_rate=3e-5 \ 23 | --crf_learning_rate=2e-3 \ 24 | --num_train_epochs=30.0 \ 25 | --fp16 \ 26 | --fp16_backend=amp \ 27 | --warmup_ratio=0.1 \ 28 | --local_rank -1 \ 29 | --gradient_accumulation_steps=1 \ 30 | --logging_steps=500 \ 31 | --save_steps=500 \ 32 | --eval_steps=1000 \ 33 | --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ 34 | --logging_dir=$LOG_DIR/ \ 35 | --overwrite_output_dir \ 36 | --overwrite_cache \ 37 | --seed=42 \ 38 | --cuda_number=0 \ 39 | --markup=bios \ 40 | --metric_for_best_model=f1 \ 41 | --greater_is_better -------------------------------------------------------------------------------- /experiments/scripts/run_ner_span.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/data/chenzhihao/NLP 2 | export PYTHONPATH=$PYTHONPATH:$PROJECT_PATH/ 3 | CURRENT_DIR=$PROJECT_PATH/experiments 4 | DATA_DIR=$PROJECT_PATH/datas 5 | 
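# Output, log and pretrained-model locations below follow the same layout as the other run_ner_*.sh scripts;
# the span-based NER settings themselves are passed to experiments/ner/run_ner_span.py further down
# (--do_adv below presumably toggles adversarial training).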
OUTPUR_DIR=$CURRENT_DIR/output_file_dir 6 | LOG_DIR=$CURRENT_DIR/logs 7 | BERT_BASE_DIR=/data/chenzhihao/chinese-roberta-ext 8 | #BERT_BASE_DIR="hfl/chinese-roberta-wwm-ext" 9 | TASK_NAME="cluener" 10 | MODEL_TYPE="bert" # bert,nezha,albert 11 | 12 | python /data/chenzhihao/NLP/experiments/ner/run_ner_span.py \ 13 | --model_type=$MODEL_TYPE \ 14 | --model_name_or_path=$BERT_BASE_DIR \ 15 | --task_name=$TASK_NAME \ 16 | --do_train \ 17 | --do_eval \ 18 | --do_predict_no_tag \ 19 | --do_adv \ 20 | --data_dir=$DATA_DIR/${TASK_NAME}/ \ 21 | --max_seq_length=256 \ 22 | --per_gpu_train_batch_size=32 \ 23 | --per_gpu_eval_batch_size=32 \ 24 | --learning_rate=3e-5 \ 25 | --crf_learning_rate=2e-3 \ 26 | --num_train_epochs=30.0 \ 27 | --fp16 \ 28 | --fp16_backend=amp \ 29 | --warmup_ratio=0.1 \ 30 | --local_rank -1 \ 31 | --gradient_accumulation_steps=1 \ 32 | --logging_steps=500 \ 33 | --save_steps=500 \ 34 | --eval_steps=1000 \ 35 | --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ 36 | --logging_dir=$LOG_DIR/ \ 37 | --overwrite_output_dir \ 38 | --overwrite_cache \ 39 | --seed=42 \ 40 | --cuda_number=0 \ 41 | --markup=bios \ 42 | --metric_for_best_model=f1 \ 43 | --greater_is_better -------------------------------------------------------------------------------- /experiments/scripts/run_pair_sup_con.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/root/work2/work2/chenzhihao/NLP 2 | DATA_PATH=/root/work2/work2/chenzhihao/datasets/chinese-semantics-match-dataset/ 3 | OUTPUT_DIR=$PROJECT_PATH/experiments/output_file_dir/semantic_match 4 | MODEL_PATH="/root/work2/work2/chenzhihao/pretrained_models/chinese-roberta-wwm-ext" 5 | 6 | PROJECT_NAME='semantic_match' 7 | EXPERIMENT_NAME='atec-pairsupcon-roberta-wwm-ext' 8 | GROUP_NAME='nlp' 9 | MODEL_TYPE='roberta-wwm-ext' 10 | DATA_TYPE='ATEC' 11 | 12 | 13 | python $PROJECT_PATH/experiments/sentence_embedding/run_pair_sup_con.py \ 14 | --model_type=$MODEL_TYPE \ 15 | --model_name_or_path=$MODEL_PATH \ 16 | --output_dir=$OUTPUT_DIR \ 17 | --project_name=$PROJECT_NAME \ 18 | --group_name=$GROUP_NAME \ 19 | --experiment_name=$EXPERIMENT_NAME \ 20 | --data_dir=$DATA_PATH \ 21 | --data_type=$DATA_TYPE \ 22 | --do_train \ 23 | --do_valid \ 24 | --do_test \ 25 | --train_dataset='train.txt' \ 26 | --valid_dataset='dev.txt' \ 27 | --test_dataset='test.txt' \ 28 | --max_seq_length=128 \ 29 | --lr_rate=5e-05 \ 30 | --lr_scale 100 \ 31 | --gradient_accumulation_steps=1 \ 32 | --scheduler_type='linear' \ 33 | --train_batch_size=64 \ 34 | --valid_batch_size=64 \ 35 | --num_train_epochs=200 \ 36 | --gpuid=0 \ 37 | --seed=2333 \ 38 | --num_worker=0 \ 39 | --num_labels=2 \ 40 | --temperature=0.05 \ 41 | --task_type='pairsupcon' \ 42 | --contrast_type="HardNeg" \ 43 | --beta=1 \ 44 | --fp16 45 | -------------------------------------------------------------------------------- /experiments/scripts/run_sbert_training_dynamics.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/root/work2/work2/chenzhihao/NLP 2 | export PYTHONPATH=$PYTHONPATH:$PROJECT_PATH/ 3 | CURRENT_DIR=$PROJECT_PATH/experiments 4 | DATA_DIR=/root/work2/work2/chenzhihao/datasets/chinese-semantics-match-dataset/ 5 | OUTPUT_DIR=$CURRENT_DIR/output_file_dir/semantic_match 6 | LOG_PATH=$OUTPUT_DIR/dy_logs/ 7 | 8 | MODLE_TYPE='roberta-wwm' 9 | MODLE_NAME_OR_PATH=/root/work2/work2/chenzhihao/pretrained_models/chinese-roberta-wwm-ext 10 | DATA_TYPE='BQ' 11 | OBJECT_TYPE='classification' 12 | TASK_TYPE='match' 13 | 
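# $LOG_PATH is where the per-example training-dynamics records are expected to land;
# single_test/data_selection_for_training_dynamics.py later reads them from a
# dy_logs/<DATA_TYPE>/<MODLE_TYPE>/ folder to carve the data into easy/hard/ambiguous subsets.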
POOLING_STRATEGY='first-last-avg' 14 | PROJECT_NAME='sup-sbert' 15 | EXPERIMENT_NAME='sbert-training-dynamics' 16 | GROUP_NAME='semantic_match' 17 | 18 | python $CURRENT_DIR/sentence_embedding/train_sentence_bert_training_dynamics.py \ 19 | --model_type $MODLE_TYPE \ 20 | --model_name_or_path $MODLE_NAME_OR_PATH \ 21 | --data_dir $DATA_DIR \ 22 | --output_dir $OUTPUT_DIR \ 23 | --dy_log_path $LOG_PATH \ 24 | --data_type $DATA_TYPE \ 25 | --task_type $TASK_TYPE \ 26 | --object_type $OBJECT_TYPE \ 27 | --pooling_strategy $POOLING_STRATEGY \ 28 | --project_name $PROJECT_NAME \ 29 | --experiment_name $EXPERIMENT_NAME \ 30 | --group_name $GROUP_NAME \ 31 | --do_train \ 32 | --do_valid \ 33 | --do_test \ 34 | --do_recording \ 35 | --max_seq_length=128 \ 36 | --num_train_epochs=32 \ 37 | --valid_batch_size=32 \ 38 | --test_batch_size=32 \ 39 | --valid_steps=500 \ 40 | --num_labels=2 \ 41 | --lr_rate=2e-5 \ 42 | --gradient_accumulation_steps=1 \ 43 | --scheduler_type='linear' \ 44 | --num_workers=0 \ 45 | --cuda_number=7 46 | -------------------------------------------------------------------------------- /experiments/scripts/run_spn4re.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/data/chenzhihao/NLP 2 | export PYTHONPATH=$PYTHONPATH:$PROJECT_PATH/ 3 | CURRENT_DIR=$PROJECT_PATH/experiments 4 | DATA_DIR=$PROJECT_PATH/datas 5 | OUTPUR_DIR=$CURRENT_DIR/output_file_dir 6 | LOG_DIR=$CURRENT_DIR/logs 7 | BERT_BASE_DIR=/data/chenzhihao/chinese-roberta-ext 8 | TASK_NAME="spnre_business_chance" 9 | MODEL_TYPE="bert" # bert,nezha,albert,roformer 10 | 11 | python /data/chenzhihao/NLP/experiments/relation_extraction/train_spn4re.py \ 12 | --model_type=$MODEL_TYPE \ 13 | --model_name_or_path=$BERT_BASE_DIR \ 14 | --task_name=$TASK_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict_no_tag \ 18 | --data_dir=$DATA_DIR/${TASK_NAME}/ \ 19 | --max_seq_length=512 \ 20 | --per_gpu_train_batch_size=16 \ 21 | --per_gpu_eval_batch_size=16 \ 22 | --learning_rate=3e-5 \ 23 | --crf_learning_rate=2e-3 \ 24 | --num_train_epochs=100.0 \ 25 | --fp16 \ 26 | --fp16_backend=amp \ 27 | --warmup_ratio=0.1 \ 28 | --local_rank -1 \ 29 | --gradient_accumulation_steps=1 \ 30 | --logging_steps=500 \ 31 | --save_steps=500 \ 32 | --eval_steps=500 \ 33 | --output_dir=$OUTPUR_DIR/${TASK_NAME}/ \ 34 | --logging_dir=$LOG_DIR/ \ 35 | --overwrite_output_dir \ 36 | --overwrite_cache \ 37 | --seed=2333 \ 38 | --cuda_number=0 \ 39 | --markup=bios \ 40 | --metric_for_best_model=f1 \ 41 | --greater_is_better \ 42 | --sliding_len=20 \ 43 | --relation_labels="BUSEXP,ORGFIN,PERUP,STRCOO" \ 44 | --num_generated_tuples=10 \ 45 | --num_entities_in_tuple=8 \ 46 | --allow_null_entities_in_tuple="0,0,1,1,1,1,1,1" \ 47 | --entity_loss_weight="2,2,2,2,2,2,2,2" 48 | -------------------------------------------------------------------------------- /experiments/scripts/run_unsup_vascl.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/root/work2/work2/chenzhihao/NLP 2 | DATA_PATH=/root/work2/work2/chenzhihao/datasets/chinese-semantics-match-dataset/ 3 | OUTPUT_DIR=$PROJECT_PATH/experiments/output_file_dir/semantic_match 4 | MODEL_PATH="/root/work2/work2/chenzhihao/pretrained_models/chinese-roberta-wwm-ext" 5 | 6 | PROJECT_NAME='semantic_match' 7 | EXPERIMENT_NAME='sts-b-unsup_vascl-roberta-wwm-ext' 8 | GROUP_NAME='nlp' 9 | MODEL_TYPE='roberta-wwm-ext' 10 | DATA_TYPE='STS-B' 11 | 12 | python 
$PROJECT_PATH/experiments/sentence_embedding/run_unsup_vascl.py \ 13 | --model_type=$MODEL_TYPE \ 14 | --model_name_or_path=$MODEL_PATH \ 15 | --output_dir=$OUTPUT_DIR \ 16 | --project_name=$PROJECT_NAME \ 17 | --group_name=$GROUP_NAME \ 18 | --experiment_name=$EXPERIMENT_NAME \ 19 | --data_dir=$DATA_PATH \ 20 | --data_type=$DATA_TYPE \ 21 | --do_train \ 22 | --do_valid \ 23 | --do_test \ 24 | --max_seq_length=64 \ 25 | --lr_rate=2e-5 \ 26 | --lr_scale=1000 \ 27 | --gradient_accumulation_steps=1 \ 28 | --scheduler_type='linear' \ 29 | --train_batch_size=256 \ 30 | --valid_batch_size=128 \ 31 | --num_train_epochs=100 \ 32 | --gpuid=2 \ 33 | --seed=2333 \ 34 | --num_worker=0 \ 35 | --temperature=0.05 \ 36 | --topk=16 \ 37 | --eps=15 38 | -------------------------------------------------------------------------------- /experiments/single_test/argument_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: argument_test 6 | Author: czh 7 | Create Date: 2021/11/11 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | from dataclasses import dataclass, field 13 | from transformers import HfArgumentParser 14 | 15 | from nlp.arguments import TrainingArguments, ModelArguments, DataArguments 16 | 17 | 18 | @dataclass 19 | class MyArgument: 20 | early_stop: bool = field(default=False) 21 | patience: int = field(default=5, metadata={"help": "早停的轮数"}) 22 | 23 | 24 | parser = HfArgumentParser((DataArguments, ModelArguments, TrainingArguments, MyArgument)) 25 | parser.print_help() 26 | args = parser.parse_args() 27 | # print(args.patience) 28 | args.device = 'cpu' 29 | print(args.device) 30 | -------------------------------------------------------------------------------- /experiments/single_test/bart_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: bart_test 6 | Author: czh 7 | Create Date: 2022/1/25 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | import torch 13 | from transformers import MBart50TokenizerFast,MBartForConditionalGeneration 14 | 15 | device = torch.device('cpu') 16 | long_texts = """《"十四五"现代综合交通运输体系发展规划》(以下简称《规划》)日前向社会公布。《规划》明确,到2025年,综合交通运输基本实现一体化融合发展,智能化、绿色化取得实质性突破,综合能力、服务品质、运行效率和整体效益显著提升,交通运输发展向世界一流水平迈进。 17 | 展望2035年,"全国123出行交通圈"(都市区1小时通勤、城市群2小时通达、全国主要城市3小时覆盖)和"全球123快货物流圈"(快货国内1天送达、周边国家2天送达、全球主要城市3天送达)基本形成,基本建成交通强国。 18 | 交通建设更提升 19 | "十四五"时期,将新增铁路营业里程1.9万公里、公路通车里程30.2万公里,民用运输机场将超270个 20 | 交通运输是国民经济中的基础性、先导性、战略性产业,是重要的服务性行业和现代化经济体系的重要组成部分,是构建新发展格局的重要支撑和服务人民美好生活、促进共同富裕的坚实保障。 21 | "十三五"时期,我国综合交通网络总里程突破600万公里,"十纵十横"综合运输大通道基本贯通,高速铁路运营里程翻一番,高速公路对20万人口以上城市覆盖率超过98%,民用运输机场覆盖92%左右的地级市,超大特大城市轨道交通加快成网……综合交通运输体系建设取得了历史性成就。 22 | "‘十四五’时期是加快推进交通强国建设、构建现代综合交通运输体系的关键五年。"国家发展改革委基础设施发展司司长罗国三说。 23 | 《规划》明确,"十四五"时期,我国将新增铁路营业里程1.9万公里、公路通车里程30.2万公里、内河高等级航道里程2400公里,民用运输机场达到270个以上,城市轨道交通运营里程达到1万公里左右,高速铁路网对50万人口以上城市覆盖率达到95%以上,普速铁路瓶颈路段基本消除,"71118"国家高速公路主线基本贯通,现代化机场体系基本形成,综合立体交通网的规模、能力、质量和运行效率持续提升。 24 | "现代化高质量综合立体交通网是经济社会发展的基础支撑。《规划》提出,要构建完善以‘十纵十横’综合运输大通道为骨干,以综合交通枢纽为支点,以快速网、干线网、基础网多层次网络为依托的综合交通网络,勾画好美丽中国‘交通工笔画’。"罗国三说。 25 | 为完善综合运输大通道,《规划》提出,打造沿海、沿江、沿边、出疆、入藏和西部陆海新通道等6条战略骨干通道,建设多层级一体化综合交通枢纽。 26 | 百姓出行更便利 27 | 着力填补西部铁路"留白",推动西部地区普通国道二级及以上公路比重达70% 28 | 
人民交通为人民。"十四五"时期,百姓出行有望享受更多新便利。 29 | 均衡性提升。根据《规划》,将推动西部地区普通国道二级及以上公路比重达70%,推动较大人口规模自然村(组)通硬化路比例超过85%;加快城乡客运一体化发展,持续巩固拓展具备条件的乡镇和建制村通客车成果;进一步改善轮渡通行条件,方便边远地区群众日常出行;提升农村物流服务水平,到2025年,在全国推广100个左右农村物流服务品牌。 30 | 西部铁路"留白"也将被着力填补。国铁集团发展改革部副主任赵长江介绍,"十四五"期间,将统筹推进中西部地区铁路网建设,加强边疆地区铁路网建设,提高革命老区、民族地区和欠发达地区铁路网络密度。将着力构建多向入藏通道、完善出疆对外通道,加快推动新藏铁路前期工作,创造条件启动重点路段建设。推进青藏铁路升级改造,开展滇藏铁路重点路段前期研究。推动疆内铁路环起来,进出疆铁路畅起来。提高重点边境地区铁路覆盖,提升东北边境地区路网质量,完善西南边境地区路网布局。 31 | 便利性提升。《规划》提出,要完善枢纽集疏运体系,到2025年,实现沿海港口重要港区铁路进港率超过70%,枢纽机场轨道交通接入率达80%。加快发展旅客联程运输,将在50个城市组织开展旅客联程运输试点。普及道路客运电子客票应用,努力推进一站购票、一票通行。 32 | "预计2025年底,全国铁路营业里程将达16.5万公里左右,其中高速铁路(含部分城际铁路)5万公里左右、覆盖95%以上的50万人口以上城市,基本形成‘全国123高铁出行圈’,更好满足人们美好出行需要。"赵长江说。 33 | 经济性提升。《规划》指出,要持续优化运输结构,提高综合交通运输网络效率。规范交通运输新业态、新模式价格管理,健全巡游出租汽车价格形成机制,深化道路客运价格市场化改革。落实物流减税降费措施。 34 | 多样性提升。《规划》要求,引导和规范网约车、共享单车、汽车分时租赁健康发展。加快运输旅游融合发展,鼓励道路客运站拓展旅游集散服务功能。""" 35 | 36 | tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt") 37 | model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt") 38 | model.to(device) 39 | 40 | article_input_ids = tokenizer.batch_encode_plus([long_texts], return_tensors='pt', 41 | max_length=1024)['input_ids'].to(device) 42 | summary_ids = model.generate(article_input_ids, num_beams=4, length_penalty=2.0, max_length=142, min_length=56, 43 | no_repeat_ngram_size=3) 44 | 45 | summary_txt = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True) 46 | print(summary_txt) 47 | -------------------------------------------------------------------------------- /experiments/single_test/bert_crop_model_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: bert_crop_model_test 6 | Author: czh 7 | Create Date: 2021/8/10 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | from transformers import BertConfig, BertPreTrainedModel, BertModel 13 | from nlp.models.bertcrop import BertCropModel, save_specify_num_hidden_layers_state 14 | 15 | 16 | bert_model_path = "/Users/czh/Downloads/chinese_bert_wwm" 17 | bert_config = BertConfig.from_json_file(bert_model_path+'/config.json') 18 | bert_config.num_hidden_layers = 1 19 | 20 | # state_dict = torch.load(bert_model_path+'/pytorch_model.bin') 21 | # bert_model.init_from_pretrained(state_dict) 22 | 23 | 24 | class MyModel(BertPreTrainedModel): 25 | def __init__(self, config, pretrained_bert_path): 26 | super(MyModel, self).__init__(config) 27 | 28 | self.bert = BertCropModel(config) 29 | # state_dict = torch.load(pretrained_bert_path+'/pytorch_model.bin') 30 | # init_from_pretrained(self.bert, state_dict, True) 31 | 32 | 33 | bert_model_ = BertModel.from_pretrained(bert_model_path) 34 | save_specify_num_hidden_layers_state(bert_model_, [1], "./pytorch_model_0_layer.bin") 35 | 36 | bert_model = MyModel.from_pretrained("./pytorch_model_0_layer.bin", config=bert_config, pretrained_bert_path=bert_model_path) 37 | 38 | for n, p in bert_model.named_parameters(): 39 | print(n) 40 | -------------------------------------------------------------------------------- /experiments/single_test/chatglm_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | -------------------------------- 5 | Author:czh 6 | date:2023/3/24 7 | -------------------------------- 8 | """ 9 | from 
transformers import AutoModel, AutoTokenizer 10 | 11 | device = "cuda:3" 12 | model_path = "/root/work2/work2/chenzhihao/pretrained_models/chatglm-6b" 13 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, revision=True) 14 | # model = AutoModel.from_pretrained(model_path, trust_remote_code=True, revision=True).half().to('mps') 15 | model = AutoModel.from_pretrained(model_path, trust_remote_code=True, revision=True).half().to(device) 16 | model = model.eval() 17 | 18 | history = [] 19 | while True: 20 | query = input("\nuser(q to stop): ") 21 | if query.strip() == 'q': 22 | break 23 | 24 | response, history = model.chat(tokenizer, query.strip(), history=history) 25 | print("\nresponse: ", response) 26 | -------------------------------------------------------------------------------- /experiments/single_test/data_selection_for_training_dynamics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: zhihao.chen@kuwo.cn 6 | @date: 2022/9/21 19:22 7 | """ 8 | # 根据metric来选择不同的子训练集 9 | # 参考自:https://github.com/beyondguo/TrainingDynamics/blob/master/data_selection.py 10 | # Only applied to training set 11 | # python data_selection.py --task_name qnli --model_name bert-base-cased --proportion 0.5 --burn_out 4 12 | import json 13 | import random 14 | 15 | random.seed(1) 16 | import argparse 17 | 18 | from train_dynamics_filtering import read_training_dynamics, compute_train_dy_metrics 19 | 20 | 21 | class Config: 22 | task_name = 'BQ' 23 | model_name = "roberta-wwm" 24 | proportion = 0.33 25 | burn_out = 5 26 | 27 | 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument("--task_name", type=str) 30 | parser.add_argument("--model_name", type=str) 31 | parser.add_argument("--proportion", type=float, default=0.33) 32 | parser.add_argument("--burn_out", type=int) 33 | # args = parser.parse_args() 34 | args = Config() 35 | 36 | TASK_NAME = args.task_name 37 | MODEL = args.model_name 38 | PROPORTION = args.proportion 39 | LOG_PATH = '/root/work2/work2/chenzhihao/NLP/output_file_dir/semantic_match' 40 | 41 | # 读取并合并到一个文件 42 | td = read_training_dynamics(LOG_PATH + f'dy_logs/{TASK_NAME}/{MODEL}/') 43 | # 计算 metrics,转化成一个 dataframe 44 | td_df, _ = compute_train_dy_metrics(td, burn_out=args.burn_out) 45 | 46 | 47 | def consider_ascending_order(filtering_metric: str) -> bool: 48 | """ 49 | Determine if the metric values' sorting order to get the most `valuable` examples for training. 
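    True means sort ascending, i.e. smaller values mark the more `valuable` examples
    (e.g. low confidence -> hard-to-learn); False keeps the largest values first
    (e.g. high variability -> ambiguous).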
50 | """ 51 | if filtering_metric == "variability": 52 | return False 53 | elif filtering_metric == "confidence": 54 | return True 55 | elif filtering_metric == "threshold_closeness": 56 | return False 57 | elif filtering_metric == "forgetfulness": 58 | return False 59 | elif filtering_metric == "correctness": 60 | return True 61 | else: 62 | raise NotImplementedError(f"Filtering based on {filtering_metric} not implemented!") 63 | 64 | 65 | def data_selection(metric, select_worst, proportion, shuffle=True): 66 | ascending = consider_ascending_order(metric) 67 | if select_worst: 68 | ascending = not consider_ascending_order(metric) 69 | sorted_df = td_df.sort_values(by=metric, ascending=ascending) 70 | selected_df = sorted_df.head(n=int(proportion * len(sorted_df))) 71 | indices = list(selected_df['guid']) 72 | if shuffle: 73 | random.shuffle(indices) 74 | return {'indices': indices, 'df': selected_df} 75 | 76 | 77 | """ 78 | 选择hard-to-learn的数据,设置METRIC = 'confidence' 79 | 选择easy-to-learn的数据,设置METRIC = 'confidence', SELECT_WORST = True 80 | 选择ambiguoug的数据,设置METRIC = 'variability' 81 | """ 82 | 83 | three_regions_data_indices = {'hard': data_selection('confidence', False, PROPORTION)['indices'], 84 | 'easy': data_selection('confidence', True, PROPORTION)['indices'], 85 | 'ambiguous': data_selection('variability', False, PROPORTION)['indices']} 86 | 87 | with open(LOG_PATH + f'dy_logs/{TASK_NAME}/{MODEL}/three_regions_data_indices.json', 'w', encoding='utf8') as f: 88 | f.write(json.dumps(three_regions_data_indices, ensure_ascii=False)) 89 | 90 | # 然后可以直接跑glue任务,在选择训练集的时候,使用select函数来指定对应样本即可: 91 | """ e.g. 92 | from datasets import load_dataset 93 | raw_datasets = load_dataset('glue','sst2') 94 | easy_train_set = raw_datasets['train'].select(three_regions_data_indices['easy']) 95 | """ 96 | -------------------------------------------------------------------------------- /experiments/single_test/imagen_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: zhihao.chen@kuwo.cn 6 | @date: 2022/6/17 14:53 7 | """ 8 | import torch 9 | from imagen_pytorch import Unet, Imagen 10 | 11 | device = torch.device('cpu') 12 | # unet for imagen 13 | 14 | unet1 = Unet( 15 | dim=32, 16 | cond_dim=512, 17 | dim_mults=(1, 2, 4, 8), 18 | num_resnet_blocks=3, 19 | layer_attns=(False, True, True, True), 20 | layer_cross_attns=(False, True, True, True) 21 | ) 22 | 23 | unet2 = Unet( 24 | dim=32, 25 | cond_dim=512, 26 | dim_mults=(1, 2, 4, 8), 27 | num_resnet_blocks=(2, 4, 8, 8), 28 | layer_attns=(False, False, False, True), 29 | layer_cross_attns=(False, False, False, True) 30 | ) 31 | 32 | # imagen, which contains the unets above (base unet and super resoluting ones) 33 | 34 | imagen = Imagen( 35 | unets=(unet1, unet2), 36 | image_sizes=(64, 256), 37 | timesteps=1000, 38 | cond_drop_prob=0.1 39 | ).to(device) 40 | 41 | # mock images (get a lot of this) and text encodings from large T5 42 | 43 | text_embeds = torch.randn(4, 256, 768).to(device) 44 | text_masks = torch.ones(4, 256).bool().to(device) 45 | images = torch.randn(4, 3, 256, 256).to(device) 46 | 47 | # feed images into imagen, training each unet in the cascade 48 | 49 | for i in (1, 2): 50 | loss = imagen(images, text_embeds=text_embeds, text_masks=text_masks, unet_number=i) 51 | loss.backward() 52 | 53 | # do the above for many many many many steps 54 | # now you can sample an image based on the text embeddings from the cascading 
ddpm 55 | 56 | images = imagen.sample(texts=[ 57 | 'a whale breaching from afar', 58 | 'young girl blowing out candles on her birthday cake', 59 | 'fireworks with blue and green sparkles' 60 | ], cond_scale=3.) 61 | 62 | print(images.shape) # (3, 3, 256, 256) 63 | -------------------------------------------------------------------------------- /experiments/single_test/inference_liadrinz_unilm_for_seq2seq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: 6 | @date: 2023/2/7 18:29 7 | """ 8 | import os 9 | import sys 10 | 11 | import torch 12 | dirname = os.path.dirname(os.path.abspath(__file__)) 13 | print(dirname) 14 | sys.path.append(os.path.join('/'.join(dirname.split('/')[:-2]))) 15 | from nlp.models.unilm_model_liadrinz import UniLMForConditionalGeneration 16 | from nlp.processors.unilm_liadrinz_processor import DataCollatorForUniLMSeq2Seq, CorpusDataset 17 | from nlp.utils.tokenization_unilm import UniLMTokenizerLiadrinz 18 | 19 | project_path = "/root/work2/work2/chenzhihao/NLP/" 20 | model_name_or_path = project_path + "datas/output_dir/unilm/liadrinz_unilm/seq2seq_on_natural_conv/checkpoint-1500" 21 | device = "cuda:7" 22 | TOP_K = 0 23 | TOP_P = 0.9 24 | TEMPERATURE = 0.7 25 | DO_SAMPLE = True 26 | OUTPUT_MAX_LENGTH = 32 27 | OUTPUT_MIN_LENGTH = 1 28 | PREFIX = "用户:" 29 | POSTFIX = " 机器人:" 30 | 31 | 32 | def interact(tokenizer: UniLMTokenizerLiadrinz, model: UniLMForConditionalGeneration): 33 | history = "" 34 | while True: 35 | raw_text = input("\n输入:") 36 | while not raw_text: 37 | print('Prompt should not be empty!') 38 | raw_text = input("\n输入:") 39 | raw_text = raw_text.strip() 40 | if raw_text == "stop": 41 | break 42 | history += PREFIX + raw_text + "。" + POSTFIX 43 | inputs = tokenizer(history, return_tensors='pt') 44 | for k in inputs: 45 | inputs[k] = inputs[k].to(device) 46 | with torch.no_grad(): 47 | if DO_SAMPLE: 48 | output_ids = model.generate(**inputs, 49 | max_new_tokens=OUTPUT_MAX_LENGTH, 50 | min_length=OUTPUT_MIN_LENGTH, 51 | top_k=TOP_K, 52 | top_p=TOP_P, 53 | temperature=TEMPERATURE, 54 | do_sample=True, 55 | no_repeat_ngram_size=3) 56 | else: 57 | output_ids = model.generate(**inputs, max_new_tokens=OUTPUT_MAX_LENGTH, num_beams=1, length_penalty=0.6) 58 | output_text = tokenizer.decode(output_ids[0]) 59 | result = output_text.split("[SEP]")[1].strip() 60 | print(result) 61 | result = "".join(result.split()) 62 | result = result.split(PREFIX.replace(":", ":"))[0] 63 | print("\n回复:", result) 64 | history += result 65 | print(history) 66 | if len(history) > 512: 67 | history = "" 68 | 69 | 70 | def main(): 71 | tokenizer = UniLMTokenizerLiadrinz.from_pretrained(model_name_or_path) 72 | model = UniLMForConditionalGeneration.from_pretrained(model_name_or_path) 73 | model.to(device) 74 | 75 | interact(tokenizer, model) 76 | 77 | 78 | if __name__ == "__main__": 79 | main() 80 | -------------------------------------------------------------------------------- /experiments/single_test/paddlenlp_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: zhihao.chen@kuwo.cn 6 | @date: 2022/6/7 19:27 7 | """ 8 | from pprint import pprint 9 | from paddlenlp import Taskflow 10 | 11 | schema = ['时间', '选手', '赛事名称'] # Define the schema for entity extraction 12 | ie = Taskflow('information_extraction', schema=schema) 13 | 
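# The Taskflow extractor returns one dict per input text, keyed by the schema labels; with this
# schema the output is expected to look roughly like (field names may vary across paddlenlp versions):
#   [{'时间': [{'text': '2月8日上午', 'start': ..., 'end': ..., 'probability': ...}],
#     '选手': [...], '赛事名称': [...]}]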
pprint(ie("2月8日上午北京冬奥会自由式滑雪女子大跳台决赛中中国选手谷爱凌以188.25分获得金牌!")) 14 | -------------------------------------------------------------------------------- /experiments/single_test/position_embedding_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: position_embedding_test 6 | Author: czh 7 | Create Date: 2021/8/6 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | from transformers import BertTokenizerFast 13 | from nlp.models.model_util import _generate_relative_positions_embeddings 14 | 15 | bert_model_name_or_path = "/data/chenzhihao/chinese-roberta-ext" 16 | tokenizer = BertTokenizerFast.from_pretrained(bert_model_name_or_path) 17 | text = ["俄罗斯卫星网刚刚消息称,美军在喀布尔机场向阿富汗平民开火,已致数人死亡。"] 18 | ids = tokenizer.batch_encode_plus(text, return_tensors='pt', max_length=128, padding="max_length") 19 | # print(ids) 20 | input_ids = ids["input_ids"] 21 | token_type_ids = ids["token_type_ids"] 22 | print(token_type_ids) 23 | 24 | # embedding = PositionEmbedding(128, 768, merge_mode='zero', hierarchical=True, embeddings_initializer='xavier_uniform') 25 | # embedding = SinusoidalPositionEmbedding(output_dim=768, merge_mode='zero') 26 | # embedding = RoFormerSinusoidalPositionalEmbedding(128, 768) 27 | # embedding = RelativePositionEmbedding(128*2+1, 768) 28 | # embedding = RelativePositionEmbeddingT5(input_dim=128*2+1, output_dim=768) 29 | # pos = embedding(input_ids) 30 | 31 | # lm = LM_Mask() 32 | # pos = lm.lm_mask(64, 64) 33 | 34 | # ulm = UniLM_Mask() 35 | # pos = ulm.unilm_mask(token_type_ids, 128-token_type_ids.size(1)) 36 | 37 | pos = _generate_relative_positions_embeddings(seq_length=128, embed_dim=64, max_relative_position=128) 38 | print(pos) 39 | print(pos.size()) 40 | 41 | -------------------------------------------------------------------------------- /experiments/single_test/roformer_sim_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: 6 | @date: 2022/11/18 16:32 7 | """ 8 | # https://github.com/JunnYu/RoFormer_pytorch 9 | import torch 10 | import numpy as np 11 | from roformer import RoFormerForCausalLM, RoFormerConfig 12 | from transformers import BertTokenizer 13 | 14 | 15 | device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') 16 | # 可选以下几个。 17 | # junnyu/roformer_chinese_sim_char_small, junnyu/roformer_chinese_sim_char_base 18 | # junnyu/roformer_chinese_sim_char_ft_small, roformer_chinese_sim_char_ft_base 19 | pretrained_model = "junnyu/roformer_chinese_sim_char_base" 20 | tokenizer = BertTokenizer.from_pretrained(pretrained_model) 21 | config = RoFormerConfig.from_pretrained(pretrained_model) 22 | config.is_decoder = True 23 | config.eos_token_id = tokenizer.sep_token_id 24 | config.pooler_activation = "linear" 25 | model = RoFormerForCausalLM.from_pretrained(pretrained_model, config=config) 26 | model.to(device) 27 | model.eval() 28 | 29 | 30 | def gen_synonyms(text, n=100, k=20): 31 | """ 32 | 含义: 产生sent的n个相似句,然后返回最相似的k个。 33 | 做法:用seq2seq生成,并用encoder算相似度并排序。 34 | """ 35 | # 寻找所有相似的句子 36 | r = [] 37 | inputs1 = tokenizer(text, return_tensors="pt") 38 | for _ in range(n): 39 | inputs1.to(device) 40 | output = tokenizer.batch_decode(model.generate(**inputs1, 41 | top_p=0.95, 42 | do_sample=True, 43 | max_length=128), 44 | 
skip_special_tokens=True)[0].replace(" ", "").replace(text, 45 | "") # 去除空格,去除原始text文本。 46 | r.append(output) 47 | 48 | # 对相似的句子进行排序 49 | r = [i for i in set(r) if i != text and len(i) > 0] 50 | r = [text] + r 51 | inputs2 = tokenizer(r, padding=True, return_tensors="pt") 52 | with torch.no_grad(): 53 | inputs2.to(device) 54 | outputs = model(**inputs2) 55 | Z = outputs.pooler_output.cpu().numpy() 56 | Z /= (Z ** 2).sum(axis=1, keepdims=True) ** 0.5 57 | argsort = np.dot(Z[1:], -Z[0]).argsort() 58 | 59 | return [r[i + 1] for i in argsort[:k]] 60 | 61 | 62 | out = gen_synonyms("已经扫码支付,会员季度费18元") 63 | print(out) 64 | -------------------------------------------------------------------------------- /experiments/single_test/roformer_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: roformer_test 6 | Author: czh 7 | Create Date: 2021/9/3 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | import torch 13 | from transformers import RoFormerModel, RoFormerForMaskedLM, RoFormerTokenizer 14 | 15 | 16 | text = "时光流逝,但有些日子注定被永久铭记。在抗战胜利76周年纪念日到来之际,各地举办了形式多样的活动,让人们在回望历史中收获心灵的洗礼、得到思想的升华。河南南阳市当地媒体推出“纪念抗战胜利76周年 山河同在”系列报道,与读者一起回望气壮山河的抗日史诗,凝聚兴我中华的磅礴伟力;甘肃永昌县中小学的“开学第一课”以牢记历史为切入点,通过观看抗战专题视频、抗战知识问答等形式,回顾中国人民艰苦抗战的峥嵘岁月;广西桂林市举办纪念抗战胜利76周年文艺演出活动,百余位党员群众追忆革命先辈的艰辛历程,讴歌永远跟党走的坚定誓言。历史是最好的教科书,也是最好的清醒剂。从1931年日本军国主义的铁蹄蹂躏中国东北的白山黑水,到1945年9月2日,日本代表在无条件投降书上签字,十四年抗战的血与火背后,是3500多万同胞伤亡,930余座城市先后被占,4200万难民无家可归。于民族危难之际,中国共产党支撑起救亡图存的希望。从打破“日军不可战胜”神话的平型关大捷,到粉碎侵略者“囚笼政策”的百团大战;从让日军“名将之花”凋谢在太行山上的黄土岭战斗,到打响华中反攻第一枪的车桥战役,——在中国共产党的领导下,无数不甘屈辱的中华儿女前赴后继,以血肉之躯筑起新的长城,赢得了自1840年鸦片战争以来抗击外敌入侵的第一次完全胜利!为争取世界和平的伟大事业,作出了永载史册的重大贡献!战争硝烟早已散去,苦难岁月还需铭记,并非是要背着包袱前行,而是只有牢记来时的路,才能走向更远的前方。正如联合国的呼吁:“我们有责任见证苦难永远不再重演,受难者的记忆被永久尊重。”我们永远不会忘记,“名将以身殉国家,愿拼热血卫吾华”的左权,“未惜头颅新故国,甘将热血沃中华”的赵一曼,弹尽后毅然投江的八名抗联女兵,打完最后一粒子弹后壮烈跳崖的狼牙山五壮士……岁月长河,历史足迹不容磨灭;时代变迁,英雄精神熠熠发光。当76年前的历史场景在这一天再次重现,当战争创伤在和平年代只能靠记忆的方式还原,每一个中华儿女都已然在心中默默地葆有一份肃穆与庄重。" 17 | model_name = "junnyu/roformer_chinese_base" 18 | tokenizer = RoFormerTokenizer.from_pretrained(model_name) 19 | pt_model = RoFormerForMaskedLM.from_pretrained(model_name) 20 | input_ids = tokenizer(text, return_tensors='pt') 21 | 22 | with torch.no_grad(): 23 | pt_outputs = pt_model(**input_ids).logits[0] 24 | pt_outputs_sentence = "pytorch: " 25 | for i, id in enumerate(tokenizer.encode(text)): 26 | if id == tokenizer.mask_token_id: 27 | tokens = tokenizer.convert_ids_to_tokens(pt_outputs[i].topk(k=5)[1]) 28 | pt_outputs_sentence += "[" + "||".join(tokens) + "]" 29 | else: 30 | pt_outputs_sentence += "".join( 31 | tokenizer.convert_ids_to_tokens([id], skip_special_tokens=True)) 32 | print(pt_outputs_sentence) 33 | 34 | -------------------------------------------------------------------------------- /experiments/single_test/unlim_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: 6 | @date: 2023/2/2 18:58 7 | """ 8 | 9 | # https://github.com/Liadrinz/transformers-unilm 10 | from tqdm import tqdm 11 | 12 | from transformers.trainer_seq2seq import Seq2SeqTrainer 13 | from transformers.training_args import TrainingArguments 14 | 15 | from nlp.utils.tokenization_unilm import UniLMTokenizerLiadrinz as UniLMTokenizer 16 | from nlp.models.unilm_model_liadrinz import UniLMForConditionalGeneration 17 | from 
nlp.processors.unilm_liadrinz_processor import DataCollatorForUniLMSeq2Seq, Seq2SeqDataset 18 | 19 | 20 | # 中文摘要任务生成 21 | news_article = ( 22 | "12月23日,河北石家庄。8岁哥哥轻车熟路哄睡弟弟,姿势标准动作熟练。" 23 | "妈妈杨女士表示:哥哥很喜欢弟弟,因为心思比较细,自己平时带孩子的习惯他都会跟着学习," 24 | "哄睡孩子也都会争着来,技巧很娴熟,两人在一块很有爱,自己感到很幸福,平时帮了自己很大的忙,感恩有这么乖的宝宝。" 25 | ) 26 | 27 | tokenizer = UniLMTokenizer.from_pretrained("Yuang/unilm-base-chinese-news-sum") 28 | model = UniLMForConditionalGeneration.from_pretrained("Yuang/unilm-base-chinese-news-sum") # 在微博新闻摘要数据上fine-tune过的模型 29 | 30 | inputs = tokenizer(news_article, return_tensors="pt") 31 | output_ids = model.generate(**inputs, max_new_tokens=16) 32 | output_text = tokenizer.decode(output_ids[0]) 33 | print(output_text) # "[CLS] [SEP] [SEP]" 34 | news_summary = output_text.split("[SEP]")[1].strip() 35 | print(news_summary) 36 | 37 | # 训练 38 | tokenizer = UniLMTokenizer.from_pretrained("microsoft/unilm-base-cased") 39 | dataset = Seq2SeqDataset(tokenizer, "train.src", "train.tgt", max_src_len=448, max_tgt_len=64) 40 | collator = DataCollatorForUniLMSeq2Seq(tokenizer, mlm=True, mlm_probability=0.7) 41 | model = UniLMForConditionalGeneration.from_pretrained("microsoft/unilm-base-cased") 42 | training_args = TrainingArguments( 43 | output_dir="output_dir", 44 | do_train=True, 45 | per_device_train_batch_size=4, 46 | gradient_accumulation_steps=2, 47 | learning_rate=1e-4, 48 | num_train_epochs=3, 49 | ) 50 | trainer = Seq2SeqTrainer( 51 | model, 52 | args=training_args, 53 | data_collator=collator, 54 | train_dataset=dataset, 55 | tokenizer=tokenizer, 56 | ) 57 | trainer.train() 58 | -------------------------------------------------------------------------------- /license.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Emelyanov Anton 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /nlp/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: __init__.py 6 | Author: czh 7 | Create Date: 2021/9/10 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | -------------------------------------------------------------------------------- /nlp/arguments/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: __init__.py 6 | Author: czh 7 | Create Date: 2021/11/11 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | from nlp.arguments.train_arguments import TrainingArguments 13 | from nlp.arguments.data_arguments import DataArguments 14 | from nlp.arguments.model_arguments import ModelArguments 15 | -------------------------------------------------------------------------------- /nlp/arguments/model_arguments.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: model_arguments 6 | Author: czh 7 | Create Date: 2021/11/11 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | from dataclasses import dataclass, field 13 | from typing import Optional 14 | 15 | 16 | @dataclass 17 | class ModelArguments: 18 | """ 19 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 
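    These are typically parsed together with DataArguments and TrainingArguments via
    HfArgumentParser (see experiments/single_test/argument_test.py for a usage example).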
20 | """ 21 | 22 | model_name_or_path: str = field( 23 | default="hfl/chinese-roberta-wwm-ext", 24 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} 25 | ) 26 | 27 | config_name: Optional[str] = field( 28 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 29 | ) 30 | tokenizer_name: Optional[str] = field( 31 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 32 | ) 33 | cache_dir: Optional[str] = field( 34 | default=None, 35 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 36 | ) 37 | use_fast_tokenizer: bool = field( 38 | default=True, 39 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, 40 | ) 41 | 42 | use_lstm: bool = field( 43 | default=False, 44 | metadata={"help": "Whether or not to use lstm behind lm model"} 45 | ) 46 | dropout_rate: float = field(default=0.5) 47 | crf_learning_rate: float = field(default=3e-3) 48 | model_type: str = field( 49 | default="bert", 50 | metadata={"help": "Specify the encoder type.", "choices": ["bert", "nezha", "roformer", "albert"]} 51 | ) 52 | do_adv: bool = field( 53 | default=False, 54 | metadata={"help": "Whether to adversarial training."} 55 | ) 56 | adv_epsilon: float = field( 57 | default=1.0, 58 | metadata={"help": "Epsilon for adversarial."} 59 | ) 60 | adv_name: str = field(default='word_embeddings', metadata={"help": "name for adversarial layer."}) 61 | soft_label: bool = field(default=False) 62 | loss_type: str = field(default="ce", metadata={"help": "Loss function", "choices": ['lsr', 'focal', 'ce']}) 63 | 64 | # myparams 65 | reserve_p: float = field( 66 | default=1.0, 67 | metadata={"help": "Will use when use child-tuning"} 68 | ) 69 | mode: str = field( 70 | default=None, 71 | metadata={"help": "Specify what mode will be used for Child-Tuning. eg:'ChildTuning-D', 'ChildTuning-F'"} 72 | ) 73 | rdrop_alpha: int = field(default=5, metadata={"help": "Rdrop alpha value, only when use rdrop"}) 74 | rope: bool = field(default=False, metadata={"help": "Whether use RoPositionEmbedding or not"}) 75 | -------------------------------------------------------------------------------- /nlp/callback/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhihao-chen/NLP-experiments/24265c645fcadbc8a3117391823f8cf3b88128c8/nlp/callback/__init__.py -------------------------------------------------------------------------------- /nlp/callback/optimizers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhihao-chen/NLP-experiments/24265c645fcadbc8a3117391823f8cf3b88128c8/nlp/callback/optimizers/__init__.py -------------------------------------------------------------------------------- /nlp/callback/optimizers/novograd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | from torch.optim.optimizer import Optimizer 4 | 5 | 6 | class NovoGrad(Optimizer): 7 | """Implements NovoGrad algorithm. 
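    Reference: Ginsburg et al., "Stochastic Gradient Methods with Layer-wise Adaptive
    Moments for Training of Deep Networks" (arXiv:1905.11286).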
8 | Arguments: 9 | params (iterable): iterable of parameters to optimize or dicts defining 10 | parameter groups 11 | lr (float, optional): learning rate (default: 1e-2) 12 | betas (Tuple[float, float], optional): coefficients used for computing 13 | running averages of gradient and its square (default: (0.95, 0.98)) 14 | eps (float, optional): term added to the denominator to improve 15 | numerical stability (default: 1e-8) 16 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 17 | Example: 18 | model = ResNet() 19 | optimizer = NovoGrad(model.parameters(), lr=1e-2, weight_decay=1e-5) 20 | """ 21 | 22 | def __init__(self, params, lr=0.01, betas=(0.95, 0.98), eps=1e-8, 23 | weight_decay=0, grad_averaging=False): 24 | if lr < 0.0: 25 | raise ValueError("Invalid learning rate: {}".format(lr)) 26 | if not 0.0 <= betas[0] < 1.0: 27 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 28 | if not 0.0 <= betas[1] < 1.0: 29 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 30 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, grad_averaging=grad_averaging) 31 | super().__init__(params, defaults) 32 | 33 | def step(self, closure=None): 34 | loss = None 35 | if closure is not None: 36 | loss = closure() 37 | for group in self.param_groups: 38 | for p in group['params']: 39 | if p.grad is None: 40 | continue 41 | grad = p.grad.data 42 | if grad.is_sparse: 43 | raise RuntimeError('NovoGrad does not support sparse gradients') 44 | state = self.state[p] 45 | g_2 = torch.sum(grad ** 2) 46 | if len(state) == 0: 47 | state['step'] = 0 48 | state['moments'] = grad.div(g_2.sqrt() + group['eps']) + group['weight_decay'] * p.data 49 | state['grads_ema'] = g_2 50 | moments = state['moments'] 51 | grads_ema = state['grads_ema'] 52 | beta1, beta2 = group['betas'] 53 | state['step'] += 1 54 | grads_ema.mul_(beta2).add_(1 - beta2, g_2) 55 | 56 | denom = grads_ema.sqrt().add_(group['eps']) 57 | grad.div_(denom) 58 | # weight decay 59 | if group['weight_decay'] != 0: 60 | decayed_weights = torch.mul(p.data, group['weight_decay']) 61 | grad.add_(decayed_weights) 62 | 63 | # Momentum --> SAG 64 | if group['grad_averaging']: 65 | grad.mul_(1.0 - beta1) 66 | 67 | moments.mul_(beta1).add_(grad) # velocity 68 | 69 | bias_correction1 = 1 - beta1 ** state['step'] 70 | bias_correction2 = 1 - beta2 ** state['step'] 71 | step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 72 | p.data.add_(-step_size, moments) 73 | 74 | return loss 75 | -------------------------------------------------------------------------------- /nlp/callback/optimizers/planradam.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | from torch.optim.optimizer import Optimizer 4 | 5 | 6 | class PlainRAdam(Optimizer): 7 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): 8 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 9 | 10 | super(PlainRAdam, self).__init__(params, defaults) 11 | 12 | def __setstate__(self, state): 13 | super(PlainRAdam, self).__setstate__(state) 14 | 15 | def step(self, closure=None): 16 | 17 | loss = None 18 | if closure is not None: 19 | loss = closure() 20 | 21 | for group in self.param_groups: 22 | 23 | for p in group['params']: 24 | if p.grad is None: 25 | continue 26 | grad = p.grad.data.float() 27 | if grad.is_sparse: 28 | raise RuntimeError('RAdam does not support sparse gradients') 29 | 30 
| p_data_fp32 = p.data.float() 31 | 32 | state = self.state[p] 33 | 34 | if len(state) == 0: 35 | state['step'] = 0 36 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 37 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 38 | else: 39 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 40 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 41 | 42 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 43 | beta1, beta2 = group['betas'] 44 | 45 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 46 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 47 | 48 | state['step'] += 1 49 | beta2_t = beta2 ** state['step'] 50 | n_sma_max = 2 / (1 - beta2) - 1 51 | n_sma = n_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 52 | 53 | if group['weight_decay'] != 0: 54 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 55 | 56 | # more conservative since it's an approximated value 57 | if n_sma >= 5: 58 | step_size = group['lr'] * math.sqrt((1 - beta2_t) * (n_sma - 4) / (n_sma_max - 4) * 59 | (n_sma - 2) / n_sma * 60 | n_sma_max / (n_sma_max - 2)) / (1 - beta1 ** state['step']) 61 | denom = exp_avg_sq.sqrt().add_(group['eps']) 62 | p_data_fp32.addcdiv_(-step_size, exp_avg, denom) 63 | else: 64 | step_size = group['lr'] / (1 - beta1 ** state['step']) 65 | p_data_fp32.add_(-step_size, exp_avg) 66 | 67 | p.data.copy_(p_data_fp32) 68 | 69 | return loss 70 | -------------------------------------------------------------------------------- /nlp/callback/progressbar.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import Dict 3 | 4 | 5 | class ProgressBar(object): 6 | """ 7 | custom progress bar 8 | Example: 9 | pbar = ProgressBar(n_total=30,desc='training') 10 | step = 2 11 | pbar(step=step) 12 | """ 13 | def __init__(self, n_total, width=30, desc='Training'): 14 | self.width = width 15 | self.n_total = n_total 16 | self.start_time = time.time() 17 | self.desc = desc 18 | 19 | def __call__(self, step, info: Dict = None): 20 | now = time.time() 21 | current = step + 1 22 | recv_per = current / self.n_total 23 | bar = f'[{self.desc}] {current}/{self.n_total} [' 24 | if recv_per >= 1: 25 | recv_per = 1 26 | prog_width = int(self.width * recv_per) 27 | if prog_width > 0: 28 | bar += '=' * (prog_width - 1) 29 | if current < self.n_total: 30 | bar += ">" 31 | else: 32 | bar += '=' 33 | bar += '.' 
* (self.width - prog_width) 34 | bar += ']' 35 | show_bar = f"\r{bar}" 36 | time_per_unit = (now - self.start_time) / current 37 | if current < self.n_total: 38 | eta = time_per_unit * (self.n_total - current) 39 | if eta > 3600: 40 | eta_format = ('%d:%02d:%02d' % 41 | (eta // 3600, (eta % 3600) // 60, eta % 60)) 42 | elif eta > 60: 43 | eta_format = '%d:%02d' % (eta // 60, eta % 60) 44 | else: 45 | eta_format = '%ds' % eta 46 | time_info = f' - ETA: {eta_format}' 47 | else: 48 | if time_per_unit >= 1: 49 | time_info = f' {time_per_unit:.1f}s/step' 50 | elif time_per_unit >= 1e-3: 51 | time_info = f' {time_per_unit * 1e3:.1f}ms/step' 52 | else: 53 | time_info = f' {time_per_unit * 1e6:.1f}us/step' 54 | 55 | show_bar += time_info 56 | if len(info) != 0: 57 | show_info = f'{show_bar} ' + \ 58 | "-".join([f' {key}: {value:.4f} ' for key, value in info.items()]) 59 | print(show_info, end='') 60 | else: 61 | print(show_bar, end='') 62 | -------------------------------------------------------------------------------- /nlp/callback/trainingmonitor.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | from typing import Dict 3 | import numpy as np 4 | from pathlib import Path 5 | import matplotlib.pyplot as plt 6 | from nlp.tools.common import load_json 7 | from nlp.tools.common import save_json 8 | 9 | plt.switch_backend('agg') 10 | 11 | 12 | class TrainingMonitor(object): 13 | def __init__(self, file_dir, arch, add_test=False): 14 | """ 15 | 重新开始训练的epoch点 16 | """ 17 | if isinstance(file_dir, Path): 18 | pass 19 | else: 20 | file_dir = Path(file_dir) 21 | file_dir.mkdir(parents=True, exist_ok=True) 22 | 23 | self.arch = arch 24 | self.file_dir = file_dir 25 | self.H = {} 26 | self.add_test = add_test 27 | self.json_path = file_dir / (arch + "_training_monitor.json") 28 | self.paths = {} 29 | 30 | def reset(self, start_at): 31 | if start_at > 0: 32 | if self.json_path is not None: 33 | if self.json_path.exists(): 34 | self.H = load_json(self.json_path) 35 | for k in self.H.keys(): 36 | self.H[k] = self.H[k][:start_at] 37 | 38 | def epoch_step(self, logs: Dict): 39 | for (k, v) in logs.items(): 40 | alist = self.H.get(k, []) 41 | # np.float32会报错 42 | if not isinstance(v, np.float): 43 | v = round(float(v), 4) 44 | alist.append(v) 45 | self.H[k] = alist 46 | 47 | # 写入文件 48 | if self.json_path is not None: 49 | save_json(data=self.H, file_path=self.json_path) 50 | 51 | # 保存train图像 52 | if len(self.H["loss"]) == 1: 53 | self.paths = {key: self.file_dir / (self.arch + f'_{key.upper()}') for key in self.H.keys()} 54 | 55 | if len(self.H["loss"]) > 1: 56 | # 指标变化 57 | # 曲线 58 | # 需要成对出现 59 | keys = [key for key, _ in self.H.items() if '_' not in key] 60 | for key in keys: 61 | array = np.arange(0, len(self.H[key])) 62 | plt.style.use("ggplot") 63 | plt.figure() 64 | plt.plot(array, self.H[key], label=f"train_{key}") 65 | plt.plot(array, self.H[f"valid_{key}"], label=f"valid_{key}") 66 | if self.add_test: 67 | plt.plot(array, self.H[f"test_{key}"], label=f"test_{key}") 68 | plt.legend() 69 | plt.xlabel("Epoch #") 70 | plt.ylabel(key) 71 | plt.title(f"Training {key} [Epoch {len(self.H[key])}]") 72 | plt.savefig(str(self.paths[key])) 73 | plt.close() 74 | -------------------------------------------------------------------------------- /nlp/event_extractor/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | 
File Name: __init__.py 6 | Author: czh 7 | Create Date: 2021/9/23 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | -------------------------------------------------------------------------------- /nlp/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: __init__.py 6 | Author: czh 7 | Create Date: 2021/11/15 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | -------------------------------------------------------------------------------- /nlp/losses/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: __init__ 6 | Author: czh 7 | Create Date: 2021/8/12 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | -------------------------------------------------------------------------------- /nlp/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: __init__ 6 | Author: czh 7 | Create Date: 2021/9/10 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | -------------------------------------------------------------------------------- /nlp/metrics/sematic_match_metric.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: 6 | @date: 2022/6/30 10:54 7 | """ 8 | import numpy as np 9 | import scipy.stats as sta 10 | 11 | 12 | def l2_normalize(vecs): 13 | """标准化 14 | """ 15 | norms = (vecs**2).sum(axis=1, keepdims=True)**0.5 16 | return vecs / np.clip(norms, 1e-8, np.inf) 17 | 18 | 19 | def compute_corrcoef(x, y): 20 | """Spearman相关系数 21 | """ 22 | return sta.spearmanr(x, y).correlation 23 | 24 | 25 | def compute_pearsonr(x, y): 26 | # 输出:(r, p) 27 | # r:相关系数[-1,1]之间 28 | # p:相关系数显著性 29 | # 所有下面的数据选第零位 30 | return sta.pearsonr(x, y)[0] 31 | -------------------------------------------------------------------------------- /nlp/metrics/triplet_distance_metric.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: 6 | @date: 2022/6/30 11:01 7 | """ 8 | # The metric for the triplet loss 9 | from torch.nn import functional as nnf 10 | 11 | 12 | def cosin(x, y): 13 | return 1 - nnf.cosine_similarity(x, y) 14 | 15 | 16 | def euclidean(x, y): 17 | return nnf.pairwise_distance(x, y, p=2) 18 | 19 | 20 | def manhattan(x, y): 21 | return nnf.pairwise_distance(x, y, p=1) 22 | -------------------------------------------------------------------------------- /nlp/models/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: __init__ 6 | Author: czh 7 | Create Date: 2021/8/12 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 
11 | """ 12 | -------------------------------------------------------------------------------- /nlp/models/bert_for_ee.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: bert_for_ee 6 | Author: czh 7 | Create Date: 2021/9/8 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | import torch.nn as nn 13 | 14 | from transformers import BertModel, BertPreTrainedModel, BertConfig, BertTokenizer 15 | 16 | from nlp.models.nezha import NeZhaModel, NeZhaConfig 17 | from nlp.layers.crf import CRF 18 | 19 | 20 | # 参考苏剑林的方法,https://github.com/bojone/lic2020_baselines/blob/master/ee.py 21 | class BertCRFForDuEE1Su(BertPreTrainedModel): 22 | def __init__(self, config, train_config): 23 | super(BertCRFForDuEE1Su, self).__init__(config) 24 | if train_config.model_type == "bert": 25 | self.bert = BertModel(config) 26 | elif train_config.model_type == "nezha": 27 | self.bert = NeZhaModel(config) 28 | else: 29 | raise ValueError("'model_type' must be 'bert' or 'nezha'") 30 | 31 | self.use_lstm = train_config.use_lstm 32 | self.dropout = nn.Dropout(train_config.dropout_rate) 33 | self.classifier = nn.Linear(config.hidden_size, config.num_labels) 34 | self.crf = CRF(num_tags=config.num_labels, batch_first=True) 35 | if self.use_lstm: 36 | self.lstm = nn.LSTM(input_size=config.hidden_size, hidden_size=config.hidden_size // 2, 37 | num_layers=1, bidirectional=True, batch_first=True) 38 | self.init_weights() 39 | 40 | def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): 41 | outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids) 42 | sequence_output = outputs[0] 43 | if self.use_lstm: 44 | sequence_output, _ = self.lstm(sequence_output) 45 | sequence_output = self.dropout(sequence_output) 46 | logits = self.classifier(sequence_output) 47 | outputs = (logits,) 48 | if labels is not None: 49 | loss = -1 * self.crf(emissions=logits, tags=labels, mask=attention_mask) 50 | outputs = (loss,)+outputs 51 | return outputs # (loss), scores 52 | 53 | 54 | MODEL_TYPE_CLASSES = { 55 | "bert": (BertConfig, BertTokenizer, BertCRFForDuEE1Su), 56 | "nezha": (NeZhaConfig, BertTokenizer, BertCRFForDuEE1Su) 57 | } 58 | -------------------------------------------------------------------------------- /nlp/models/idcnn_for_crf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: idcnn_for_crf 6 | Author: czh 7 | Create Date: 2022/2/22 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | import numpy as np 13 | import torch 14 | import torch.nn as nn 15 | from nlp.layers.cnn import IDCNN 16 | from nlp.layers.crf import CRF 17 | from torch.nn import functional as func 18 | 19 | 20 | class IDCNNForCRF(nn.Module): 21 | def __init__(self, 22 | vocab_size, 23 | word_embedding_dim, 24 | word2id, 25 | num_tag, 26 | embedding_file=None, 27 | dropout_rate=0.5, 28 | nil=True): 29 | super(IDCNNForCRF, self).__init__() 30 | self.embedding = nn.Embedding(vocab_size, word_embedding_dim) 31 | self.embedding_file = embedding_file 32 | 33 | self.embedding.weight.data.copy_( 34 | torch.from_numpy( 35 | self.get_embedding(vocab_size, 36 | 
word_embedding_dim, 37 | word2id, 38 | nil))) 39 | self.idcnn = IDCNN(input_size=word_embedding_dim, filters=64) 40 | self.linear = nn.Linear(64, 256) 41 | self.out = nn.Linear(256, num_tag) 42 | 43 | self.crf = CRF(num_tags=num_tag) 44 | self.dropout_layer = nn.Dropout(dropout_rate) 45 | 46 | def forward(self, inputs, length, labels=None): 47 | embeddings = self.embedding(inputs) 48 | embeddings = self.dropout_layer(embeddings) 49 | out = self.idcnn(embeddings, length) 50 | out = self.linear(out) 51 | out = self.out(out) 52 | logits = func.dropout(out, p=0.1, training=self.training) 53 | output = {'logits': logits} 54 | if labels is not None: 55 | loss = -1 * self.crf(emissions=logits, tags=labels) 56 | output["loss"] = loss 57 | return output 58 | 59 | def parse_word_vector(self, word_index, embedding_dim): 60 | pre_trained_wordvector = {} 61 | f = open(self.embedding_file, encoding='utf-8') 62 | fr = f.readlines() 63 | for line in fr[1:]: 64 | lines = line.strip().split(' ') 65 | word = lines[0] 66 | if len(word) == 1: 67 | if word_index.get(word) is not None: 68 | vector = [float(f) for f in lines[1:embedding_dim + 1]] 69 | pre_trained_wordvector[word] = vector 70 | else: 71 | continue 72 | else: 73 | continue 74 | return pre_trained_wordvector 75 | 76 | def get_embedding(self, vocab_size, embedding_dim, word2id, nil=True): 77 | print('Get embedding...') 78 | embedding_matrix = np.zeros((vocab_size, embedding_dim), dtype=np.float32) 79 | if not nil: 80 | pre_trained_wordector = self.parse_word_vector(word2id, embedding_dim) 81 | for word, idx in word2id.items(): 82 | try: 83 | word_vector = pre_trained_wordector[word] 84 | embedding_matrix[id] = word_vector 85 | except: 86 | continue 87 | print('Get embedding done!') 88 | return embedding_matrix 89 | -------------------------------------------------------------------------------- /nlp/processors/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: __init__ 6 | Author: czh 7 | Create Date: 2021/8/17 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | -------------------------------------------------------------------------------- /nlp/processors/predict_process.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: predict_process 6 | Author: czh 7 | Create Date: 2022/2/9 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | # 处理预测结果,例如提取实体和关系 13 | 14 | from typing import Dict, List 15 | 16 | import torch 17 | import numpy as np 18 | 19 | 20 | def global_pointer_entity_extract(pred_logits: torch.Tensor, 21 | id2entity: Dict[int, str], 22 | entity_type_names: dict) -> List[List[dict]]: 23 | batch_size = pred_logits.size(0) 24 | pred_logits = pred_logits.cpu().numpy() 25 | 26 | pred_list = [[] for i in range(batch_size)] 27 | for bs, label_id, start, end in zip(*np.where(pred_logits > 0)): 28 | label = id2entity[label_id] 29 | label_name = entity_type_names[label] 30 | res = {'label': label, 'label_name': label_name, 'start': start, 'end': end} 31 | pred_list[bs].append(res) 32 | 33 | return pred_list 34 | -------------------------------------------------------------------------------- 
/nlp/sentence_transformers/LoggingHandler.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import tqdm 3 | 4 | class LoggingHandler(logging.Handler): 5 | def __init__(self, level=logging.NOTSET): 6 | super().__init__(level) 7 | 8 | def emit(self, record): 9 | try: 10 | msg = self.format(record) 11 | tqdm.tqdm.write(msg) 12 | self.flush() 13 | except (KeyboardInterrupt, SystemExit): 14 | raise 15 | except: 16 | self.handleError(record) -------------------------------------------------------------------------------- /nlp/sentence_transformers/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.3.9" 2 | __DOWNLOAD_SERVER__ = 'https://sbert.net/models/' 3 | from .datasets import SentencesDataset, SentenceLabelDataset, ParallelSentencesDataset 4 | from .LoggingHandler import LoggingHandler 5 | from .SentenceTransformer import SentenceTransformer 6 | from .readers import InputExample 7 | from .cross_encoder.CrossEncoder import CrossEncoder 8 | 9 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/cross_encoder/__init__.py: -------------------------------------------------------------------------------- 1 | from .CrossEncoder import CrossEncoder -------------------------------------------------------------------------------- /nlp/sentence_transformers/cross_encoder/evaluation/CEBinaryClassificationEvaluator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from sklearn.metrics import average_precision_score 3 | from typing import List 4 | import numpy as np 5 | import os 6 | import csv 7 | 8 | from ... import InputExample 9 | from ...evaluation import BinaryClassificationEvaluator 10 | 11 | class CEBinaryClassificationEvaluator: 12 | """ 13 | This evaluator can be used with the CrossEncoder class. 
Given sentence pairs and binary labels (0 and 1), 14 | it compute the average precision and the best possible f1 score 15 | """ 16 | def __init__(self, sentence_pairs: List[List[str]], labels: List[int], name: str=''): 17 | assert len(sentence_pairs) == len(labels) 18 | for label in labels: 19 | assert (label == 0 or label == 1) 20 | 21 | self.sentence_pairs = sentence_pairs 22 | self.labels = np.asarray(labels) 23 | self.name = name 24 | 25 | self.csv_file = "CEBinaryClassificationEvaluator" + ("_" + name if name else '') + "_results.csv" 26 | self.csv_headers = ["epoch", "steps", "Accuracy", "Accuracy_Threshold", "F1", "F1_Threshold", "Precision", "Recall", "Average_Precision"] 27 | 28 | @classmethod 29 | def from_input_examples(cls, examples: List[InputExample], **kwargs): 30 | sentence_pairs = [] 31 | labels = [] 32 | 33 | for example in examples: 34 | sentence_pairs.append(example.texts) 35 | labels.append(example.label) 36 | return cls(sentence_pairs, labels, **kwargs) 37 | 38 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 39 | if epoch != -1: 40 | if steps == -1: 41 | out_txt = " after epoch {}:".format(epoch) 42 | else: 43 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 44 | else: 45 | out_txt = ":" 46 | 47 | logging.info("CEBinaryClassificationEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt) 48 | pred_scores = model.predict(self.sentence_pairs, convert_to_numpy=True, show_progress_bar=False) 49 | 50 | acc, acc_threshold = BinaryClassificationEvaluator.find_best_acc_and_threshold(pred_scores, self.labels, True) 51 | f1, precision, recall, f1_threshold = BinaryClassificationEvaluator.find_best_f1_and_threshold(pred_scores, self.labels, True) 52 | ap = average_precision_score(self.labels, pred_scores) 53 | 54 | logging.info("Accuracy: {:.2f}\t(Threshold: {:.4f})".format(acc * 100, acc_threshold)) 55 | logging.info("F1: {:.2f}\t(Threshold: {:.4f})".format(f1 * 100, f1_threshold)) 56 | logging.info("Precision: {:.2f}".format(precision * 100)) 57 | logging.info("Recall: {:.2f}".format(recall * 100)) 58 | logging.info("Average Precision: {:.2f}\n".format(ap * 100)) 59 | 60 | if output_path is not None: 61 | csv_path = os.path.join(output_path, self.csv_file) 62 | output_file_exists = os.path.isfile(csv_path) 63 | with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 64 | writer = csv.writer(f) 65 | if not output_file_exists: 66 | writer.writerow(self.csv_headers) 67 | 68 | writer.writerow([epoch, steps, acc, acc_threshold, f1, f1_threshold, precision, recall, ap]) 69 | 70 | 71 | return ap -------------------------------------------------------------------------------- /nlp/sentence_transformers/cross_encoder/evaluation/CECorrelationEvaluator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from scipy.stats import pearsonr, spearmanr 3 | from typing import List 4 | import os 5 | import csv 6 | from ... import InputExample 7 | 8 | class CECorrelationEvaluator: 9 | """ 10 | This evaluator can be used with the CrossEncoder class. Given sentence pairs and continuous scores, 11 | it compute the pearson & spearman correlation between the predicted score for the sentence pair 12 | and the gold score. 
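    Example (an illustrative sketch; the model name, the sentence pairs and the scores below are assumptions, not something this class prescribes)::

        from nlp.sentence_transformers.cross_encoder import CrossEncoder
        from nlp.sentence_transformers.readers import InputExample
        from nlp.sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator

        # Pairs annotated with continuous similarity scores (e.g. STS scores scaled to [0, 1])
        dev_examples = [InputExample(texts=['A man is eating.', 'Someone is eating food.'], label=0.9),
                        InputExample(texts=['A man is eating.', 'A plane is landing.'], label=0.1)]

        model = CrossEncoder('bert-base-uncased', num_labels=1)  # single regression output
        evaluator = CECorrelationEvaluator.from_input_examples(dev_examples, name='sts-dev')
        spearman = evaluator(model)  # returns Spearman correlation; pass output_path= to also write the CSV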
13 | """ 14 | def __init__(self, sentence_pairs: List[List[str]], scores: List[float], name: str=''): 15 | self.sentence_pairs = sentence_pairs 16 | self.scores = scores 17 | self.name = name 18 | 19 | self.csv_file = "CECorrelationEvaluator" + ("_" + name if name else '') + "_results.csv" 20 | self.csv_headers = ["epoch", "steps", "Pearson_Correlation", "Spearman_Correlation"] 21 | 22 | @classmethod 23 | def from_input_examples(cls, examples: List[InputExample], **kwargs): 24 | sentence_pairs = [] 25 | scores = [] 26 | 27 | for example in examples: 28 | sentence_pairs.append(example.texts) 29 | scores.append(example.label) 30 | return cls(sentence_pairs, scores, **kwargs) 31 | 32 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 33 | if epoch != -1: 34 | if steps == -1: 35 | out_txt = " after epoch {}:".format(epoch) 36 | else: 37 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 38 | else: 39 | out_txt = ":" 40 | 41 | logging.info("CECorrelationEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt) 42 | pred_scores = model.predict(self.sentence_pairs, convert_to_numpy=True, show_progress_bar=False) 43 | 44 | 45 | eval_pearson, _ = pearsonr(self.scores, pred_scores) 46 | eval_spearman, _ = spearmanr(self.scores, pred_scores) 47 | 48 | logging.info("Correlation:\tPearson: {:.4f}\tSpearman: {:.4f}".format(eval_pearson, eval_spearman)) 49 | 50 | if output_path is not None: 51 | csv_path = os.path.join(output_path, self.csv_file) 52 | output_file_exists = os.path.isfile(csv_path) 53 | with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 54 | writer = csv.writer(f) 55 | if not output_file_exists: 56 | writer.writerow(self.csv_headers) 57 | 58 | writer.writerow([epoch, steps, eval_pearson, eval_spearman]) 59 | 60 | return eval_spearman -------------------------------------------------------------------------------- /nlp/sentence_transformers/cross_encoder/evaluation/CESoftmaxAccuracyEvaluator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import csv 4 | from typing import List 5 | from ... import InputExample 6 | import numpy as np 7 | 8 | class CESoftmaxAccuracyEvaluator: 9 | """ 10 | This evaluator can be used with the CrossEncoder class. 11 | 12 | It is designed for CrossEncoders with 2 or more outputs. It measure the 13 | accuracy of the predict class vs. the gold labels. 
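    Example (an illustrative sketch; the model name, the label scheme and the toy pairs are assumptions, not something this class prescribes)::

        from nlp.sentence_transformers.cross_encoder import CrossEncoder
        from nlp.sentence_transformers.readers import InputExample
        from nlp.sentence_transformers.cross_encoder.evaluation import CESoftmaxAccuracyEvaluator

        # NLI-style pairs with integer class labels, e.g. 0 = contradiction, 1 = entailment, 2 = neutral
        dev_examples = [InputExample(texts=['A man is sleeping.', 'The man is wide awake.'], label=0),
                        InputExample(texts=['A man is eating.', 'Someone is eating.'], label=1)]

        model = CrossEncoder('bert-base-uncased', num_labels=3)  # one logit per class
        evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(dev_examples, name='nli-dev')
        accuracy = evaluator(model)  # argmax over the logits, compared against the gold labels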
14 | """ 15 | def __init__(self, sentence_pairs: List[List[str]], labels: List[int], name: str=''): 16 | self.sentence_pairs = sentence_pairs 17 | self.labels = labels 18 | self.name = name 19 | 20 | self.csv_file = "CESoftmaxAccuracyEvaluator" + ("_" + name if name else '') + "_results.csv" 21 | self.csv_headers = ["epoch", "steps", "Accuracy"] 22 | 23 | @classmethod 24 | def from_input_examples(cls, examples: List[InputExample], **kwargs): 25 | sentence_pairs = [] 26 | labels = [] 27 | 28 | for example in examples: 29 | sentence_pairs.append(example.texts) 30 | labels.append(example.label) 31 | return cls(sentence_pairs, labels, **kwargs) 32 | 33 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 34 | if epoch != -1: 35 | if steps == -1: 36 | out_txt = " after epoch {}:".format(epoch) 37 | else: 38 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 39 | else: 40 | out_txt = ":" 41 | 42 | logging.info("CESoftmaxAccuracyEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt) 43 | pred_scores = model.predict(self.sentence_pairs, convert_to_numpy=True, show_progress_bar=False) 44 | pred_labels = np.argmax(pred_scores, axis=1) 45 | 46 | assert len(pred_labels) == len(self.labels) 47 | 48 | acc = np.sum(pred_labels == self.labels) / len(self.labels) 49 | 50 | logging.info("Accuracy: {:.2f}".format(acc*100)) 51 | 52 | if output_path is not None: 53 | csv_path = os.path.join(output_path, self.csv_file) 54 | output_file_exists = os.path.isfile(csv_path) 55 | with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 56 | writer = csv.writer(f) 57 | if not output_file_exists: 58 | writer.writerow(self.csv_headers) 59 | 60 | writer.writerow([epoch, steps, acc]) 61 | 62 | return acc -------------------------------------------------------------------------------- /nlp/sentence_transformers/cross_encoder/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .CEBinaryClassificationEvaluator import CEBinaryClassificationEvaluator 2 | from .CECorrelationEvaluator import CECorrelationEvaluator 3 | from .CESoftmaxAccuracyEvaluator import CESoftmaxAccuracyEvaluator 4 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/datasets/EncodeDataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | from typing import List, Union 3 | from .. import SentenceTransformer 4 | 5 | 6 | class EncodeDataset(Dataset): 7 | def __init__(self, 8 | sentences: Union[List[str], List[int]], 9 | model: SentenceTransformer, 10 | is_tokenized: bool = True): 11 | """ 12 | EncodeDataset is used by SentenceTransformer.encode method. It just stores 13 | the input texts and returns a tokenized version of it. 14 | """ 15 | self.model = model 16 | self.sentences = sentences 17 | self.is_tokenized = is_tokenized 18 | 19 | 20 | def __getitem__(self, item): 21 | return self.sentences[item] if self.is_tokenized else self.model.tokenize(self.sentences[item]) 22 | 23 | 24 | def __len__(self): 25 | return len(self.sentences) 26 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/datasets/SentencesDataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | from typing import List 3 | import torch 4 | from .. 
import SentenceTransformer 5 | from ..readers.InputExample import InputExample 6 | 7 | class SentencesDataset(Dataset): 8 | """ 9 | Dataset for smart batching, that is each batch is only padded to its longest sequence instead of padding all 10 | sequences to the max length. 11 | The SentenceBertEncoder.smart_batching_collate is required for this to work. 12 | SmartBatchingDataset does *not* work without it. 13 | """ 14 | def __init__(self, 15 | examples: List[InputExample], 16 | model: SentenceTransformer 17 | ): 18 | """ 19 | Create a new SentencesDataset with the tokenized texts and the labels as Tensor 20 | 21 | :param examples 22 | A list of sentence.transformers.readers.InputExample 23 | :param model: 24 | SentenceTransformerModel 25 | """ 26 | self.model = model 27 | self.examples = examples 28 | self.label_type = torch.long if isinstance(self.examples[0].label, int) else torch.float 29 | 30 | 31 | def __getitem__(self, item): 32 | label = torch.tensor(self.examples[item].label, dtype=self.label_type) 33 | if self.examples[item].texts_tokenized is None: 34 | self.examples[item].texts_tokenized = [self.model.tokenize(text) for text in self.examples[item].texts] 35 | 36 | return self.examples[item].texts_tokenized, label 37 | 38 | 39 | def __len__(self): 40 | return len(self.examples) 41 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .sampler import * 2 | from .ParallelSentencesDataset import ParallelSentencesDataset 3 | from .SentenceLabelDataset import SentenceLabelDataset 4 | from .SentencesDataset import SentencesDataset 5 | from .EncodeDataset import EncodeDataset 6 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/datasets/sampler/LabelSampler.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains sampler functions, that can be used to sample mini-batches with specific properties. 3 | """ 4 | from torch.utils.data import Sampler 5 | import numpy as np 6 | from ...datasets import SentenceLabelDataset 7 | 8 | 9 | class LabelSampler(Sampler): 10 | """ 11 | This sampler is used for some specific Triplet Losses like BATCH_HARD_TRIPLET_LOSS 12 | or MULTIPLE_NEGATIVES_RANKING_LOSS which require multiple or only one sample from one label per batch. 13 | 14 | It draws n consecutive, random and unique samples from one label at a time. This is repeated for each label. 15 | 16 | Labels with fewer than n unique samples are ignored. 17 | This also applied to drawing without replacement, once less than n samples remain for a label, it is skipped. 18 | 19 | This *DOES NOT* check if there are more labels than the batch is large or if the batch size is divisible 20 | by the samples drawn per label. 21 | 22 | 23 | """ 24 | def __init__(self, data_source: SentenceLabelDataset, samples_per_label: int = 5, 25 | with_replacement: bool = False): 26 | """ 27 | Creates a LabelSampler for a SentenceLabelDataset. 28 | 29 | :param data_source: 30 | the dataset from which samples are drawn 31 | :param samples_per_label: 32 | the number of consecutive, random and unique samples drawn per label 33 | :param with_replacement: 34 | if this is True, then each sample is drawn at most once (depending on the total number of samples per label). 
35 | if this is False, then one sample can be drawn in multiple draws, but still not multiple times in the same 36 | drawing. 37 | """ 38 | super().__init__(data_source) 39 | self.data_source = data_source 40 | self.samples_per_label = samples_per_label 41 | self.label_range = np.arange(data_source.num_labels) 42 | self.borders = data_source.groups_right_border 43 | self.with_replacement = with_replacement 44 | np.random.shuffle(self.label_range) 45 | 46 | def __iter__(self): 47 | label_idx = 0 48 | count = 0 49 | already_seen = {} 50 | while count < len(self.data_source): 51 | label = self.label_range[label_idx] 52 | if label not in already_seen: 53 | already_seen[label] = set() 54 | 55 | left_border = 0 if label == 0 else self.borders[label-1] 56 | right_border = self.borders[label] 57 | 58 | if self.with_replacement: 59 | selection = np.arange(left_border, right_border) 60 | else: 61 | selection = [i for i in np.arange(left_border, right_border) if i not in already_seen[label]] 62 | 63 | if len(selection) >= self.samples_per_label: 64 | for element_idx in np.random.choice(selection, self.samples_per_label, replace=False): 65 | count += 1 66 | already_seen[label].add(element_idx) 67 | yield element_idx 68 | 69 | label_idx += 1 70 | if label_idx >= len(self.label_range): 71 | label_idx = 0 72 | already_seen = {} 73 | np.random.shuffle(self.label_range) 74 | 75 | def __len__(self): 76 | return len(self.data_source) -------------------------------------------------------------------------------- /nlp/sentence_transformers/datasets/sampler/__init__.py: -------------------------------------------------------------------------------- 1 | from .LabelSampler import * -------------------------------------------------------------------------------- /nlp/sentence_transformers/evaluation/LabelAccuracyEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator 2 | import torch 3 | from torch.utils.data import DataLoader 4 | import logging 5 | from tqdm import tqdm 6 | from ..util import batch_to_device 7 | import os 8 | import csv 9 | 10 | class LabelAccuracyEvaluator(SentenceEvaluator): 11 | """ 12 | Evaluate a model based on its accuracy on a labeled dataset 13 | 14 | This requires a model with LossFunction.SOFTMAX 15 | 16 | The results are written in a CSV. If a CSV already exists, then values are appended. 
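    Example (an illustrative sketch; the model name, the toy pairs and the SoftmaxLoss wiring are assumptions, not something this class prescribes)::

        from torch.utils.data import DataLoader
        from nlp.sentence_transformers import SentenceTransformer, SentencesDataset, losses
        from nlp.sentence_transformers.readers import InputExample
        from nlp.sentence_transformers.evaluation import LabelAccuracyEvaluator

        model = SentenceTransformer('distilbert-base-nli-mean-tokens')
        dev_examples = [InputExample(texts=['A man is eating.', 'Someone is eating.'], label=1),
                        InputExample(texts=['A man is eating.', 'A plane is landing.'], label=0)]
        dev_dataloader = DataLoader(SentencesDataset(dev_examples, model), batch_size=16)

        # Accuracy is computed through the classification head, so the evaluator
        # needs the same SoftmaxLoss module that is used during training.
        train_loss = losses.SoftmaxLoss(model,
                                        sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
                                        num_labels=2)
        evaluator = LabelAccuracyEvaluator(dev_dataloader, name='dev', softmax_model=train_loss)
        accuracy = evaluator(model)  # pass output_path= to also append the result to a CSV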
17 | """ 18 | 19 | def __init__(self, dataloader: DataLoader, name: str = "", softmax_model = None): 20 | """ 21 | Constructs an evaluator for the given dataset 22 | 23 | :param dataloader: 24 | the data for the evaluation 25 | """ 26 | self.dataloader = dataloader 27 | self.name = name 28 | self.softmax_model = softmax_model 29 | 30 | if name: 31 | name = "_"+name 32 | 33 | self.csv_file = "accuracy_evaluation"+name+"_results.csv" 34 | self.csv_headers = ["epoch", "steps", "accuracy"] 35 | 36 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 37 | model.eval() 38 | total = 0 39 | correct = 0 40 | 41 | if epoch != -1: 42 | if steps == -1: 43 | out_txt = " after epoch {}:".format(epoch) 44 | else: 45 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 46 | else: 47 | out_txt = ":" 48 | 49 | logging.info("Evaluation on the "+self.name+" dataset"+out_txt) 50 | self.dataloader.collate_fn = model.smart_batching_collate 51 | for step, batch in enumerate(tqdm(self.dataloader, desc="Evaluating")): 52 | features, label_ids = batch_to_device(batch, model.device) 53 | with torch.no_grad(): 54 | _, prediction = self.softmax_model(features, labels=None) 55 | 56 | total += prediction.size(0) 57 | correct += torch.argmax(prediction, dim=1).eq(label_ids).sum().item() 58 | accuracy = correct/total 59 | 60 | logging.info("Accuracy: {:.4f} ({}/{})\n".format(accuracy, correct, total)) 61 | 62 | if output_path is not None: 63 | csv_path = os.path.join(output_path, self.csv_file) 64 | if not os.path.isfile(csv_path): 65 | with open(csv_path, mode="w", encoding="utf-8") as f: 66 | writer = csv.writer(f) 67 | writer.writerow(self.csv_headers) 68 | writer.writerow([epoch, steps, accuracy]) 69 | else: 70 | with open(csv_path, mode="a", encoding="utf-8") as f: 71 | writer = csv.writer(f) 72 | writer.writerow([epoch, steps, accuracy]) 73 | 74 | return accuracy 75 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/evaluation/MSEEvaluator.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers.evaluation import SentenceEvaluator 2 | import numpy as np 3 | import logging 4 | import os 5 | import csv 6 | from typing import List 7 | 8 | class MSEEvaluator(SentenceEvaluator): 9 | """ 10 | Computes the mean squared error (x100) between the computed sentence embedding 11 | and some target sentence embedding. 12 | 13 | The MSE is computed between ||teacher.encode(source_sentences) - student.encode(target_sentences)||. 14 | 15 | For multilingual knowledge distillation (https://arxiv.org/abs/2004.09813), source_sentences are in English 16 | and target_sentences are in a different language like German, Chinese, Spanish... 17 | 18 | :param source_sentences: Source sentences are embedded with the teacher model 19 | :param target_sentences: Target sentences are ambedding with the student model. 
20 | :param show_progress_bar: Show progress bar when computing embeddings 21 | :param batch_size: Batch size to compute sentence embeddings 22 | :param name: Name of the evaluator 23 | """ 24 | def __init__(self, source_sentences: List[str], target_sentences: List[str], teacher_model = None, show_progress_bar: bool = False, batch_size: int = 32, name: str = ''): 25 | self.source_embeddings = teacher_model.encode(source_sentences, show_progress_bar=show_progress_bar, batch_size=batch_size, convert_to_numpy=True) 26 | 27 | self.target_sentences = target_sentences 28 | self.show_progress_bar = show_progress_bar 29 | self.batch_size = batch_size 30 | self.name = name 31 | 32 | self.csv_file = "mse_evaluation_" + name + "_results.csv" 33 | self.csv_headers = ["epoch", "steps", "MSE"] 34 | 35 | def __call__(self, model, output_path, epoch = -1, steps = -1): 36 | if epoch != -1: 37 | if steps == -1: 38 | out_txt = " after epoch {}:".format(epoch) 39 | else: 40 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 41 | else: 42 | out_txt = ":" 43 | 44 | target_embeddings = model.encode(self.target_sentences, show_progress_bar=self.show_progress_bar, batch_size=self.batch_size, convert_to_numpy=True) 45 | 46 | mse = ((self.source_embeddings - target_embeddings)**2).mean() 47 | mse *= 100 48 | 49 | logging.info("MSE evaluation (lower = better) on "+self.name+" dataset"+out_txt) 50 | logging.info("MSE (*100):\t{:4f}".format(mse)) 51 | 52 | if output_path is not None: 53 | csv_path = os.path.join(output_path, self.csv_file) 54 | output_file_exists = os.path.isfile(csv_path) 55 | with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 56 | writer = csv.writer(f) 57 | if not output_file_exists: 58 | writer.writerow(self.csv_headers) 59 | 60 | writer.writerow([epoch, steps, mse]) 61 | 62 | return -mse #Return negative score as SentenceTransformers maximizes the performance -------------------------------------------------------------------------------- /nlp/sentence_transformers/evaluation/SentenceEvaluator.py: -------------------------------------------------------------------------------- 1 | class SentenceEvaluator: 2 | """ 3 | Base class for all evaluators 4 | 5 | Extend this class and implement __call__ for custom evaluators. 6 | """ 7 | 8 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 9 | """ 10 | This is called during training to evaluate the model. 11 | It returns a score for the evaluation with a higher score indicating a better result. 12 | 13 | :param model: 14 | the model to evaluate 15 | :param output_path: 16 | path where predictions and metrics are written to 17 | :param epoch 18 | the epoch where the evaluation takes place. 19 | This is used for the file prefixes. 20 | If this is -1, then we assume evaluation on test data. 21 | :param steps 22 | the steps in the current epoch at time of the evaluation. 23 | This is used for the file prefixes. 24 | If this is -1, then we assume evaluation at the end of the epoch. 25 | :return: a score for the evaluation with a higher score indicating a better result 26 | """ 27 | pass 28 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/evaluation/SequentialEvaluator.py: -------------------------------------------------------------------------------- 1 | from . 
import SentenceEvaluator 2 | from typing import Iterable 3 | 4 | class SequentialEvaluator(SentenceEvaluator): 5 | """ 6 | This evaluator allows that multiple sub-evaluators are passed. When the model is evaluated, 7 | the data is passed sequentially to all sub-evaluators. 8 | 9 | All scores are passed to 'main_score_function', which derives one final score value 10 | """ 11 | def __init__(self, evaluators: Iterable[SentenceEvaluator], main_score_function = lambda scores: scores[-1]): 12 | self.evaluators = evaluators 13 | self.main_score_function = main_score_function 14 | 15 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 16 | scores = [] 17 | for evaluator in self.evaluators: 18 | scores.append(evaluator(model, output_path, epoch, steps)) 19 | 20 | return self.main_score_function(scores) 21 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/evaluation/SimilarityFunction.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | class SimilarityFunction(Enum): 4 | COSINE = 0 5 | EUCLIDEAN = 1 6 | MANHATTAN = 2 7 | DOT_PRODUCT = 3 8 | 9 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .SentenceEvaluator import SentenceEvaluator 2 | from .SimilarityFunction import SimilarityFunction 3 | 4 | from .BinaryClassificationEvaluator import BinaryClassificationEvaluator 5 | from .EmbeddingSimilarityEvaluator import EmbeddingSimilarityEvaluator 6 | 7 | from .InformationRetrievalEvaluator import InformationRetrievalEvaluator 8 | from .LabelAccuracyEvaluator import LabelAccuracyEvaluator 9 | from .MSEEvaluator import MSEEvaluator 10 | from .MSEEvaluatorFromDataFrame import MSEEvaluatorFromDataFrame 11 | from .ParaphraseMiningEvaluator import ParaphraseMiningEvaluator 12 | from .SequentialEvaluator import SequentialEvaluator 13 | from .TranslationEvaluator import TranslationEvaluator 14 | from .TripletEvaluator import TripletEvaluator 15 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/losses/ContrastiveLoss.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import Iterable, Dict 3 | 4 | import torch.nn.functional as F 5 | from torch import nn, Tensor 6 | 7 | from sentence_transformers.SentenceTransformer import SentenceTransformer 8 | 9 | 10 | class SiameseDistanceMetric(Enum): 11 | """ 12 | The metric for the contrastive loss 13 | """ 14 | EUCLIDEAN = lambda x, y: F.pairwise_distance(x, y, p=2) 15 | MANHATTAN = lambda x, y: F.pairwise_distance(x, y, p=1) 16 | COSINE_DISTANCE = lambda x, y: 1-F.cosine_similarity(x, y) 17 | 18 | 19 | class ContrastiveLoss(nn.Module): 20 | """ 21 | Contrastive loss. Expects as input two texts and a label of either 0 or 1. If the label == 1, then the distance between the 22 | two embeddings is reduced. If the label == 0, then the distance between the embeddings is increased. 23 | 24 | Further information: http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf 25 | 26 | :param model: SentenceTransformer model 27 | :param distance_metric: Function that returns a distance between two emeddings. 
The class SiameseDistanceMetric contains pre-defined metrices that can be used 28 | :param margin: Negative samples (label == 0) should have a distance of at least the margin value. 29 | :param size_average: Average by the size of the mini-batch. 30 | 31 | Example:: 32 | 33 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses 34 | from sentence_transformers.readers import InputExample 35 | 36 | model = SentenceTransformer('distilbert-base-nli-mean-tokens') 37 | train_examples = [InputExample(texts=['This is a positive pair', 'Where the distance will be minimized'], label=1), 38 | InputExample(texts=['This is a negative pair', 'Their distance will be increased'], label=0)] 39 | train_dataset = SentencesDataset(train_examples, model) 40 | train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size) 41 | train_loss = losses.ContrastiveLoss(model=model) 42 | 43 | """ 44 | 45 | def __init__(self, model: SentenceTransformer, distance_metric=SiameseDistanceMetric.COSINE_DISTANCE, margin: float = 0.5, size_average:bool = True): 46 | super(ContrastiveLoss, self).__init__() 47 | self.distance_metric = distance_metric 48 | self.margin = margin 49 | self.model = model 50 | self.size_average = size_average 51 | 52 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 53 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 54 | assert len(reps) == 2 55 | rep_anchor, rep_other = reps 56 | distances = self.distance_metric(rep_anchor, rep_other) 57 | losses = 0.5 * (labels.float() * distances.pow(2) + (1 - labels).float() * F.relu(self.margin - distances).pow(2)) 58 | return losses.mean() if self.size_average else losses.sum() 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/losses/CosineSimilarityLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Iterable, Dict 4 | from ..SentenceTransformer import SentenceTransformer 5 | 6 | 7 | class CosineSimilarityLoss(nn.Module): 8 | """ 9 | CosineSimilarityLoss expects, that the InputExamples consists of two texts and a float label. 10 | 11 | It computes the vectors u = model(input_text[0]) and v = model(input_text[1]) and measures the cosine-similarity between the two. 12 | By default, it minimizes the following loss: ||input_label - cos_score_transformation(cosine_sim(u,v))||_2. 13 | 14 | :param model: SentenceTranformer model 15 | :param loss_fct: Which pytorch loss function should be used to compare the cosine_similartiy(u,v) with the input_label? By default, MSE: ||input_label - cosine_sim(u,v)||_2 16 | :param cos_score_transformation: The cos_score_transformation function is applied on top of cosine_similarity. By default, the identify function is used (i.e. no change). 
17 | 18 | Example:: 19 | 20 | from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses 21 | 22 | model = SentenceTransformer('distilbert-base-nli-mean-tokens') 23 | train_examples = [InputExample(texts=['My first sentence', 'My second sentence'], label=0.8), 24 | InputExample(texts=['Another pair', 'Unrelated sentence'], label=0.3)] 25 | train_dataset = SentencesDataset(train_examples, model) 26 | train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size) 27 | train_loss = losses.CosineSimilarityLoss(model=model) 28 | 29 | 30 | """ 31 | def __init__(self, model: SentenceTransformer, loss_fct = nn.MSELoss(), cos_score_transformation=nn.Identity()): 32 | super(CosineSimilarityLoss, self).__init__() 33 | self.model = model 34 | self.loss_fct = loss_fct 35 | self.cos_score_transformation = cos_score_transformation 36 | 37 | 38 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 39 | embeddings = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 40 | output = self.cos_score_transformation(torch.cosine_similarity(embeddings[0], embeddings[1])) 41 | return self.loss_fct(output, labels.view(-1)) 42 | 43 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/losses/MSELoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | 5 | 6 | class MSELoss(nn.Module): 7 | """ 8 | Computes the MSE loss between the computed sentence embedding and a target sentence embedding. This loss 9 | is used when extending sentence embeddings to new languages as described in our publication 10 | Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation: https://arxiv.org/abs/2004.09813 11 | 12 | For an example, see the documentation on extending language models to new languages. 13 | """ 14 | def __init__(self, model): 15 | super(MSELoss, self).__init__() 16 | self.model = model 17 | self.loss_fct = nn.MSELoss() 18 | 19 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 20 | rep = self.model(sentence_features[0])['sentence_embedding'] 21 | return self.loss_fct(rep, labels) 22 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/losses/OnlineContrastiveLoss.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, Dict 2 | import torch.nn.functional as F 3 | from torch import nn, Tensor 4 | from .ContrastiveLoss import SiameseDistanceMetric 5 | from sentence_transformers.SentenceTransformer import SentenceTransformer 6 | 7 | 8 | class OnlineContrastiveLoss(nn.Module): 9 | """ 10 | Online Contrastive loss. Similar to ConstrativeLoss, but it selects hard positive (positives that are far apart) 11 | and hard negative pairs (negatives that are close) and computes the loss only for these pairs. Often yields 12 | better performances than ConstrativeLoss. 13 | 14 | :param model: SentenceTransformer model 15 | :param distance_metric: Function that returns a distance between two emeddings. The class SiameseDistanceMetric contains pre-defined metrices that can be used 16 | :param margin: Negative samples (label == 0) should have a distance of at least the margin value. 
17 | :param size_average: Average by the size of the mini-batch. 18 | 19 | Example:: 20 | 21 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses 22 | from sentence_transformers.readers import InputExample 23 | 24 | model = SentenceTransformer('distilbert-base-nli-mean-tokens') 25 | train_examples = [InputExample(texts=['This is a positive pair', 'Where the distance will be minimized'], label=1), 26 | InputExample(texts=['This is a negative pair', 'Their distance will be increased'], label=0)] 27 | train_dataset = SentencesDataset(train_examples, model) 28 | train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size) 29 | train_loss = losses.OnlineContrastiveLoss(model=model) 30 | """ 31 | 32 | def __init__(self, model: SentenceTransformer, distance_metric=SiameseDistanceMetric.COSINE_DISTANCE, margin: float = 0.5): 33 | super(OnlineContrastiveLoss, self).__init__() 34 | self.model = model 35 | self.margin = margin 36 | self.distance_metric = distance_metric 37 | 38 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor, size_average=False): 39 | embeddings = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 40 | 41 | distance_matrix = self.distance_metric(embeddings[0], embeddings[1]) 42 | negs = distance_matrix[labels == 0] 43 | poss = distance_matrix[labels == 1] 44 | 45 | # select hard positive and hard negative pairs 46 | negative_pairs = negs[negs < (poss.max() if len(poss) > 1 else negs.mean())] 47 | positive_pairs = poss[poss > (negs.min() if len(negs) > 1 else poss.mean())] 48 | 49 | positive_loss = positive_pairs.pow(2).sum() 50 | negative_loss = F.relu(self.margin - negative_pairs).pow(2).sum() 51 | loss = positive_loss + negative_loss 52 | return loss -------------------------------------------------------------------------------- /nlp/sentence_transformers/losses/TripletLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import torch.nn.functional as F 5 | from enum import Enum 6 | from ..SentenceTransformer import SentenceTransformer 7 | 8 | class TripletDistanceMetric(Enum): 9 | """ 10 | The metric for the triplet loss 11 | """ 12 | COSINE = lambda x, y: 1 - F.cosine_similarity(x, y) 13 | EUCLIDEAN = lambda x, y: F.pairwise_distance(x, y, p=2) 14 | MANHATTAN = lambda x, y: F.pairwise_distance(x, y, p=1) 15 | 16 | class TripletLoss(nn.Module): 17 | """ 18 | This class implements triplet loss. Given a triplet of (anchor, positive, negative), 19 | the loss minimizes the distance between anchor and positive while it maximizes the distance 20 | between anchor and negative. It compute the following loss function: 21 | 22 | loss = max(||anchor - positive|| - ||anchor - negative|| + margin, 0). 23 | 24 | Margin is an important hyperparameter and needs to be tuned respectively. 25 | 26 | For further details, see: https://en.wikipedia.org/wiki/Triplet_loss 27 | 28 | :param model: SentenceTransformerModel 29 | :param distance_metric: Function to compute distance between two embeddings. The class TripletDistanceMetric contains common distance metrices that can be used. 30 | :param triplet_margin: The negative should be at least this much further away from the anchor than the positive. 
31 | 32 | Example:: 33 | 34 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses 35 | from sentence_transformers.readers import InputExample 36 | 37 | model = SentenceTransformer('distilbert-base-nli-mean-tokens') 38 | train_examples = [InputExample(texts=['Anchor 1', 'Positive 1', 'Negative 1']), 39 | InputExample(texts=['Anchor 2', 'Positive 2', 'Negative 2'])] 40 | train_dataset = SentencesDataset(train_examples, model) 41 | train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size) 42 | train_loss = losses.TripletLoss(model=model) 43 | """ 44 | def __init__(self, model: SentenceTransformer, distance_metric=TripletDistanceMetric.EUCLIDEAN, triplet_margin: float = 5): 45 | super(TripletLoss, self).__init__() 46 | self.model = model 47 | self.distance_metric = distance_metric 48 | self.triplet_margin = triplet_margin 49 | 50 | 51 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 52 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 53 | 54 | rep_anchor, rep_pos, rep_neg = reps 55 | distance_pos = self.distance_metric(rep_anchor, rep_pos) 56 | distance_neg = self.distance_metric(rep_anchor, rep_neg) 57 | 58 | losses = F.relu(distance_pos - distance_neg + self.triplet_margin) 59 | return losses.mean() -------------------------------------------------------------------------------- /nlp/sentence_transformers/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .CosineSimilarityLoss import * 2 | from .SoftmaxLoss import * 3 | from .AdvCLSoftmaxLoss import * 4 | from .MultipleNegativesRankingLoss import * 5 | from .TripletLoss import * 6 | from .MSELoss import * 7 | from .ContrastiveLoss import * 8 | from .OnlineContrastiveLoss import * 9 | from .MegaBatchMarginLoss import * 10 | 11 | # Triplet losses 12 | from .BatchHardTripletLoss import * 13 | from .BatchHardSoftMarginTripletLoss import * 14 | from .BatchSemiHardTripletLoss import * 15 | from .BatchAllTripletLoss import * 16 | from .SimSiamLoss import * 17 | from .SimCLRLoss import * -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/ALBERT.py: -------------------------------------------------------------------------------- 1 | from . import Transformer 2 | 3 | class ALBERT(Transformer): 4 | """ 5 | DEPRECATED: Please use models.Transformer instead. 6 | """ 7 | pass 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/BERT.py: -------------------------------------------------------------------------------- 1 | from . import Transformer 2 | 3 | class BERT(Transformer): 4 | """ 5 | DEPRECATED: Please use models.Transformer instead. 6 | """ 7 | pass 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/BoW.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | import logging 8 | import numpy as np 9 | from .tokenizer import WhitespaceTokenizer 10 | 11 | class BoW(nn.Module): 12 | """Implements a Bag-of-Words (BoW) model to derive sentence embeddings. 
13 | 14 | A weighting can be added to allow the generation of tf-idf vectors. The output vector has the size of the vocab. 15 | """ 16 | 17 | def __init__(self, vocab: List[str], word_weights: Dict[str, float] = {}, unknown_word_weight: float = 1, cumulative_term_frequency: bool = True): 18 | super(BoW, self).__init__() 19 | vocab = list(set(vocab)) #Ensure vocab is unique 20 | self.config_keys = ['vocab', 'word_weights', 'unknown_word_weight', 'cumulative_term_frequency'] 21 | self.vocab = vocab 22 | self.word_weights = word_weights 23 | self.unknown_word_weight = unknown_word_weight 24 | self.cumulative_term_frequency = cumulative_term_frequency 25 | 26 | #Maps wordIdx -> word weight 27 | self.weights = [] 28 | num_unknown_words = 0 29 | for word in vocab: 30 | weight = unknown_word_weight 31 | if word in word_weights: 32 | weight = word_weights[word] 33 | elif word.lower() in word_weights: 34 | weight = word_weights[word.lower()] 35 | else: 36 | num_unknown_words += 1 37 | self.weights.append(weight) 38 | 39 | logging.info("{} out of {} words without a weighting value. Set weight to {}".format(num_unknown_words, len(vocab), unknown_word_weight)) 40 | 41 | self.tokenizer = WhitespaceTokenizer(vocab, stop_words=set(), do_lower_case=False) 42 | self.sentence_embedding_dimension = len(vocab) 43 | 44 | 45 | def forward(self, features: Dict[str, Tensor]): 46 | #Nothing to do, everything is done in get_sentence_features 47 | return features 48 | 49 | def tokenize(self, text: str) -> List[int]: 50 | return self.tokenizer.tokenize(text) 51 | 52 | def get_sentence_embedding_dimension(self): 53 | return self.sentence_embedding_dimension 54 | 55 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 56 | vector = np.zeros(self.get_sentence_embedding_dimension(), dtype=np.float32) 57 | for token in tokens: 58 | if self.cumulative_term_frequency: 59 | vector[token] += self.weights[token] 60 | else: 61 | vector[token] = self.weights[token] 62 | 63 | return {'sentence_embedding': torch.tensor([vector], dtype=torch.float)} 64 | 65 | def get_config_dict(self): 66 | return {key: self.__dict__[key] for key in self.config_keys} 67 | 68 | def save(self, output_path): 69 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 70 | json.dump(self.get_config_dict(), fOut, indent=2) 71 | 72 | @staticmethod 73 | def load(input_path): 74 | with open(os.path.join(input_path, 'config.json')) as fIn: 75 | config = json.load(fIn) 76 | 77 | return BoW(**config) -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/CNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import logging 5 | import gzip 6 | from tqdm import tqdm 7 | import numpy as np 8 | import os 9 | import json 10 | from ..util import import_from_string, fullname, http_get 11 | from .tokenizer import WordTokenizer, WhitespaceTokenizer 12 | 13 | 14 | class CNN(nn.Module): 15 | """CNN-layer with multiple kernel-sizes over the word embeddings""" 16 | 17 | def __init__(self, in_word_embedding_dimension: int, out_channels: int = 256, kernel_sizes: List[int] = [1, 3, 5]): 18 | nn.Module.__init__(self) 19 | self.config_keys = ['in_word_embedding_dimension', 'out_channels', 'kernel_sizes'] 20 | self.in_word_embedding_dimension = in_word_embedding_dimension 21 | self.out_channels = out_channels 22 | self.kernel_sizes = 
kernel_sizes 23 | 24 | self.embeddings_dimension = out_channels*len(kernel_sizes) 25 | self.convs = nn.ModuleList() 26 | 27 | in_channels = in_word_embedding_dimension 28 | for kernel_size in kernel_sizes: 29 | padding_size = int((kernel_size - 1) / 2) 30 | conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, 31 | padding=padding_size) 32 | self.convs.append(conv) 33 | 34 | def forward(self, features): 35 | token_embeddings = features['token_embeddings'] 36 | 37 | token_embeddings = token_embeddings.transpose(1, -1) 38 | vectors = [conv(token_embeddings) for conv in self.convs] 39 | out = torch.cat(vectors, 1).transpose(1, -1) 40 | 41 | features.update({'token_embeddings': out}) 42 | return features 43 | 44 | def get_word_embedding_dimension(self) -> int: 45 | return self.embeddings_dimension 46 | 47 | def tokenize(self, text: str) -> List[int]: 48 | raise NotImplementedError() 49 | 50 | def save(self, output_path: str): 51 | with open(os.path.join(output_path, 'cnn_config.json'), 'w') as fOut: 52 | json.dump(self.get_config_dict(), fOut, indent=2) 53 | 54 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 55 | 56 | def get_config_dict(self): 57 | return {key: self.__dict__[key] for key in self.config_keys} 58 | 59 | @staticmethod 60 | def load(input_path: str): 61 | with open(os.path.join(input_path, 'cnn_config.json'), 'r') as fIn: 62 | config = json.load(fIn) 63 | 64 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin')) 65 | model = CNN(**config) 66 | model.load_state_dict(weights) 67 | return model 68 | 69 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/CamemBERT.py: -------------------------------------------------------------------------------- 1 | from . import Transformer 2 | 3 | 4 | class CamemBERT(Transformer): 5 | """ 6 | DEPRECATED: Please use models.Transformer instead. 7 | """ 8 | pass 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/Dense.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from torch import functional as F 5 | from typing import Union, Tuple, List, Iterable, Dict 6 | import os 7 | import json 8 | from ..util import fullname, import_from_string 9 | 10 | 11 | class Dense(nn.Module): 12 | """Feed-forward function with activiation function. 13 | 14 | This layer takes a fixed-sized sentence embedding and passes it through a feed-forward layer. Can be used to generate deep averaging networs (DAN). 
15 | 16 | :param in_features: Size of the input dimension 17 | :param out_features: Output size 18 | :param bias: Add a bias vector 19 | :param activation_function: Pytorch activation function applied on output 20 | """ 21 | def __init__(self, in_features: int, out_features: int, bias: bool = True, activation_function=nn.Tanh()): 22 | super(Dense, self).__init__() 23 | self.in_features = in_features 24 | self.out_features = out_features 25 | self.bias = bias 26 | self.activation_function = activation_function 27 | self.linear = nn.Linear(in_features, out_features, bias=bias) 28 | 29 | def forward(self, features: Dict[str, Tensor]): 30 | features.update({'sentence_embedding': self.activation_function(self.linear(features['sentence_embedding']))}) 31 | return features 32 | 33 | def get_sentence_embedding_dimension(self) -> int: 34 | return self.out_features 35 | 36 | def save(self, output_path): 37 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 38 | json.dump({'in_features': self.in_features, 'out_features': self.out_features, 'bias': self.bias, 'activation_function': fullname(self.activation_function)}, fOut) 39 | 40 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 41 | 42 | @staticmethod 43 | def load(input_path): 44 | with open(os.path.join(input_path, 'config.json')) as fIn: 45 | config = json.load(fIn) 46 | 47 | config['activation_function'] = import_from_string(config['activation_function'])() 48 | model = Dense(**config) 49 | model.load_state_dict(torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu'))) 50 | return model 51 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/DistilBERT.py: -------------------------------------------------------------------------------- 1 | from . import Transformer 2 | 3 | class DistilBERT(Transformer): 4 | """ 5 | DEPRECATED: Please use models.Transformer instead. 6 | """ 7 | pass 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/LSTM.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from typing import List 4 | import os 5 | import json 6 | 7 | 8 | 9 | class LSTM(nn.Module): 10 | """ 11 | Bidirectional LSTM running over word embeddings. 
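    An illustrative sketch of where this layer usually sits, between a word-embedding module and a pooling
    module (the GloVe file path and the dimensions are assumptions, and it assumes the vendored
    WordEmbeddings keeps the upstream from_text_file helper)::

        from nlp.sentence_transformers import SentenceTransformer, models

        word_emb = models.WordEmbeddings.from_text_file('glove.6B.300d.txt.gz')  # hypothetical embedding file
        lstm = models.LSTM(word_embedding_dimension=word_emb.get_word_embedding_dimension(), hidden_dim=1024)
        pooling = models.Pooling(lstm.get_word_embedding_dimension(),
                                 pooling_mode_mean_tokens=False, pooling_mode_max_tokens=True)
        model = SentenceTransformer(modules=[word_emb, lstm, pooling])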
12 | """ 13 | def __init__(self, word_embedding_dimension: int, hidden_dim: int, num_layers: int = 1, dropout: float = 0, bidirectional: bool = True): 14 | nn.Module.__init__(self) 15 | self.config_keys = ['word_embedding_dimension', 'hidden_dim', 'num_layers', 'dropout', 'bidirectional'] 16 | self.word_embedding_dimension = word_embedding_dimension 17 | self.hidden_dim = hidden_dim 18 | self.num_layers = num_layers 19 | self.dropout = dropout 20 | self.bidirectional = bidirectional 21 | 22 | self.embeddings_dimension = hidden_dim 23 | if self.bidirectional: 24 | self.embeddings_dimension *= 2 25 | 26 | self.encoder = nn.LSTM(word_embedding_dimension, hidden_dim, num_layers=num_layers, dropout=dropout, bidirectional=bidirectional, batch_first=True) 27 | 28 | def forward(self, features): 29 | token_embeddings = features['token_embeddings'] 30 | sentence_lengths = torch.clamp(features['sentence_lengths'], min=1) 31 | 32 | packed = nn.utils.rnn.pack_padded_sequence(token_embeddings, sentence_lengths, batch_first=True, enforce_sorted=False) 33 | packed = self.encoder(packed) 34 | unpack = nn.utils.rnn.pad_packed_sequence(packed[0], batch_first=True)[0] 35 | features.update({'token_embeddings': unpack}) 36 | return features 37 | 38 | def get_word_embedding_dimension(self) -> int: 39 | return self.embeddings_dimension 40 | 41 | def tokenize(self, text: str) -> List[int]: 42 | raise NotImplementedError() 43 | 44 | def save(self, output_path: str): 45 | with open(os.path.join(output_path, 'lstm_config.json'), 'w') as fOut: 46 | json.dump(self.get_config_dict(), fOut, indent=2) 47 | 48 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 49 | 50 | def get_config_dict(self): 51 | return {key: self.__dict__[key] for key in self.config_keys} 52 | 53 | @staticmethod 54 | def load(input_path: str): 55 | with open(os.path.join(input_path, 'lstm_config.json'), 'r') as fIn: 56 | config = json.load(fIn) 57 | 58 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin')) 59 | model = LSTM(**config) 60 | model.load_state_dict(weights) 61 | return model 62 | 63 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/MLP3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import os 4 | import json 5 | from typing import Union, Tuple, List, Iterable, Dict 6 | from torch import Tensor 7 | 8 | class MLP3(nn.Module): 9 | def __init__(self, hidden_dim=2048, norm=None, activation='relu'): 10 | super().__init__() 11 | ''' page 3 baseline setting 12 | Projection MLP. The projection MLP (in f) has BN ap- 13 | plied to each fully-connected (fc) layer, including its out- 14 | put fc. Its output fc has no ReLU. The hidden fc is 2048-d. 15 | This MLP has 3 layers. 
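    An illustrative sketch of applying this projection head to a batch of vectors (the batch size and the
    768-d hidden size are assumptions; the module reads and rewrites features['token_embeddings'])::

        import torch
        from nlp.sentence_transformers.models import MLP3

        mlp = MLP3(hidden_dim=768, norm='bn', activation='relu')
        features = {'token_embeddings': torch.randn(32, 768)}  # batch of 768-d vectors
        features = mlp(features)                               # projected, still shape (32, 768)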
16 | ''' 17 | self.config_keys = ['hidden_dim', 'norm', 'activation'] 18 | self.hidden_dim = hidden_dim 19 | self.norm = norm 20 | self.activation = activation 21 | 22 | if activation == "relu": 23 | activation_layer = nn.ReLU() 24 | elif activation == "leakyrelu": 25 | activation_layer = nn.LeakyReLU() 26 | elif activation == "tanh": 27 | activation_layer = nn.Tanh() 28 | elif activation == "sigmoid": 29 | activation_layer = nn.Sigmoid() 30 | else: 31 | raise ValueError(f"Unknown activation function {activation}") 32 | 33 | if norm: 34 | if norm == 'bn': 35 | norm_layer = nn.BatchNorm1d 36 | else: 37 | norm_layer = nn.LayerNorm 38 | 39 | self.layer1 = nn.Sequential( 40 | nn.Linear(hidden_dim, hidden_dim), 41 | norm_layer(hidden_dim), 42 | nn.ReLU(inplace=True) 43 | ) 44 | self.layer2 = nn.Sequential( 45 | nn.Linear(hidden_dim, hidden_dim), 46 | norm_layer(hidden_dim), 47 | nn.ReLU(inplace=True) 48 | ) 49 | self.layer3 = nn.Sequential( 50 | nn.Linear(hidden_dim, hidden_dim), 51 | norm_layer(hidden_dim) 52 | ) 53 | else: 54 | self.layer1 = nn.Sequential( 55 | nn.Linear(hidden_dim, hidden_dim), 56 | nn.ReLU(inplace=True) 57 | ) 58 | self.layer2 = nn.Sequential( 59 | nn.Linear(hidden_dim, hidden_dim), 60 | nn.ReLU(inplace=True) 61 | ) 62 | self.layer3 = nn.Sequential( 63 | nn.Linear(hidden_dim, hidden_dim), 64 | ) 65 | 66 | self.num_layers = 3 67 | 68 | def set_layers(self, num_layers): 69 | self.num_layers = num_layers 70 | 71 | def forward(self, features: Dict[str, Tensor]): 72 | x = features["token_embeddings"] 73 | if self.num_layers == 3: 74 | x = self.layer1(x) 75 | x = self.layer2(x) 76 | x = self.layer3(x) 77 | elif self.num_layers == 2: 78 | x = self.layer1(x) 79 | x = self.layer3(x) 80 | else: 81 | raise ValueError(f"num_layers must be 2 or 3, got {self.num_layers}") 82 | features["token_embeddings"] = x 83 | return features 84 | 85 | def get_config_dict(self): 86 | return {key: self.__dict__[key] for key in self.config_keys} 87 | 88 | def save(self, output_path): 89 | with open(os.path.join(output_path, 'mlp3_config.json'), 'w') as fOut: 90 | json.dump(self.get_config_dict(), fOut, indent=2) 91 | 92 | @staticmethod 93 | def load(input_path): 94 | with open(os.path.join(input_path, 'mlp3_config.json')) as fIn: 95 | config = json.load(fIn) 96 | 97 | return MLP3(**config) -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/Normalize.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from typing import Dict 4 | import torch.nn.functional as F 5 | 6 | class Normalize(nn.Module): 7 | """ 8 | This layer normalizes embeddings to unit length 9 | """ 10 | def __init__(self): 11 | super(Normalize, self).__init__() 12 | 13 | def forward(self, features: Dict[str, Tensor]): 14 | features.update({'sentence_embedding': F.normalize(features['sentence_embedding'], p=2, dim=1)}) 15 | return features 16 | 17 | def save(self, output_path): 18 | pass 19 | 20 | @staticmethod 21 | def load(input_path): 22 | return Normalize() 23 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/RoBERTa.py: -------------------------------------------------------------------------------- 1 | from . import Transformer 2 | 3 | class RoBERTa(Transformer): 4 | """ 5 | DEPRECATED: Please use models.Transformer instead.
6 | """ 7 | pass 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/WeightedLayerPooling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | 8 | 9 | class WeightedLayerPooling(nn.Module): 10 | """ 11 | Token embeddings are weighted mean of their different hidden layer representations 12 | """ 13 | def __init__(self, word_embedding_dimension, num_hidden_layers: int = 12, layer_start: int = 4, layer_weights = None): 14 | super(WeightedLayerPooling, self).__init__() 15 | self.config_keys = ['word_embedding_dimension', 'layer_start', 'num_hidden_layers'] 16 | self.word_embedding_dimension = word_embedding_dimension 17 | self.layer_start = layer_start 18 | self.num_hidden_layers = num_hidden_layers 19 | self.layer_weights = layer_weights if layer_weights is not None else nn.Parameter(torch.tensor([1] * (num_hidden_layers+1 - layer_start), dtype=torch.float)) 20 | 21 | def forward(self, features: Dict[str, Tensor]): 22 | ft_all_layers = features['all_layer_embeddings'] 23 | 24 | all_layer_embedding = torch.stack(ft_all_layers) 25 | all_layer_embedding = all_layer_embedding[self.layer_start:, :, :, :] # Start from 4th layers output 26 | 27 | weight_factor = self.layer_weights.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).expand(all_layer_embedding.size()) 28 | weighted_average = (weight_factor*all_layer_embedding).sum(dim=0) / self.layer_weights.sum() 29 | 30 | features.update({'token_embeddings': weighted_average}) 31 | return features 32 | 33 | def get_word_embedding_dimension(self): 34 | return self.word_embedding_dimension 35 | 36 | def get_config_dict(self): 37 | return {key: self.__dict__[key] for key in self.config_keys} 38 | 39 | def save(self, output_path): 40 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 41 | json.dump(self.get_config_dict(), fOut, indent=2) 42 | 43 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 44 | 45 | 46 | @staticmethod 47 | def load(input_path): 48 | with open(os.path.join(input_path, 'config.json')) as fIn: 49 | config = json.load(fIn) 50 | 51 | model = WeightedLayerPooling(**config) 52 | model.load_state_dict(torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu'))) 53 | return model 54 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/WordWeights.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | import logging 8 | 9 | class WordWeights(nn.Module): 10 | """This model can weight word embeddings, for example, with idf-values.""" 11 | 12 | def __init__(self, vocab: List[str], word_weights: Dict[str, float], unknown_word_weight: float = 1): 13 | """ 14 | 15 | :param vocab: 16 | Vocabulary of the tokenizer 17 | :param word_weights: 18 | Mapping of tokens to a float weight value. Words embeddings are multiplied by this float value. Tokens in word_weights must not be equal to the vocab (can contain more or less values) 19 | :param unknown_word_weight: 20 | Weight for words in vocab, that do not appear in the word_weights lookup. 
These can be for example rare words in the vocab, where no weight exists. 21 | """ 22 | super(WordWeights, self).__init__() 23 | self.config_keys = ['vocab', 'word_weights', 'unknown_word_weight'] 24 | self.vocab = vocab 25 | self.word_weights = word_weights 26 | self.unknown_word_weight = unknown_word_weight 27 | 28 | weights = [] 29 | num_unknown_words = 0 30 | for word in vocab: 31 | weight = unknown_word_weight 32 | if word in word_weights: 33 | weight = word_weights[word] 34 | elif word.lower() in word_weights: 35 | weight = word_weights[word.lower()] 36 | else: 37 | num_unknown_words += 1 38 | weights.append(weight) 39 | 40 | logging.info("{} of {} words without a weighting value. Set weight to {}".format(num_unknown_words, len(vocab), unknown_word_weight)) 41 | 42 | self.emb_layer = nn.Embedding(len(vocab), 1) 43 | self.emb_layer.load_state_dict({'weight': torch.FloatTensor(weights).unsqueeze(1)}) 44 | 45 | 46 | def forward(self, features: Dict[str, Tensor]): 47 | attention_mask = features['attention_mask'] 48 | token_embeddings = features['token_embeddings'] 49 | 50 | #Compute a weight value for each token 51 | token_weights_raw = self.emb_layer(features['input_ids']).squeeze(-1) 52 | token_weights = token_weights_raw * attention_mask.float() 53 | token_weights_sum = torch.sum(token_weights, 1) 54 | 55 | #Multiply embedding by token weight value 56 | token_weights_expanded = token_weights.unsqueeze(-1).expand(token_embeddings.size()) 57 | token_embeddings = token_embeddings * token_weights_expanded 58 | 59 | features.update({'token_embeddings': token_embeddings, 'token_weights_sum': token_weights_sum}) 60 | return features 61 | 62 | def get_config_dict(self): 63 | return {key: self.__dict__[key] for key in self.config_keys} 64 | 65 | def save(self, output_path): 66 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 67 | json.dump(self.get_config_dict(), fOut, indent=2) 68 | 69 | @staticmethod 70 | def load(input_path): 71 | with open(os.path.join(input_path, 'config.json')) as fIn: 72 | config = json.load(fIn) 73 | 74 | return WordWeights(**config) -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/XLMRoBERTa.py: -------------------------------------------------------------------------------- 1 | from . import Transformer 2 | 3 | class XLMRoBERTa(Transformer): 4 | """ 5 | DEPRECATED: Please use models.Transformer instead. 6 | """ 7 | pass 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/XLNet.py: -------------------------------------------------------------------------------- 1 | from . import Transformer 2 | 3 | class XLNet(Transformer): 4 | """ 5 | DEPRECATED: Please use models.Transformer instead. 
6 | """ 7 | pass 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .Transformer import Transformer 2 | from .ALBERT import ALBERT 3 | from .BERT import BERT 4 | from .BoW import BoW 5 | from .CNN import CNN 6 | from .CamemBERT import CamemBERT 7 | from .Dense import Dense 8 | from .DistilBERT import DistilBERT 9 | from .LSTM import LSTM 10 | from .Normalize import Normalize 11 | from .Pooling import Pooling 12 | from .RoBERTa import RoBERTa 13 | from .T5 import T5 14 | from .WKPooling import WKPooling 15 | from .WeightedLayerPooling import WeightedLayerPooling 16 | from .WordEmbeddings import WordEmbeddings 17 | from .WordWeights import WordWeights 18 | from .XLMRoBERTa import XLMRoBERTa 19 | from .XLNet import XLNet 20 | from .MLP3 import MLP3 21 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/tokenizer/WhitespaceTokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Tuple, List, Iterable, Dict 2 | import collections 3 | import string 4 | import os 5 | import json 6 | from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS 7 | 8 | class WhitespaceTokenizer(WordTokenizer): 9 | """ 10 | Simple and fast white-space tokenizer. Splits sentence based on white spaces. 11 | Punctuation are stripped from tokens. 12 | """ 13 | def __init__(self, vocab: Iterable[str] = [], stop_words: Iterable[str] = ENGLISH_STOP_WORDS, do_lower_case: bool = False): 14 | self.stop_words = set(stop_words) 15 | self.do_lower_case = do_lower_case 16 | self.set_vocab(vocab) 17 | 18 | def get_vocab(self): 19 | return self.vocab 20 | 21 | def set_vocab(self, vocab: Iterable[str]): 22 | self.vocab = vocab 23 | self.word2idx = collections.OrderedDict([(word, idx) for idx, word in enumerate(vocab)]) 24 | 25 | def tokenize(self, text: str) -> List[int]: 26 | if self.do_lower_case: 27 | text = text.lower() 28 | 29 | tokens = text.split() 30 | 31 | tokens_filtered = [] 32 | for token in tokens: 33 | if token in self.stop_words: 34 | continue 35 | elif token in self.word2idx: 36 | tokens_filtered.append(self.word2idx[token]) 37 | continue 38 | 39 | token = token.strip(string.punctuation) 40 | if token in self.stop_words: 41 | continue 42 | elif len(token) > 0 and token in self.word2idx: 43 | tokens_filtered.append(self.word2idx[token]) 44 | continue 45 | 46 | token = token.lower() 47 | if token in self.stop_words: 48 | continue 49 | elif token in self.word2idx: 50 | tokens_filtered.append(self.word2idx[token]) 51 | continue 52 | 53 | return tokens_filtered 54 | 55 | def save(self, output_path: str): 56 | with open(os.path.join(output_path, 'whitespacetokenizer_config.json'), 'w') as fOut: 57 | json.dump({'vocab': list(self.word2idx.keys()), 'stop_words': list(self.stop_words), 'do_lower_case': self.do_lower_case}, fOut) 58 | 59 | @staticmethod 60 | def load(input_path: str): 61 | with open(os.path.join(input_path, 'whitespacetokenizer_config.json'), 'r') as fIn: 62 | config = json.load(fIn) 63 | 64 | return WhitespaceTokenizer(**config) 65 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | from .WordTokenizer import 
WordTokenizer, ENGLISH_STOP_WORDS 2 | from .WhitespaceTokenizer import WhitespaceTokenizer 3 | from .WhitespaceTokenizer import WhitespaceTokenizer -------------------------------------------------------------------------------- /nlp/sentence_transformers/readers/InputExample.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | 3 | 4 | class InputExample: 5 | """ 6 | Structure for one input example with texts, the label and a unique id 7 | """ 8 | def __init__(self, guid: str = '', texts: List[str] = None, texts_tokenized: List[List[int]] = None, label: Union[int, float] = 0): 9 | """ 10 | Creates one InputExample with the given texts, guid and label 11 | 12 | 13 | :param guid 14 | id for the example 15 | :param texts 16 | the texts for the example. Note, str.strip() is called on the texts 17 | :param texts_tokenized 18 | Optional: Texts that are already tokenized. If texts_tokenized is passed, texts must not be passed. 19 | :param label 20 | the label for the example 21 | """ 22 | self.guid = guid 23 | self.texts = [text.strip() for text in texts] if texts is not None else texts 24 | self.texts_tokenized = texts_tokenized 25 | self.label = label 26 | 27 | def __str__(self): 28 | return " label: {}, texts: {}".format(str(self.label), "; ".join(self.texts)) -------------------------------------------------------------------------------- /nlp/sentence_transformers/readers/LabelSentenceReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class LabelSentenceReader: 7 | """Reads in a file that has at least two columns: a label and a sentence. 8 | This reader can for example be used with the BatchHardTripletLoss. 9 | Maps labels automatically to integers""" 10 | def __init__(self, folder, label_col_idx=0, sentence_col_idx=1, separator='\t'): 11 | self.folder = folder 12 | self.label_map = {} 13 | self.label_col_idx = label_col_idx 14 | self.sentence_col_idx = sentence_col_idx 15 | self.separator = separator 16 | 17 | def get_examples(self, filename, max_examples=0): 18 | examples = [] 19 | 20 | id = 0 21 | for line in open(os.path.join(self.folder, filename), encoding="utf-8"): 22 | splits = line.strip().split(self.separator) 23 | label = splits[self.label_col_idx] 24 | sentence = splits[self.sentence_col_idx] 25 | 26 | if label not in self.label_map: 27 | self.label_map[label] = len(self.label_map) 28 | 29 | label_id = self.label_map[label] 30 | guid = "%s-%d" % (filename, id) 31 | id += 1 32 | examples.append(InputExample(guid=guid, texts=[sentence], label=label_id)) 33 | 34 | if 0 < max_examples <= id: 35 | break 36 | 37 | return examples 38 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/readers/NLIDataReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | 7 | class NLIDataReader(object): 8 | """ 9 | Reads in the Stanford NLI dataset and the MultiGenre NLI dataset 10 | """ 11 | def __init__(self, dataset_folder): 12 | self.dataset_folder = dataset_folder 13 | 14 | def get_examples(self, filename, max_examples=0): 15 | """ 16 | data_splits specified which data split to use (train, dev, test). 
17 | Expects that self.dataset_folder contains the files s1.$data_split.gz, s2.$data_split.gz, 18 | labels.$data_split.gz, e.g., for the train split, s1.train.gz, s2.train.gz, labels.train.gz 19 | """ 20 | s1 = gzip.open(os.path.join(self.dataset_folder, 's1.' + filename), 21 | mode="rt", encoding="utf-8").readlines() 22 | s2 = gzip.open(os.path.join(self.dataset_folder, 's2.' + filename), 23 | mode="rt", encoding="utf-8").readlines() 24 | labels = gzip.open(os.path.join(self.dataset_folder, 'labels.' + filename), 25 | mode="rt", encoding="utf-8").readlines() 26 | 27 | examples = [] 28 | id = 0 29 | for sentence_a, sentence_b, label in zip(s1, s2, labels): 30 | guid = "%s-%d" % (filename, id) 31 | id += 1 32 | examples.append(InputExample(guid=guid, texts=[sentence_a, sentence_b], label=self.map_label(label))) 33 | 34 | if 0 < max_examples <= len(examples): 35 | break 36 | 37 | return examples 38 | 39 | @staticmethod 40 | def get_labels(): 41 | return {"contradiction": 0, "entailment": 1, "neutral": 2} 42 | 43 | def get_num_labels(self): 44 | return len(self.get_labels()) 45 | 46 | def map_label(self, label): 47 | return self.get_labels()[label.strip().lower()] -------------------------------------------------------------------------------- /nlp/sentence_transformers/readers/PairedFilesReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | import gzip 6 | 7 | class PairedFilesReader(object): 8 | """ 9 | Reads in the a Pair Dataset, split in two files 10 | """ 11 | def __init__(self, filepaths): 12 | self.filepaths = filepaths 13 | 14 | 15 | def get_examples(self, max_examples=0): 16 | """ 17 | """ 18 | fIns = [] 19 | for filepath in self.filepaths: 20 | fIn = gzip.open(filepath, 'rt', encoding='utf-8') if filepath.endswith('.gz') else open(filepath, encoding='utf-8') 21 | fIns.append(fIn) 22 | 23 | examples = [] 24 | 25 | eof = False 26 | while not eof: 27 | texts = [] 28 | for fIn in fIns: 29 | text = fIn.readline() 30 | 31 | if text == '': 32 | eof = True 33 | break 34 | 35 | texts.append(text) 36 | 37 | if eof: 38 | break; 39 | 40 | examples.append(InputExample(guid=str(len(examples)), texts=texts, label=1)) 41 | if max_examples > 0 and len(examples) >= max_examples: 42 | break 43 | 44 | return examples -------------------------------------------------------------------------------- /nlp/sentence_transformers/readers/STSDataReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class STSDataReader: 7 | """ 8 | Reads in the STS dataset. Each line contains two sentences (s1_col_idx, s2_col_idx) and one label (score_col_idx) 9 | 10 | Default values expects a tab seperated file with the first & second column the sentence pair and third column the score (0...1). 
Default config normalizes scores from 0...5 to 0...1 11 | """ 12 | def __init__(self, dataset_folder, s1_col_idx=0, s2_col_idx=1, score_col_idx=2, delimiter="\t", 13 | quoting=csv.QUOTE_NONE, normalize_scores=True, min_score=0, max_score=5): 14 | self.dataset_folder = dataset_folder 15 | self.score_col_idx = score_col_idx 16 | self.s1_col_idx = s1_col_idx 17 | self.s2_col_idx = s2_col_idx 18 | self.delimiter = delimiter 19 | self.quoting = quoting 20 | self.normalize_scores = normalize_scores 21 | self.min_score = min_score 22 | self.max_score = max_score 23 | 24 | def get_examples(self, filename, max_examples=0): 25 | """ 26 | filename specified which data split to use (train.csv, dev.csv, test.csv). 27 | """ 28 | filepath = os.path.join(self.dataset_folder, filename) 29 | with gzip.open(filepath, 'rt', encoding='utf8') if filename.endswith('.gz') else open(filepath, encoding="utf-8") as fIn: 30 | data = csv.reader(fIn, delimiter=self.delimiter, quoting=self.quoting) 31 | examples = [] 32 | for id, row in enumerate(data): 33 | score = float(row[self.score_col_idx]) 34 | if self.normalize_scores: # Normalize to a 0...1 value 35 | score = (score - self.min_score) / (self.max_score - self.min_score) 36 | 37 | s1 = row[self.s1_col_idx] 38 | s2 = row[self.s2_col_idx] 39 | examples.append(InputExample(guid=filename+str(id), texts=[s1, s2], label=score)) 40 | 41 | if max_examples > 0 and len(examples) >= max_examples: 42 | break 43 | 44 | return examples 45 | 46 | class STSBenchmarkDataReader(STSDataReader): 47 | """ 48 | Reader especially for the STS benchmark dataset. There, the sentences are in column 5 and 6, the score is in column 4. 49 | Scores are normalized from 0...5 to 0...1 50 | """ 51 | def __init__(self, dataset_folder, s1_col_idx=5, s2_col_idx=6, score_col_idx=4, delimiter="\t", 52 | quoting=csv.QUOTE_NONE, normalize_scores=True, min_score=0, max_score=5): 53 | super().__init__(dataset_folder=dataset_folder, s1_col_idx=s1_col_idx, s2_col_idx=s2_col_idx, score_col_idx=score_col_idx, delimiter=delimiter, 54 | quoting=quoting, normalize_scores=normalize_scores, min_score=min_score, max_score=max_score) -------------------------------------------------------------------------------- /nlp/sentence_transformers/readers/TripletReader.py: -------------------------------------------------------------------------------- 1 | from . 
import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class TripletReader(object): 7 | """ 8 | Reads in the a Triplet Dataset: Each line contains (at least) 3 columns, one anchor column (s1), 9 | one positive example (s2) and one negative example (s3) 10 | """ 11 | def __init__(self, dataset_folder, s1_col_idx=0, s2_col_idx=1, s3_col_idx=2, has_header=False, delimiter="\t", 12 | quoting=csv.QUOTE_NONE): 13 | self.dataset_folder = dataset_folder 14 | self.s1_col_idx = s1_col_idx 15 | self.s2_col_idx = s2_col_idx 16 | self.s3_col_idx = s3_col_idx 17 | self.has_header = has_header 18 | self.delimiter = delimiter 19 | self.quoting = quoting 20 | 21 | def get_examples(self, filename, max_examples=0): 22 | """ 23 | 24 | """ 25 | data = csv.reader(open(os.path.join(self.dataset_folder, filename), encoding="utf-8"), delimiter=self.delimiter, 26 | quoting=self.quoting) 27 | examples = [] 28 | if self.has_header: 29 | next(data) 30 | 31 | for id, row in enumerate(data): 32 | s1 = row[self.s1_col_idx] 33 | s2 = row[self.s2_col_idx] 34 | s3 = row[self.s3_col_idx] 35 | 36 | examples.append(InputExample(texts=[s1, s2, s3])) 37 | if max_examples > 0 and len(examples) >= max_examples: 38 | break 39 | 40 | return examples -------------------------------------------------------------------------------- /nlp/sentence_transformers/readers/__init__.py: -------------------------------------------------------------------------------- 1 | from .InputExample import InputExample 2 | from .LabelSentenceReader import LabelSentenceReader 3 | from .NLIDataReader import NLIDataReader 4 | from .STSDataReader import STSDataReader, STSBenchmarkDataReader 5 | from .TripletReader import TripletReader -------------------------------------------------------------------------------- /nlp/tools/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: __init__ 6 | Author: czh 7 | Create Date: 2021/8/12 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | -------------------------------------------------------------------------------- /nlp/tools/accelerate_tracker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: 6 | @date: 2022/9/21 14:20 7 | """ 8 | import wandb 9 | from accelerate.tracking import GeneralTracker 10 | from accelerate.logging import get_logger 11 | from typing import Optional 12 | 13 | logger = get_logger(__name__) 14 | 15 | 16 | class CustomWandbTracker(GeneralTracker): 17 | name = "wandb" 18 | requires_logging_directory = False 19 | 20 | def __init__(self, run_name: str, **kwargs): 21 | self.run_name = run_name 22 | 23 | self.run = wandb.init(name=self.run_name, **kwargs) 24 | logger.info(f"Initialized WandB project {self.run_name}") 25 | logger.info( 26 | "Make sure to log any initial configurations with `self.store_init_configuration` before training!" 27 | ) 28 | 29 | @property 30 | def tracker(self): 31 | return self.run.run 32 | 33 | def store_init_configuration(self, values: dict): 34 | """ 35 | Logs `values` as hyperparameters for the run. Should be run at the beginning of your experiment. 36 | 37 | Args: 38 | values (Dictionary `str` to `bool`, `str`, `float` or `int`): 39 | Values to be stored as initial hyperparameters as key-value pairs. 
The values need to have type `bool`, 40 | `str`, `float`, `int`, or `None`. 41 | """ 42 | wandb.config.update(values) 43 | logger.info("Stored initial configuration hyperparameters to WandB") 44 | 45 | def log(self, values: dict, step: Optional[int], **kwargs): 46 | """ 47 | Logs `values` to the current run. 48 | 49 | Args: 50 | values (Dictionary `str` to `str`, `float`, `int` or `dict` of `str` to `float`/`int`): 51 | Values to be logged as key-value pairs. The values need to have type `str`, `float`, `int` or `dict` of 52 | `str` to `float`/`int`. 53 | step (`int`, *optional*): 54 | The run step. If included, the log will be affiliated with this step. 55 | """ 56 | wandb.log(values, step=step, **kwargs) 57 | 58 | def finish(self): 59 | """ 60 | Closes `wandb` writer 61 | """ 62 | self.run.finish() 63 | logger.info("WandB run closed") 64 | -------------------------------------------------------------------------------- /nlp/tools/convert_nezha_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: convert_nezha_original_tf_checkpoint_to_pytorch 6 | Author: czh 7 | Create Date: 2021/8/18 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | # Convert ALBERT checkpoint. 13 | import argparse 14 | import logging 15 | import torch 16 | from nlp.models.nezha import NeZhaConfig, NeZhaForPreTraining, load_tf_weights_in_nezha 17 | 18 | logging.basicConfig(level=logging.INFO) 19 | 20 | 21 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, nezha_config_file, pytorch_dump_path): 22 | # Initialise PyTorch model 23 | config = NeZhaConfig.from_json_file(nezha_config_file) 24 | print("Building PyTorch model from configuration: {}".format(str(config))) 25 | model = NeZhaForPreTraining(config) 26 | # Load weights from tf checkpoint 27 | load_tf_weights_in_nezha(model, tf_checkpoint_path) 28 | # Save pytorch-model 29 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 30 | state_dict = {k: v for k, v in model.state_dict().items() if 'relative_positions' not in k} 31 | torch.save(state_dict, pytorch_dump_path) 32 | 33 | 34 | if __name__ == "__main__": 35 | parser = argparse.ArgumentParser() 36 | # Required parameters 37 | parser.add_argument( 38 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 39 | ) 40 | parser.add_argument( 41 | "--nezha_config_file", 42 | default=None, 43 | type=str, 44 | required=True, 45 | help="The config json file corresponding to the pre-trained ALBERT model. \n" 46 | "This specifies the model architecture.", 47 | ) 48 | parser.add_argument( 49 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 
50 | ) 51 | args = parser.parse_args() 52 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.nezha_config_file, args.pytorch_dump_path) 53 | 54 | 55 | ''' 56 | python convert_nezha_original_tf_checkpoint_to_pytorch.py \ 57 | --tf_checkpoint_path=./pretrained_models/nezha-large-www \ 58 | --nezha_config_file=./pretrained_models/nezha-large-www/config.json \ 59 | --pytorch_dump_path=./pretrained_models/nezha-large-www/pytorch_model.bin 60 | ''' 61 | -------------------------------------------------------------------------------- /nlp/tools/convert_tf_to_pytorch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: 6 | @date: 2022/11/11 15:12 7 | """ 8 | from transformers.models.bert.convert_bert_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch 9 | 10 | # chinese_wobert_plus 11 | path = "/Users/chenzhihao/Downloads/chinese_wobert_plus_L-12_H-768_A-12" 12 | tf_checkpoint_path = path + "/bert_model.ckpt" 13 | bert_config_file = path + "/bert_config.json" 14 | pytorch_dump_path = "/Users/chenzhihao/Downloads/chinese_wobert_plus/pytorch_model.bin" 15 | 16 | convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, 17 | pytorch_dump_path) 18 | 19 | # chinese_wobert 20 | path = "/Users/chenzhihao/Downloads/chinese_wobert_L-12_H-768_A-12" 21 | tf_checkpoint_path = path + "/bert_model.ckpt" 22 | bert_config_file = path + "/bert_config.json" 23 | pytorch_dump_path = "/Users/chenzhihao/Downloads/chinese_wobert_base/pytorch_model.bin" 24 | 25 | convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, 26 | pytorch_dump_path) 27 | -------------------------------------------------------------------------------- /nlp/tools/dataloader.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: dataloader 6 | Author: czh 7 | Create Date: 2021/9/30 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | from torch.utils.data.dataloader import _SingleProcessDataLoaderIter, _MultiProcessingDataLoaderIter 13 | import random 14 | from torch.utils.data import Dataset, DataLoader 15 | from itertools import chain 16 | 17 | 18 | class BlockShuffleDataLoader(DataLoader): 19 | def __init__(self, dataset: Dataset, sort_key, sort_bs_num=None, is_shuffle=True, **kwargs): 20 | """ 21 | 初始化函数,继承DataLoader类 22 | Args: 23 | dataset: Dataset类的实例,其中中必须包含dataset变量,并且该变量为一个list 24 | sort_key: 排序函数,即使用dataset元素中哪一个变量的长度进行排序 25 | sort_bs_num: 排序范围,即在多少个batch_size大小内进行排序,默认为None,表示对整个序列排序 26 | is_shuffle: 是否对分块后的内容,进行随机打乱,默认为True 27 | **kwargs: 28 | """ 29 | assert isinstance(dataset.data_set, list), "dataset为Dataset类的实例,其中中必须包含dataset变量,并且该变量为一个list" 30 | super().__init__(dataset, **kwargs) 31 | self.sort_bs_num = sort_bs_num 32 | self.sort_key = sort_key 33 | self.is_shuffle = is_shuffle 34 | 35 | def __iter__(self): 36 | self.dataset.data_set = self.block_shuffle(self.dataset.data_set, self.batch_size, self.sort_bs_num, 37 | self.sort_key, self.is_shuffle) 38 | if self.num_workers == 0: 39 | return _SingleProcessDataLoaderIter(self) 40 | else: 41 | return _MultiProcessingDataLoaderIter(self) 42 | 43 | @staticmethod 44 | def block_shuffle(data, batch_size, sort_bs_num, sort_key, is_shuffle): 45 | random.shuffle(data) 46 | # 
将数据按照batch_size大小进行切分 47 | tail_data = [] if len(data) % batch_size == 0 else data[-len(data) % batch_size:] 48 | data = data[:len(data) - len(tail_data)] 49 | assert len(data) % batch_size == 0 50 | # 获取真实排序范围 51 | sort_bs_num = len(data) // batch_size if sort_bs_num is None else sort_bs_num 52 | # 按照排序范围进行数据划分 53 | data = [data[i:i + sort_bs_num * batch_size] for i in range(0, len(data), sort_bs_num * batch_size)] 54 | # 在排序范围,根据排序函数进行降序排列 55 | data = [sorted(i, key=sort_key, reverse=True) for i in data] 56 | # 将数据根据batch_size获取batch_data 57 | data = list(chain(*data)) 58 | data = [data[i:i + batch_size] for i in range(0, len(data), batch_size)] 59 | # 判断是否需要对batch_data序列进行打乱 60 | if is_shuffle: 61 | random.shuffle(data) 62 | # 将tail_data填补回去 63 | data = list(chain(*data)) + tail_data 64 | return data 65 | -------------------------------------------------------------------------------- /nlp/tools/plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.metrics import confusion_matrix 4 | plt.switch_backend('agg') 5 | 6 | 7 | def plot_confusion_matrix(y_true, y_pred, classes, 8 | save_path, normalize=False, title=None, 9 | cmap=plt.cm.Blues): 10 | """ 11 | This function prints and plots the confusion matrix. 12 | Normalization can be applied by setting `normalize=True`. 13 | """ 14 | if not title: 15 | if normalize: 16 | title = 'Normalized confusion matrix' 17 | else: 18 | title = 'Confusion matrix, without normalization' 19 | # Compute confusion matrix 20 | cm = confusion_matrix(y_true=y_true, y_pred=y_pred) 21 | # Only use the labels that appear in the data 22 | if normalize: 23 | cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] 24 | print("Normalized confusion matrix") 25 | else: 26 | print('Confusion matrix, without normalization') 27 | # --- plot--- # 28 | plt.rcParams['savefig.dpi'] = 200 29 | plt.rcParams['figure.dpi'] = 200 30 | plt.rcParams['figure.figsize'] = [20, 20] # plot 31 | plt.rcParams.update({'font.size': 10}) 32 | fig, ax = plt.subplots() 33 | im = ax.imshow(cm, interpolation='nearest', cmap=cmap) 34 | # --- bar --- # 35 | from mpl_toolkits.axes_grid1 import make_axes_locatable 36 | divider = make_axes_locatable(ax) 37 | cax = divider.append_axes("right", size="5%", pad=0.05) 38 | plt.colorbar(im, cax=cax) 39 | # --- bar --- # 40 | # ax.figure.colorbar(im, ax=ax) 41 | # We want to show all ticks... 42 | ax.set(xticks=np.arange(cm.shape[1]), 43 | yticks=np.arange(cm.shape[0]), 44 | # ... and label them with the respective list entries 45 | xticklabels=classes, yticklabels=classes, 46 | title=title, 47 | ylabel='True label', 48 | xlabel='Predicted label') 49 | 50 | # Rotate the tick labels and set their alignment. 51 | plt.setp(ax.get_xticklabels(), rotation=45, ha="right", 52 | rotation_mode="anchor") 53 | # Loop over data dimensions and create text annotations. 54 | fmt = '.2f' if normalize else 'd' 55 | thresh = cm.max() / 2. 
56 | for i in range(cm.shape[0]): 57 | for j in range(cm.shape[1]): 58 | ax.text(j, i, format(cm[i, j], fmt), 59 | ha="center", va="center", 60 | color="white" if cm[i, j] > thresh else "black") 61 | fig.tight_layout() 62 | plt.savefig(save_path) 63 | 64 | 65 | # if __name__ == "__main__": 66 | # y_true = ['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'O'] 67 | # y_pred = ['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O','B-PER', 'I-PER', 'O'] 68 | # classes = ['O','B-MISC', 'I-MISC','B-PER', 'I-PER'] 69 | # save_path = './ner_confusion_matrix.png' 70 | # plot_confusion_matrix(y_true,y_pred,classes,save_path) 71 | -------------------------------------------------------------------------------- /nlp/trainers/ChildTuningF.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: ChildTuningF 6 | Author: czh 7 | Create Date: 2021/11/11 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | # https://github.com/alibaba/AliceMind/tree/main/ChildTuning 13 | from transformers import Trainer 14 | from transformers.optimization import get_scheduler 15 | 16 | from nlp.callback.optimizers.child_tuning_optimizer import ChildTuningAdamW 17 | 18 | 19 | class ChildTuningFTrainer(Trainer): 20 | def __init__(self, **kwargs): 21 | self.reserve_p = kwargs.pop('reserve_p') 22 | self.mode = kwargs.pop('mode') 23 | super().__init__(**kwargs) 24 | 25 | def create_optimizer_and_scheduler(self, num_training_steps: int): 26 | """ 27 | Setup the optimizer and the learning rate scheduler. 28 | 29 | We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the 30 | Trainer's init through :obj:`optimizers`, or subclass and override this method in a subclass. 
31 | """ 32 | if self.optimizer is None: 33 | no_decay = ["bias", "LayerNorm.weight"] 34 | optimizer_grouped_parameters = [ 35 | { 36 | "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)], 37 | "weight_decay": self.args.weight_decay, 38 | }, 39 | { 40 | "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], 41 | "weight_decay": 0.0, 42 | }, 43 | ] 44 | optimizer_cls = ChildTuningAdamW 45 | optimizer_kwargs = {"betas": (self.args.adam_beta1, self.args.adam_beta2), "eps": self.args.adam_epsilon, 46 | "lr": self.args.learning_rate} 47 | self.optimizer = optimizer_cls(optimizer_grouped_parameters, reserve_p=self.reserve_p, # noqa 48 | mode=self.mode, **optimizer_kwargs) 49 | 50 | if self.lr_scheduler is None: 51 | self.lr_scheduler = get_scheduler( # noqa 52 | self.args.lr_scheduler_type, 53 | self.optimizer, 54 | num_warmup_steps=self.args.warmup_steps, 55 | num_training_steps=num_training_steps, 56 | ) 57 | -------------------------------------------------------------------------------- /nlp/trainers/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: __init__.py 6 | Author: czh 7 | Create Date: 2021/11/11 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | -------------------------------------------------------------------------------- /nlp/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: __init__ 6 | Author: czh 7 | Create Date: 2021/8/12 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | -------------------------------------------------------------------------------- /nlp/utils/bert_or_thesues_repalcement_scheduler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: bert_or_thesues_repalcement_scheduler 6 | Author: czh 7 | Create Date: 2021/8/12 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | # bert_of_thesues 的replacement scheduler 13 | from nlp.models.distill_model import BertThesuesEncoder 14 | 15 | 16 | class ConstantReplacementScheduler: 17 | def __init__(self, bert_encoder: BertThesuesEncoder, replacing_rate, replacing_steps=None): 18 | self.bert_encoder = bert_encoder 19 | self.replacing_rate = replacing_rate 20 | self.replacing_steps = replacing_steps 21 | self.step_counter = 0 22 | self.bert_encoder.set_replacing_rate(replacing_rate) 23 | 24 | def step(self): 25 | self.step_counter += 1 26 | if self.replacing_steps is None or self.replacing_rate == 1.0: 27 | return self.replacing_rate 28 | else: 29 | if self.step_counter >= self.replacing_steps: 30 | self.bert_encoder.set_replacing_rate(1.0) 31 | self.replacing_rate = 1.0 32 | return self.replacing_rate 33 | 34 | 35 | class LinearReplacementScheduler: 36 | def __init__(self, bert_encoder: BertThesuesEncoder, base_replacing_rate, k): 37 | self.bert_encoder = bert_encoder 38 | self.base_replacing_rate = base_replacing_rate 39 | self.step_counter = 0 40 | self.k = k 41 | 
self.bert_encoder.set_replacing_rate(base_replacing_rate) 42 | 43 | def step(self): 44 | self.step_counter += 1 45 | current_replacing_rate = min(self.k * self.step_counter + self.base_replacing_rate, 1.0) 46 | self.bert_encoder.set_replacing_rate(current_replacing_rate) 47 | return current_replacing_rate 48 | -------------------------------------------------------------------------------- /nlp/utils/enums.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 _*- 3 | from enum import Enum 4 | 5 | 6 | class CaseNotSensitiveEnum(Enum): 7 | """大小写不敏感""" 8 | 9 | @classmethod 10 | def _missing_(cls, name): 11 | for member in cls: 12 | if member.name.lower() == name.lower(): 13 | return member 14 | 15 | @classmethod 16 | def choices(cls): 17 | return [k.value for k in list(cls)] 18 | 19 | 20 | class RunMode(Enum): 21 | """ 22 | 因为crf的计算loss和infer的逻辑是分开的,为了保证有些操作能够以更优的方式进行: 23 | 1. 训练时,只关注loss,train模式即可 24 | 2. validation阶段需要打出val_loss和预测的metrics,所以会俩部分都需要。采用eval模式 25 | 3. 模型训练好使用时,其实只需要预测,不需要loss计算,采用infer 26 | """ 27 | TRAIN = "train" 28 | INFER = "infer" 29 | EVAL = "eval" 30 | 31 | 32 | class DataType(Enum): 33 | TRAIN = "train" 34 | EVAL = "dev" 35 | TEST = "test" 36 | 37 | 38 | class OptimizerEnum(CaseNotSensitiveEnum): 39 | AdamW = "AdamW" 40 | LAMB = "LAMB" 41 | Adafactor = "Adafactor" 42 | Adam = "Adam" 43 | 44 | 45 | class FP16OptLevel(CaseNotSensitiveEnum): 46 | O1 = "O1" 47 | O2 = "O2" 48 | O3 = "O3" 49 | O4 = "O4" 50 | 51 | 52 | class MatcherType(CaseNotSensitiveEnum): 53 | AVG = "avg" 54 | MIN = "min" 55 | -------------------------------------------------------------------------------- /nlp/utils/errors.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: errors 6 | Author: czh 7 | Create Date: 2022/2/9 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | 13 | 14 | class ParseSpanError(Exception): 15 | pass 16 | 17 | 18 | class ParseEntityOffsetMappingError(ParseSpanError): 19 | pass 20 | 21 | 22 | class EntityNumNotMatchError(ParseSpanError): 23 | pass 24 | -------------------------------------------------------------------------------- /nlp/utils/factory.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 _*- 3 | 4 | from dataclasses import asdict 5 | from functools import partial 6 | from typing import Callable, List, Optional 7 | 8 | from pydantic import dataclasses 9 | 10 | 11 | @dataclasses.dataclass 12 | class BaseClass: 13 | 14 | def as_dict(self): 15 | return asdict(self) 16 | 17 | 18 | @dataclasses.dataclass 19 | class GoldEntity(BaseClass): 20 | start_index: int 21 | end_index: int 22 | 23 | 24 | @dataclasses.dataclass 25 | class PredEntity(GoldEntity): 26 | start_prob: float 27 | end_prob: float 28 | 29 | 30 | @dataclasses.dataclass 31 | class PredRelation(BaseClass): 32 | rel: int 33 | rel_prob: float 34 | 35 | 36 | @dataclasses.dataclass 37 | class PredTuple(BaseClass): 38 | rel: int 39 | rel_prob: float 40 | ents: List[Optional[PredEntity]] 41 | 42 | 43 | @dataclasses.dataclass 44 | class GoldTuple(BaseClass): 45 | rel: int 46 | ents: List[GoldEntity] 47 | 48 | 49 | class PartialWrapper: 50 | """partial的类别封装,主要是将kwarg作为class的属性来处理""" 51 | 52 | def __init__(self, func: Callable, *args, **kwargs): 
53 | assert isinstance(func, Callable) 54 | self.func = partial(func, *args, **kwargs) 55 | for k, v in kwargs.items(): 56 | self.__setattr__(k, v) 57 | 58 | def __call__(self, *args, **kwargs): 59 | return self.func(*args, **kwargs) 60 | 61 | 62 | class PydanticConfig: 63 | arbitrary_types_allowed = True 64 | -------------------------------------------------------------------------------- /nlp/utils/generate_util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: zhihao.chen@kuwo.cn 6 | @date: 2022/7/29 15:44 7 | """ 8 | import torch 9 | from torch.nn import functional as nnf 10 | 11 | 12 | def top_filtering(logits, top_k=0, top_p=0.0, threshold=-float('Inf'), filter_value=-float('Inf')): 13 | """ Filter a distribution of logits using top-k, top-p (nucleus) and/or threshold filtering 14 | Args: 15 | :param logits: logits distribution shape (vocabulary size) 16 | :param top_k: <=0: no filtering, >0: keep only top k tokens with highest probability. 17 | :param top_p: <=0.0: no filtering, >0.0: keep only a subset S of candidates, where S is the smallest subset 18 | whose total probability mass is greater than or equal to the threshold top_p. 19 | In practice, we select the highest probability tokens whose cumulative probability mass exceeds 20 | the threshold top_p. 21 | :param threshold: a minimal threshold to keep logits 22 | :param filter_value: 23 | """ 24 | assert logits.dim() == 1 # Only work for batch size 1 for now - could update but it would obfuscate a bit the code 25 | top_k = min(top_k, logits.size(-1)) 26 | if top_k > 0: 27 | # Remove all tokens with a probability less than the last token in the top-k tokens 28 | indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] 29 | logits[indices_to_remove] = filter_value 30 | 31 | if top_p > 0.0: 32 | # Compute cumulative probabilities of sorted tokens 33 | sorted_logits, sorted_indices = torch.sort(logits, descending=True) 34 | cumulative_probabilities = torch.cumsum(nnf.softmax(sorted_logits, dim=-1), dim=-1) 35 | 36 | # Remove tokens with cumulative probability above the threshold 37 | sorted_indices_to_remove = cumulative_probabilities > top_p 38 | # Shift the indices to the right to keep also the first token above the threshold 39 | sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() 40 | sorted_indices_to_remove[..., 0] = 0 41 | 42 | # Back to unsorted indices and set them to -infinity 43 | indices_to_remove = sorted_indices[sorted_indices_to_remove] 44 | logits[indices_to_remove] = filter_value 45 | 46 | indices_to_remove = logits < threshold 47 | logits[indices_to_remove] = filter_value 48 | 49 | return logits 50 | -------------------------------------------------------------------------------- /nlp/utils/selection_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: zhihao.chen@kuwo.cn 6 | @date: 2022/9/21 18:03 7 | """ 8 | # 用于存储和加载training dynamic 9 | # 参考自:https://github.com/allenai/cartography/blob/main/cartography/selection/selection_utils.py 10 | import json 11 | import logging 12 | import os 13 | import pandas as pd 14 | import tqdm 15 | 16 | from typing import List 17 | 18 | logging.basicConfig( 19 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", level=logging.INFO 20 | ) 21 | logger = 
logging.getLogger(__name__) 22 | 23 | 24 | def log_training_dynamics(output_dir: os.path, 25 | epoch: int, 26 | train_ids: List[int], 27 | train_logits: List[List[float]], 28 | train_golds: List[int]): 29 | """ 30 | Save training dynamics (logits) from given epoch as records of a `.jsonl` file. 31 | """ 32 | td_df = pd.DataFrame({"guid": train_ids, 33 | f"logits_epoch_{epoch}": train_logits, 34 | "gold": train_golds}) 35 | 36 | logging_dir = os.path.join(output_dir, f"training_dynamics") 37 | # Create directory for logging training dynamics, if it doesn't already exist. 38 | if not os.path.exists(logging_dir): 39 | os.makedirs(logging_dir) 40 | epoch_file_name = os.path.join(logging_dir, f"dynamics_epoch_{epoch}.json") 41 | td_df.to_json(epoch_file_name, lines=True, orient="records") 42 | logger.info(f"Training Dynamics logged to {epoch_file_name}") 43 | 44 | 45 | def read_training_dynamics(model_dir: os.path, 46 | strip_last: bool = False, 47 | id_field: str = "guid", 48 | burn_out: int = None): 49 | """ 50 | Given path to logged training dynamics, merge stats across epochs. 51 | Returns: 52 | - Dict between ID of a train instances and its gold label, and the list of logits across epochs. 53 | """ 54 | train_dynamics = {} 55 | 56 | td_dir = os.path.join(model_dir, "training_dynamics") 57 | num_epochs = len([f for f in os.listdir(td_dir) if os.path.isfile(os.path.join(td_dir, f))]) 58 | if burn_out: 59 | num_epochs = burn_out 60 | 61 | logger.info(f"Reading {num_epochs} files from {td_dir} ...") 62 | for epoch_num in tqdm.tqdm(range(num_epochs)): 63 | epoch_file = os.path.join(td_dir, f"dynamics_epoch_{epoch_num}.json") 64 | assert os.path.exists(epoch_file) 65 | 66 | with open(epoch_file, "r", encoding='utf8') as infile: 67 | for line in infile: 68 | line = line.strip() 69 | if not line: 70 | continue 71 | record = json.loads(line.strip()) 72 | guid = record[id_field] if not strip_last else record[id_field][:-1] 73 | if guid not in train_dynamics: 74 | assert epoch_num == 0 75 | train_dynamics[guid] = {"gold": record["gold"], "logits": []} 76 | train_dynamics[guid]["logits"].append(record[f"logits_epoch_{epoch_num}"]) 77 | 78 | logger.info(f"Read training dynamics for {len(train_dynamics)} train instances.") 79 | return train_dynamics 80 | -------------------------------------------------------------------------------- /nlp/utils/util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: 6 | @date: 2022/11/3 17:39 7 | """ 8 | import torch.nn as nn 9 | 10 | 11 | # Code widely inspired from: 12 | # https://github.com/allenai/allennlp/blob/master/allennlp/nn/util.py. 13 | def masked_softmax(tensor, mask): 14 | """ 15 | Apply a masked softmax on the last dimension of a tensor. 16 | The input tensor and mask should be of size (batch, *, sequence_length). 17 | Args: 18 | tensor: The tensor on which the softmax function must be applied along 19 | the last dimension. 20 | mask: A mask of the same size as the tensor with 0s in the positions of 21 | the values that must be masked and 1s everywhere else. 22 | Returns: 23 | A tensor of the same size as the inputs containing the result of the 24 | softmax. 25 | """ 26 | tensor_shape = tensor.size() 27 | reshaped_tensor = tensor.view(-1, tensor_shape[-1]) 28 | 29 | # Reshape the mask so it matches the size of the input tensor. 
30 | while mask.dim() < tensor.dim(): 31 | mask = mask.unsqueeze(1) 32 | mask = mask.expand_as(tensor).contiguous().float() 33 | reshaped_mask = mask.view(-1, mask.size()[-1]) 34 | 35 | result = nn.functional.softmax(reshaped_tensor * reshaped_mask, dim=-1) 36 | result = result * reshaped_mask 37 | # 1e-13 is added to avoid divisions by zero. 38 | result = result / (result.sum(dim=-1, keepdim=True) + 1e-13) 39 | 40 | return result.view(*tensor_shape) 41 | 42 | 43 | # Code widely inspired from: 44 | # https://github.com/allenai/allennlp/blob/master/allennlp/nn/util.py. 45 | def weighted_sum(tensor, weights, mask): 46 | """ 47 | Apply a weighted sum on the vectors along the last dimension of 'tensor', 48 | and mask the vectors in the result with 'mask'. 49 | Args: 50 | tensor: A tensor of vectors on which a weighted sum must be applied. 51 | weights: The weights to use in the weighted sum. 52 | mask: A mask to apply on the result of the weighted sum. 53 | Returns: 54 | A new tensor containing the result of the weighted sum after the mask 55 | has been applied on it. 56 | """ 57 | weight_sum = weights.bmm(tensor) 58 | 59 | while mask.dim() < weight_sum.dim(): 60 | mask = mask.unsqueeze(1) 61 | mask = mask.transpose(-1, -2) 62 | mask = mask.expand_as(weight_sum).contiguous().float() 63 | 64 | return weight_sum * mask 65 | -------------------------------------------------------------------------------- /nlp/utils/vat_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: 6 | @date: 2022/10/19 16:04 7 | """ 8 | # https://github.com/amazon-research/sentence-representations/blob/main/VaSCL/learners/vat_utils.py 9 | import contextlib 10 | import torch 11 | import torch.nn as nn 12 | 13 | 14 | @contextlib.contextmanager 15 | def _disable_tracking_bn_stats(model): 16 | def switch_attr(m): 17 | if hasattr(m, 'track_running_stats'): 18 | m.track_running_stats ^= True 19 | 20 | model.apply(switch_attr) 21 | yield 22 | model.apply(switch_attr) 23 | 24 | 25 | def _l2_normalize(d, attention_mask=None): 26 | if attention_mask is not None: 27 | attention_mask = attention_mask.unsqueeze(-1) 28 | d *= attention_mask 29 | d_reshaped = d.view(d.shape[0], -1, *(1 for _ in range(d.dim() - 2))) 30 | d /= torch.norm(d_reshaped, dim=1, keepdim=True) + 1e-8 31 | # print("_l2_normalize, BEFORE:{} \t AFTER:{}".format(d.size(), d_reshaped.size())) 32 | return d 33 | 34 | 35 | def _emb_norm(emb): 36 | e_reshaped = emb.view(emb.shape[0], -1, *(1 for _ in range(emb.dim() - 2))) 37 | enorm = torch.norm(e_reshaped, dim=1, keepdim=False) + 1e-8 38 | # print("BEFORE:{} \t AFTER:{}".format(emb.size(), e_reshaped.size())) 39 | # print("enorm:{}, {}".format(enorm.size(), enorm[:10])) 40 | return enorm 41 | 42 | 43 | class VaSCLPturb(nn.Module): 44 | def __init__(self, xi=0.1, eps=1, ip=1, uni_criterion=None, bi_criterion=None): 45 | """VaSCL_Pturb on Transformer embeddings 46 | :param xi: hyperparameter of VaSCL_Pturb (default: 10.0) 47 | :param eps: hyperparameter of VaSCL_Pturb (default: 1.0) 48 | :param ip: iteration times of computing adv noise (default: 1) 49 | """ 50 | super(VaSCLPturb, self).__init__() 51 | self.xi = xi 52 | self.eps = eps 53 | self.ip = ip 54 | self.delta = 1e-08 55 | 56 | self.uni_criterion = uni_criterion 57 | self.bi_criterion = bi_criterion 58 | print("\n VaSCL_Pturb on embeddings, xi:{}, eps:{} \n".format(xi, eps)) 59 | 60 | def forward(self, model, inputs, hard_indices): 61 
| # print(inputs.size(), "\n", _emb_norm(inputs)[:5]) 62 | with torch.no_grad(): 63 | cnst = model.contrast_logits(inputs) 64 | 65 | # prepare random unit tensor 66 | d = torch.rand(inputs.shape).sub(0.5).to(inputs.device) 67 | d = _l2_normalize(d) 68 | 69 | with _disable_tracking_bn_stats(model): 70 | # calc adversarial direction 71 | for _ in range(self.ip): 72 | d.requires_grad_() 73 | cnst_hat = model.contrast_logits(inputs + self.xi * d) 74 | 75 | adv_cnst = self.uni_criterion(cnst, cnst_hat, hard_indices) 76 | adv_distance = adv_cnst['lds_loss'] 77 | 78 | adv_distance.backward(retain_graph=True) 79 | d = _l2_normalize(d.grad) 80 | model.zero_grad() 81 | 82 | cnst_hat = model.contrast_logits(inputs + self.eps * d) 83 | adv_cnst = self.bi_criterion(cnst, cnst_hat, hard_indices) 84 | return adv_cnst 85 | -------------------------------------------------------------------------------- /nlp/utils/whitening_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: zhihao.chen@kuwo.cn 6 | @date: 2022/6/30 10:56 7 | """ 8 | import pickle 9 | import numpy as np 10 | 11 | 12 | def get_embedding(output, pooling_strategy='cls'): 13 | hidden_states = output.hidden_states 14 | if pooling_strategy == 'cls': 15 | output_hidden_state = output.last_hidden_state[:, 0, :] 16 | elif pooling_strategy == 'last_avg': 17 | output_hidden_state = output.last_hidden_state.mean(dim=1) 18 | elif pooling_strategy == 'first_last_avg': 19 | output_hidden_state = hidden_states[-1] + hidden_states[1] 20 | output_hidden_state = output_hidden_state.mean(dim=1) 21 | elif pooling_strategy == 'last2avg': 22 | output_hidden_state = hidden_states[-1] + hidden_states[-2] 23 | output_hidden_state = output_hidden_state.mean(dim=1) 24 | else: 25 | raise ValueError("'pooling_strategy' must one of [fist-last-avg, last-avg, last2avg, cls]") 26 | vec = output_hidden_state.cpu().numpy()[0] 27 | return vec 28 | 29 | 30 | def compute_kernel_bias(vecs): 31 | """计算kernel和bias 32 | 最后的变换:y = (x + bias).dot(kernel) 33 | """ 34 | vecs = np.concatenate(vecs, axis=0) 35 | mu = vecs.mean(axis=0, keepdims=True) 36 | cov = np.cov(vecs.T) 37 | u, s, vh = np.linalg.svd(cov) 38 | w = np.dot(u, np.diag(s ** 0.5)) 39 | w = np.linalg.inv(w.T) 40 | return w, -mu 41 | 42 | 43 | def normalize(vecs): 44 | """标准化 45 | """ 46 | return vecs / (vecs ** 2).sum(axis=1, keepdims=True) ** 0.5 47 | 48 | 49 | def transform_and_normalize(vecs, kernel, bias, dim): 50 | """应用变换,然后标准化 51 | """ 52 | if not (kernel is None or bias is None): 53 | vecs = (vecs + bias).dot(kernel[:, :dim]) 54 | return normalize(vecs) 55 | 56 | 57 | def save_whiten(weight_save_path, kernel, bias): 58 | whiten = { 59 | 'kernel': kernel, 60 | 'bias': bias 61 | } 62 | with open(weight_save_path, 'wb') as f: 63 | pickle.dump(whiten, f) 64 | 65 | 66 | def load_whiten(weight_save_path): 67 | with open(weight_save_path, 'rb') as f: 68 | whiten = pickle.load(f) 69 | kernel = whiten['kernel'] 70 | bias = whiten['bias'] 71 | return kernel, bias 72 | -------------------------------------------------------------------------------- /nlp/utils/wobert_tokenization.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: 6 | @date: 2022/11/11 15:07 7 | """ 8 | import jieba 9 | from transformers import BasicTokenizer, BertTokenizer 10 | 11 | 12 | class 
CustomBasicTokenizer(BasicTokenizer): 13 | def __init__(self, 14 | vocab, 15 | do_lower_case=True, 16 | never_split=None, 17 | tokenize_chinese_chars=True, 18 | strip_accents=None): 19 | super().__init__(do_lower_case=do_lower_case, 20 | never_split=never_split, 21 | tokenize_chinese_chars=tokenize_chinese_chars, 22 | strip_accents=strip_accents) 23 | 24 | self.vocab = vocab 25 | 26 | def _tokenize_chinese_chars(self, text): 27 | output = [] 28 | ''' 29 | 1、输入一个句子s,用pre_tokenize先分一次词,得到[w1,w2,…,wl]; 30 | 2、遍历各个wi,如果wi在词表中则保留,否则将wi用BERT自带的tokenize函数再分一次; 31 | 3、将每个wi的tokenize结果有序拼接起来,作为最后的tokenize结果。 32 | ''' 33 | for wholeword in jieba.cut(text, HMM=False): 34 | if wholeword in self.vocab: 35 | output.append(" ") 36 | output.append(wholeword) 37 | output.append(" ") 38 | else: 39 | for char in wholeword: 40 | cp = ord(char) 41 | if self._is_chinese_char(cp): 42 | output.append(" ") 43 | output.append(char) 44 | output.append(" ") 45 | else: 46 | output.append(char) 47 | return "".join(output) 48 | 49 | 50 | class WoBertTokenizer(BertTokenizer): 51 | def __init__(self, 52 | vocab_file, 53 | do_lower_case=True, 54 | do_basic_tokenize=True, 55 | never_split=None, 56 | unk_token="[UNK]", 57 | sep_token="[SEP]", 58 | pad_token="[PAD]", 59 | cls_token="[CLS]", 60 | mask_token="[MASK]", 61 | tokenize_chinese_chars=True, 62 | strip_accents=None, 63 | **kwargs): 64 | super().__init__(vocab_file, 65 | do_lower_case=do_lower_case, 66 | do_basic_tokenize=do_basic_tokenize, 67 | never_split=never_split, 68 | unk_token=unk_token, 69 | sep_token=sep_token, 70 | pad_token=pad_token, 71 | cls_token=cls_token, 72 | mask_token=mask_token, 73 | tokenize_chinese_chars=tokenize_chinese_chars, 74 | strip_accents=strip_accents, 75 | **kwargs) 76 | if self.do_basic_tokenize: 77 | self.basic_tokenizer = CustomBasicTokenizer( 78 | vocab=self.vocab, 79 | do_lower_case=do_lower_case, 80 | never_split=never_split, 81 | tokenize_chinese_chars=tokenize_chinese_chars, 82 | strip_accents=strip_accents, 83 | ) 84 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | numpy 3 | pydantic 4 | seqeval 5 | torch 6 | transformers 7 | networkx 8 | stanza 9 | bert4keras 10 | Keras~=2.3.1 11 | regex 12 | scikit-learn 13 | requests 14 | tqdm 15 | six 16 | scipy 17 | nltk 18 | PyYAML 19 | sentence_transformers --------------------------------------------------------------------------------
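A minimal usage sketch for the whitening helpers in nlp/utils/whitening_utils.py above; the 1,000 random 768-dimensional vectors, the output dimension of 256, and the file name whiten_weights.pkl are illustrative assumptions, not values taken from this repository:

import numpy as np
from nlp.utils.whitening_utils import compute_kernel_bias, transform_and_normalize, save_whiten

# Stand-in for sentence vectors (e.g. collected via get_embedding); one (n, hidden) array per corpus.
corpus_vecs = [np.random.randn(1000, 768)]

# Estimate the whitening transform y = (x + bias).dot(kernel) from the corpus vectors.
kernel, bias = compute_kernel_bias(corpus_vecs)

# Keep the first 256 whitening directions and L2-normalize the result.
whitened = transform_and_normalize(np.concatenate(corpus_vecs, axis=0), kernel, bias, dim=256)

# Persist the transform for reuse at inference time (illustrative file name).
save_whiten("whiten_weights.pkl", kernel, bias)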