├── .DS_Store ├── .gitignore ├── README.md ├── conf ├── config.yml ├── lora_config_bloom.json └── lora_config_llama.json ├── datas ├── README.md ├── cluener │ ├── README.md │ ├── dev.json │ ├── dev.txt │ ├── json_to_text.py │ ├── test.json │ ├── train.json │ └── train.txt ├── convert_dataset.py ├── process_datas.py ├── spnre_business_chance │ ├── dev.json │ └── train.json └── text_to_json.py ├── experiments ├── .DS_Store ├── cartography │ └── BQ_roberta-wwm.pdf ├── configs │ ├── tplinker │ │ ├── README.md │ │ ├── build_datasets.py │ │ ├── data_config.yaml │ │ ├── tplinker_config.py │ │ ├── tplinker_eval.yaml │ │ ├── tplinker_plus_config.py │ │ └── tplinker_train.yaml │ └── tplinker_plus │ │ ├── config.py │ │ ├── eval_config.yaml │ │ └── train_config.yaml ├── event_extraction │ ├── sujianlin │ │ ├── eval_model.py │ │ ├── predict.py │ │ └── train_model.py │ └── tplinker │ │ └── train_and_valid.py ├── ner │ ├── cluener_globalpointer.py │ ├── evaluate_tplinker_ner.py │ ├── lear_for_ner.py │ ├── mrc_for_ner.py │ ├── run_ner_crf.py │ ├── run_ner_softmax.py │ ├── run_ner_span.py │ ├── run_ner_tplinker.py │ ├── run_ner_tplinker_v2.py │ └── train_globalpointer.py ├── others │ └── child_tuning_on_ner.py ├── qa_and_text_generation │ ├── finetune_cdail_gpt.py │ ├── finetune_cdail_gpt_2.py │ ├── finetune_cpm_large_2.py │ ├── finetune_cpm_large_accelerate.py │ ├── finetune_llm_chat.py │ ├── finetune_unilm_for_seq2seq_liadrinz.py │ └── finetune_unilm_for_seq2seq_yunwen.py ├── relation_extraction │ ├── casel_train.py │ ├── data_loader_casrel.py │ ├── framework.py │ ├── kg_globalpointer.py │ ├── train_ee.py │ ├── train_relation_extraction_by_globalpointer.py │ ├── train_spn4re.py │ ├── train_tplinker.py │ └── train_tplinker_plus.py ├── scripts │ ├── decode_yunwen_unilm_for_se2seq.sh │ ├── finetune_cpm_large_2.sh │ ├── finetune_cpm_large_accelerate.sh │ ├── finetune_liadrinz_unilm.sh │ ├── finetune_qwen_7b_qlora.sh │ ├── finetune_yunwen_unilm.sh │ ├── run_child_tuning_on_ner.sh │ ├── run_duee1_sujianlin.sh │ ├── run_finetune_cdail_gpt.sh │ ├── run_finetune_cdail_gpt_2.sh │ ├── run_global_pointer_for_ner.sh │ ├── run_ner_crf.sh │ ├── run_ner_softmax.sh │ ├── run_ner_span.sh │ ├── run_pair_sup_con.sh │ ├── run_sbert_training_dynamics.sh │ ├── run_spn4re.sh │ └── run_unsup_vascl.sh ├── sentence_embedding │ ├── run_bert_whitening.py │ ├── run_cross_encoder.py │ ├── run_cross_encoder_2.py │ ├── run_pair_sup_con.py │ ├── run_sentence_bert.py │ ├── run_sentence_bert_2.py │ ├── run_sentence_bert_accuracy.py │ ├── run_sentence_bert_pairscl.py │ ├── run_simces_2.py │ ├── run_sup_cosent.py │ ├── run_sup_cosent_accuracy.py │ ├── run_sup_simcse.py │ ├── run_unsup_consert.py │ ├── run_unsup_consert_2.py │ ├── run_unsup_consert_3.py │ ├── run_unsup_simcse.py │ ├── run_unsup_vascl.py │ └── train_sentence_bert_training_dynamics.py └── single_test │ ├── argument_test.py │ ├── bart_test.py │ ├── bert4keras_ee.py │ ├── bert4keras_model.py │ ├── bert_crop_model_test.py │ ├── chatglm_test.py │ ├── data_selection_for_training_dynamics.py │ ├── decode_yunwen_unilm_for_seq2seq.py │ ├── globalpointer_test_sujianlin.py │ ├── imagen_test.py │ ├── inference_cdail_gpt.py │ ├── inference_cpm.py │ ├── inference_liadrinz_unilm_for_seq2seq.py │ ├── inference_yunwen_unilm_for_seq2seq.py │ ├── paddlenlp_test.py │ ├── position_embedding_test.py │ ├── relation_extraction_globalpointer.py │ ├── roformer_sim_test.py │ ├── roformer_test.py │ ├── simcse_tool.py │ ├── task_conditional_language_model.py │ ├── task_seq2seq_autotitle.py │ ├── 
train_dynamics_filtering.py │ ├── unlim_test.py │ └── wobert_test.py ├── license.txt ├── nlp ├── __init__.py ├── arguments │ ├── __init__.py │ ├── data_arguments.py │ ├── model_arguments.py │ └── train_arguments.py ├── callback │ ├── __init__.py │ ├── adversarial.py │ ├── child_tuning_fisher.py │ ├── lr_finder.py │ ├── lr_scheduler.py │ ├── modelcheckpoint.py │ ├── optimizers │ │ ├── __init__.py │ │ ├── adabound.py │ │ ├── adafactor.py │ │ ├── adamw.py │ │ ├── child_tuning_optimizer.py │ │ ├── ema.py │ │ ├── lamb.py │ │ ├── lars.py │ │ ├── lookahead.py │ │ ├── nadam.py │ │ ├── novograd.py │ │ ├── planradam.py │ │ ├── radam.py │ │ ├── ralamb.py │ │ ├── ralars.py │ │ └── sgdw.py │ ├── progressbar.py │ └── trainingmonitor.py ├── event_extractor │ ├── __init__.py │ └── event_extractor.py ├── layers │ ├── __init__.py │ ├── cnn.py │ ├── crf.py │ ├── global_pointer.py │ ├── layer.py │ ├── linears.py │ └── position_embeddings.py ├── losses │ ├── __init__.py │ └── loss.py ├── metrics │ ├── __init__.py │ ├── bleu_metric.py │ ├── metric.py │ ├── sematic_match_metric.py │ ├── spn4re_metric.py │ ├── tplinker_metric.py │ └── triplet_distance_metric.py ├── models │ ├── __init__.py │ ├── bert_for_ee.py │ ├── bert_for_ee_tplinker.py │ ├── bert_for_ner.py │ ├── bert_for_relation_extraction.py │ ├── bert_model.py │ ├── bert_spn4re.py │ ├── bertcrop.py │ ├── distill_model.py │ ├── generate_model.py │ ├── idcnn_for_crf.py │ ├── model_util.py │ ├── nezha.py │ ├── sentence_embedding_models.py │ ├── tplinker_plus_for_ner.py │ ├── transformer.py │ ├── unilm_model_liadrinz.py │ └── unilm_model_yunwen.py ├── processors │ ├── __init__.py │ ├── dataset.py │ ├── ee_seq.py │ ├── ee_span.py │ ├── global_pointer_processor.py │ ├── ner_seq.py │ ├── ner_span.py │ ├── predict_process.py │ ├── preprocess.py │ ├── semantic_match_preprocessor.py │ ├── spn4ner_processor.py │ ├── unilm_liadrinz_processor.py │ ├── unlim_yunwen_preprocessor.py │ ├── utils_ee.py │ └── utils_ner.py ├── sentence_transformers │ ├── LoggingHandler.py │ ├── SentenceTransformer.py │ ├── __init__.py │ ├── cross_encoder │ │ ├── CrossEncoder.py │ │ ├── __init__.py │ │ └── evaluation │ │ │ ├── CEBinaryClassificationEvaluator.py │ │ │ ├── CECorrelationEvaluator.py │ │ │ ├── CESoftmaxAccuracyEvaluator.py │ │ │ └── __init__.py │ ├── datasets │ │ ├── EncodeDataset.py │ │ ├── ParallelSentencesDataset.py │ │ ├── SentenceLabelDataset.py │ │ ├── SentencesDataset.py │ │ ├── __init__.py │ │ └── sampler │ │ │ ├── LabelSampler.py │ │ │ └── __init__.py │ ├── evaluation │ │ ├── BinaryClassificationEvaluator.py │ │ ├── EmbeddingSimilarityEvaluator.py │ │ ├── InformationRetrievalEvaluator.py │ │ ├── LabelAccuracyEvaluator.py │ │ ├── MSEEvaluator.py │ │ ├── MSEEvaluatorFromDataFrame.py │ │ ├── ParaphraseMiningEvaluator.py │ │ ├── SentenceEvaluator.py │ │ ├── SequentialEvaluator.py │ │ ├── SimilarityFunction.py │ │ ├── TranslationEvaluator.py │ │ ├── TripletEvaluator.py │ │ └── __init__.py │ ├── losses │ │ ├── AdvCLSoftmaxLoss.py │ │ ├── AdvCLSoftmaxLoss_refactoring.py │ │ ├── AdvCLSoftmaxLoss_single_stream_backup.py │ │ ├── AdvSimSiamLoss.py │ │ ├── BatchAllTripletLoss.py │ │ ├── BatchHardSoftMarginTripletLoss.py │ │ ├── BatchHardTripletLoss.py │ │ ├── BatchSemiHardTripletLoss.py │ │ ├── ContrastiveLoss.py │ │ ├── CosineSimilarityLoss.py │ │ ├── MSELoss.py │ │ ├── MegaBatchMarginLoss.py │ │ ├── MultipleNegativesRankingLoss.py │ │ ├── OnlineContrastiveLoss.py │ │ ├── SimCLRLoss.py │ │ ├── SimSiamLoss.py │ │ ├── SoftmaxLoss.py │ │ ├── TripletLoss.py │ │ └── __init__.py │ ├── 
models │ │ ├── ALBERT.py │ │ ├── BERT.py │ │ ├── BoW.py │ │ ├── CNN.py │ │ ├── CamemBERT.py │ │ ├── Dense.py │ │ ├── DistilBERT.py │ │ ├── LSTM.py │ │ ├── MLP3.py │ │ ├── Normalize.py │ │ ├── Pooling.py │ │ ├── RoBERTa.py │ │ ├── T5.py │ │ ├── Transformer.py │ │ ├── WKPooling.py │ │ ├── WeightedLayerPooling.py │ │ ├── WordEmbeddings.py │ │ ├── WordWeights.py │ │ ├── XLMRoBERTa.py │ │ ├── XLNet.py │ │ ├── __init__.py │ │ └── tokenizer │ │ │ ├── PhraseTokenizer.py │ │ │ ├── WhitespaceTokenizer.py │ │ │ ├── WordTokenizer.py │ │ │ └── __init__.py │ ├── readers │ │ ├── InputExample.py │ │ ├── LabelSentenceReader.py │ │ ├── NLIDataReader.py │ │ ├── PairedFilesReader.py │ │ ├── STSDataReader.py │ │ ├── TripletReader.py │ │ └── __init__.py │ └── util.py ├── tools │ ├── __init__.py │ ├── accelerate_tracker.py │ ├── common.py │ ├── convert_nezha_original_tf_checkpoint_to_pytorch.py │ ├── convert_tf_to_pytorch.py │ ├── dataloader.py │ ├── file_util.py │ ├── format_conv.py │ ├── path.py │ ├── plot.py │ ├── pytorch_optimization.py │ └── utils.py ├── trainers │ ├── ChildTuningD.py │ ├── ChildTuningF.py │ └── __init__.py └── utils │ ├── __init__.py │ ├── bert_or_thesues_repalcement_scheduler.py │ ├── configuration_unilm.py │ ├── ee_arguments.py │ ├── enums.py │ ├── errors.py │ ├── factory.py │ ├── functions.py │ ├── generate_util.py │ ├── log_handler.py │ ├── official_tokenization.py │ ├── optimization.py │ ├── selection_utils.py │ ├── taggers.py │ ├── tokenization_unilm.py │ ├── tokenizers.py │ ├── tplinker_plus_ner_util.py │ ├── tplinker_plus_utils.py │ ├── tplinker_utils.py │ ├── util.py │ ├── vat_utils.py │ ├── whitening_utils.py │ └── wobert_tokenization.py └── requirements.txt /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhihao-chen/NLP-experiments/24265c645fcadbc8a3117391823f8cf3b88128c8/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/** 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # test module 个人习惯,测试目录去掉 10 | /test/ 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | *.manifest 37 | *.spec 38 | .idea/ 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # celery beat schedule file 90 | celerybeat-schedule 91 | workspace.xml 92 | 93 | # SageMath parsed files 94 | *.sage.py 95 | .idea/workspace.xml 96 | 97 | # Environments 98 | .env 99 | .venv 100 | env/ 101 | venv/ 102 | ENV/ 103 | env.bak/ 104 | venv.bak/ 105 | 106 | # Spyder project settings 107 | .spyderproject 108 | .spyproject 109 | 110 | # Rope project settings 111 | .ropeproject 112 | 113 | # mkdocs documentation 114 | /site 115 | 116 | # mypy 117 | .mypy_cache/ 118 | .dmypy.json 119 | dmypy.json 120 | 121 | # Pyre type checker 122 | .pyre/ 123 | datas/kg/ 124 | datas/news2/ 125 | datas/ner/ 126 | datas/spn4re/ 127 | datas/tplinker/ 128 | .DS_Store/ --------------------------------------------------------------------------------
/README.md: -------------------------------------------------------------------------------- 1 | ## NLP Experiments 2 | This project contains my own NLP-related experiments, together with methods I have collected and organized. 3 | It currently covers named entity recognition, entity relation extraction, event extraction and semantic matching; other experiments such as classification, text generation and question answering will be added later. 4 | 5 | ## Named Entity Recognition 6 | Experiments are in `experiments/ner`; run scripts are in `experiments/scripts` 7 | 8 | ## Relation Extraction 9 | Experiments are in `experiments/relation_extraction`; run scripts are in `experiments/scripts` 10 | 11 | ## Event Extraction 12 | Experiments are in `experiments/event_extraction` 13 | 14 | ## Semantic Matching 15 | Experiments are in `experiments/sentence_embedding` 16 | 17 | ## Text Generation and LLM Fine-tuning 18 | Experiments are in `experiments/qa_and_text_generation` 19 | --------------------------------------------------------------------------------
/conf/config.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhihao-chen/NLP-experiments/24265c645fcadbc8a3117391823f8cf3b88128c8/conf/config.yml --------------------------------------------------------------------------------
/conf/lora_config_bloom.json: -------------------------------------------------------------------------------- 1 | { 2 | "lora_r": 16, 3 | "lora_alpha": 32, 4 | "lora_dropout": 0.05, 5 | "lora_target_modules": [ 6 | "query_key_value", 7 | "mlp" 8 | ] 9 | } --------------------------------------------------------------------------------
/conf/lora_config_llama.json: -------------------------------------------------------------------------------- 1 | { 2 | "lora_r": 16, 3 | "lora_alpha": 32, 4 | "lora_dropout": 0.05, 5 | "lora_target_modules": [ 6 | "q_proj", 7 | "k_proj", 8 | "v_proj", 9 | "o_proj", 10 | "down_proj", 11 | "gate_proj", 12 | "up_proj" 13 | ] 14 | } --------------------------------------------------------------------------------
/datas/cluener/README.md: -------------------------------------------------------------------------------- 1 | CLUENER 细粒度命名实体识别 2 | 3 | 数据分为10个标签类别,分别为: 4 | 地址(address), 5 | 书名(book), 6 | 公司(company), 7 | 游戏(game), 8 | 政府(goverment), 9 | 电影(movie), 10 | 姓名(name), 11 | 组织机构(organization), 12 | 职位(position), 13 | 
景点(scene) 14 | 15 | 数据详细介绍、基线模型和效果测评,见 https://github.com/CLUEbenchmark/CLUENER 16 | 17 | 技术讨论或问题,请项目中提issue或PR,或发送电子邮件到 ChineseGLUE@163.com 18 | 19 | 测试集上SOTA效果见榜单:www.CLUEbenchmark.com -------------------------------------------------------------------------------- /datas/cluener/json_to_text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: BERT-NER-Pytorch 5 | File Name: json_to_text 6 | Author: czh 7 | Create Date: 2021/6/24 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | import json 13 | 14 | 15 | file_names = ['train.json', 'dev.json'] 16 | for name in file_names: 17 | print("processing the file {}".format(name)) 18 | prefix = name.split('.')[0] 19 | with open(name, encoding="utf-8") as fr, open(prefix+'.txt', 'w', encoding='utf8') as fw: 20 | for line in fr: 21 | line = line.strip() 22 | if not line: 23 | continue 24 | data = json.loads(line) 25 | text = data['text'] 26 | tokens = list(text) 27 | labels = ['O' for _ in tokens] 28 | try: 29 | if data.get('label'): 30 | for label, dic in data['label'].items(): 31 | for token, lsts in dic.items(): 32 | for lst in lsts: 33 | labels[lst[0]] = "B-"+label 34 | labels[lst[0]+1:lst[-1]+1] = ["I-"+label]*(lst[-1]-lst[0]) 35 | except Exception as e: 36 | print(data) 37 | raise e 38 | for j,t in enumerate(tokens): 39 | fw.write(t+' '+labels[j]+'\n') 40 | fw.write('\n') 41 | print("has processed the file {}".format(name)) 42 | -------------------------------------------------------------------------------- /datas/text_to_json.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: text_to_json 6 | Author: czh 7 | Create Date: 2021/8/19 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | # 格式转换为{"text": "郑阿姨就赶到文汇路排队拿钱,希望能将缴纳的一万余元学费拿回来,顺便找校方或者教委要个说法。", "label": {"address": {"文汇路": [[6, 8]]}}} 13 | import codecs 14 | import json 15 | import numpy as np 16 | from nlp.processors import get_entities 17 | 18 | data_file = "ner/" 19 | 20 | 21 | def trans_data(data_type): 22 | file_name = data_file + f"{data_type}.txt" 23 | lines = [] 24 | with codecs.open(file_name, encoding='utf8') as fr: 25 | words = [] 26 | labels = [] 27 | for line in fr: 28 | if line.startswith("-DOCSTART-") or line == "" or line == "\n": 29 | if words: 30 | lines.append({"text": "".join(words), "labels": labels}) 31 | words = [] 32 | labels = [] 33 | else: 34 | splits = line.split(" ") 35 | words.append(splits[0]) 36 | if len(splits) > 1: 37 | labels.append(splits[-1].replace("\n", "")) 38 | else: 39 | # Examples could have no label for mode = "test" 40 | labels.append("O") 41 | if words: 42 | lines.append({"text": "".join(words), "labels": labels}) 43 | results = [] 44 | for item in lines: 45 | text = item["text"] 46 | labels = item["labels"] 47 | subjects = get_entities(labels, id2label=None, markup='bios') 48 | label_dict = {} 49 | for subject in subjects: 50 | label = subject[0] 51 | start = subject[1] 52 | end = subject[2] 53 | word = text[start: end+1] 54 | if label not in label_dict: 55 | label_dict[label] = {} 56 | if word not in label_dict[label]: 57 | label_dict[label][word] = [] 58 | label_dict[label][word].append([start, end]) 59 | results.append({"text": 
text, "label": label_dict}) 60 | 61 | return results 62 | 63 | 64 | def save_datas(datas, data_type): 65 | file_name = data_file + f"{data_type}.json" 66 | with codecs.open(file_name, 'w', encoding='utf8') as fw: 67 | for line in datas: 68 | line = json.dumps(line, ensure_ascii=False) 69 | fw.write(line + '\n') 70 | 71 | 72 | def main(): 73 | train_datas = trans_data("train") 74 | dev_datas = trans_data("dev") 75 | test_datas = trans_data("test") 76 | 77 | all_datas = train_datas + dev_datas + test_datas 78 | np.random.shuffle(all_datas) 79 | 80 | num = len(all_datas) 81 | print("total num: ", num) 82 | train_num = int(num * 0.8) 83 | train_datas = all_datas[:train_num] 84 | dev_datas = all_datas[train_num: train_num+int(num*0.1)] 85 | test_datas = all_datas[train_num+int(num*0.1):] 86 | 87 | print("train num: ", len(train_datas)) 88 | print("dev num: ", len(dev_datas)) 89 | print("test num: ", len(test_datas)) 90 | 91 | save_datas(train_datas, "train") 92 | save_datas(dev_datas, "dev") 93 | save_datas(test_datas, "test") 94 | 95 | 96 | if __name__ == "__main__": 97 | main() 98 | -------------------------------------------------------------------------------- /experiments/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhihao-chen/NLP-experiments/24265c645fcadbc8a3117391823f8cf3b88128c8/experiments/.DS_Store -------------------------------------------------------------------------------- /experiments/cartography/BQ_roberta-wwm.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhihao-chen/NLP-experiments/24265c645fcadbc8a3117391823f8cf3b88128c8/experiments/cartography/BQ_roberta-wwm.pdf -------------------------------------------------------------------------------- /experiments/configs/tplinker/README.md: -------------------------------------------------------------------------------- 1 | 参考:https://github.com/131250208/TPlinker-joint-extraction -------------------------------------------------------------------------------- /experiments/configs/tplinker/data_config.yaml: -------------------------------------------------------------------------------- 1 | exp_name: NEWS # nyt_star, nyt, webnlg_star, webnlg, ace05_lu 2 | data_in_dir: ../datas 3 | ori_data_format: casrel # casrel (webnlg_star, nyt_star), etl_span (webnlg), raw_nyt (nyt), tplinker (see readme) 4 | 5 | # if build data for BERT, use these 3 lines and comment the following 2 lines. 6 | encoder: BERT 7 | bert_path: hfl/chinese-bert-wwm-ext # chinese-bert-wwm-ext-hit, bert-base-cased 8 | data_out_dir: ../datas/data4bert 9 | 10 | # # if build data for BiLSTM, use these 2 lines and comment above 3 lines. 11 | # encoder: BiLSTM 12 | # data_out_dir: ../data4bilstm 13 | 14 | # if only reproduce the results, no need to change the args below. 15 | # separate_char_by_white: e.g. 
"$%sdkn839," -> "$% sdkn839 ," , will make original char spans invalid 16 | # add_char_span: set add_char_span to false if it already exists 17 | # ignore_subword: when adding character level spans, match words with whitespace around: " word ", to avoid subword match, set false for chinese 18 | # check_tok_span: check whether there is any error with token spans, if there is, print the unmatch info 19 | add_char_span: true 20 | ignore_subword: false 21 | separate_char_by_white: false 22 | check_tok_span: true -------------------------------------------------------------------------------- /experiments/configs/tplinker/tplinker_eval.yaml: -------------------------------------------------------------------------------- 1 | exp_name: webnlg_single 2 | model_state_dict_dir: ./wandb 3 | run_ids: 4 | - 3k0y4z53 5 | 6 | last_k_model: 1 7 | 8 | 9 | test_data: "*test*.json" 10 | rel2id: rel2id.json 11 | 12 | device_num: 1 13 | 14 | # encoder: BERT 15 | # data_home: ../data4bert 16 | # bert_path: /home/wangyucheng/opt/transformers_models_h5/bert-base-cased 17 | 18 | # for BiLSTM 19 | encoder: BiLSTM 20 | enc_hidden_size: 300 21 | dec_hidden_size: 600 22 | emb_dropout: 0.1 23 | rnn_dropout: 0.1 24 | word_embedding_dim: 300 25 | token2idx: token2idx.json 26 | data_home: ../data4bilstm 27 | 28 | batch_size: 32 29 | force_split: false 30 | max_test_seq_len: 512 31 | sliding_len: 20 32 | match_pattern: only_head_text 33 | shaking_type: cat 34 | # distance emb, ent_add_dist and rel_add_dist are valid only if dist_emb_size != -1 35 | dist_emb_size: 512 36 | ent_add_dist: true 37 | rel_add_dist: true 38 | 39 | # results 40 | save_res: false 41 | save_res_dir: ../results 42 | # score: set true only when test set tagged 43 | score: true -------------------------------------------------------------------------------- /experiments/configs/tplinker/tplinker_train.yaml: -------------------------------------------------------------------------------- 1 | exp_name: nyt_single 2 | run_name: TP1+Cat+BE 3 | 4 | train_data: train_data.json 5 | valid_data: valid_data.json 6 | rel2id: rel2id.json 7 | 8 | device_num: 1 9 | 10 | # set logger 11 | # if use default logger, must provide a log path and a path to save model, if use wandb, model state will be upload to the cloud 12 | # logger: wandb # wandb, default 13 | 14 | logger: default 15 | log_path: ./default.log 16 | path_to_save_model: ./model_state 17 | 18 | encoder: BERT 19 | data_home: ../data4bert 20 | bert_path: /home/wangyucheng/opt/transformers_models_h5/bert-base-cased 21 | 22 | # encoder: BiLSTM 23 | # token2idx: token2idx.json 24 | # data_home: ../data4bilstm 25 | # pretrained_word_embedding_path: ../pretrained_word_emb/glove_300_nyt.emb 26 | 27 | hyper_parameters: 28 | batch_size: 24 29 | epochs: 200 30 | lr: 5e-5 31 | seed: 2333 32 | log_interval: 10 33 | max_seq_len: 100 34 | sliding_len: 20 35 | loss_weight_recover_steps: 10000 36 | shaking_type: cat 37 | # distance emb, ent_add_dist and rel_add_dist are valid only if dist_emb_size != -1 38 | dist_emb_size: -1 39 | ent_add_dist: false 40 | rel_add_dist: false 41 | match_pattern: only_head_text 42 | 43 | # CosineAnnealingWarmRestarts 44 | scheduler: CAWR # Step 45 | T_mult: 1 46 | rewarm_epoch_num: 2 47 | 48 | # # StepLR 49 | # scheduler: Step 50 | # decay_rate: 0.99 51 | # decay_steps: 100 52 | 53 | # # for BiLSTM 54 | # enc_hidden_size: 300 55 | # dec_hidden_size: 600 56 | # emb_dropout: 0.1 57 | # rnn_dropout: 0.1 58 | # word_embedding_dim: 300 59 | 60 | # when to save the model state dict 61 | f1_2_save: 
0 62 | # whether train from scratch 63 | fr_scratch: true 64 | # note 65 | note: start from scratch 66 | # if not fr scratch, set a model_state_dict 67 | model_state_dict_path: stake -------------------------------------------------------------------------------- /experiments/configs/tplinker_plus/eval_config.yaml: -------------------------------------------------------------------------------- 1 | exp_name: covid19_rel_lianxiangjia 2 | model_state_dict_dir: ./wandb 3 | run_ids: 4 | # no dist 5 | # - 1w4mk6mr 6 | # - 3pdl0yv9 7 | # - e48u3t8g 8 | # - 14ooykvx 9 | # - 3tasch7j 10 | # + dist emb 11 | - 2s5jq1ho 12 | - llvea66v 13 | - 3mjun6d5 14 | - 31hmg1nh 15 | - 3r96671x 16 | 17 | last_k_model: 1 18 | 19 | data_home: ../data4bert 20 | test_data: "*test*.json" 21 | rel2id: rel2id.json 22 | 23 | device_num: 1 24 | 25 | encoder: BERT 26 | bert_path: /home/wangyucheng/opt/transformers_models_h5/bert-base-cased 27 | 28 | # # for BiLSTM 29 | # enc_hidden_size: 128 30 | # dec_hidden_size: 256 31 | # emb_dropout: 0.1 32 | # rnn_dropout: 0.1 33 | # word_embedding_dim: 100 34 | # token2idx: token2idx.json 35 | # pretrained_word_embedding_path: ../pretrained_word_emb/glove_100_webnlg.emb 36 | 37 | batch_size: 16 38 | force_split: false 39 | max_test_seq_len: 200 40 | sliding_len: 50 41 | shaking_type: cln 42 | dist_emb_size: 512 43 | tok_pair_sample_rate: 1 44 | correct: whole_text 45 | 46 | # results 47 | save_res: true 48 | save_res_dir: ../results 49 | # score: set true only when test set tagged 50 | score: false 51 | -------------------------------------------------------------------------------- /experiments/configs/tplinker_plus/train_config.yaml: -------------------------------------------------------------------------------- 1 | exp_name: nyt 2 | run_name: TP2+Cat+Dist+BiLSTM 3 | 4 | train_data: train_data.json 5 | valid_data: valid_data.json 6 | rel2id: rel2id.json 7 | 8 | device_num: 0 9 | 10 | # set logger 11 | # if use default logger, must provide a log path and a path to save model, if use wandb, model state will be upload to the cloud 12 | logger: wandb # wandb, default 13 | 14 | # logger: default 15 | # log_path: ./default.log 16 | # path_to_save_model: ./model_state 17 | 18 | # encoder: BERT 19 | # data_home: ../data4bert 20 | # bert_path: /home/wangyucheng/opt/transformers_models_h5/bert-base-cased 21 | 22 | encoder: BiLSTM 23 | token2idx: token2idx.json 24 | data_home: ../data4bilstm 25 | pretrained_word_embedding_path: ../pretrained_word_emb/glove_300_nyt.emb 26 | 27 | hyper_parameters: 28 | batch_size: 24 29 | epochs: 200 30 | lr: 1e-3 31 | seed: 2333 32 | log_interval: 10 33 | max_seq_len: 100 34 | sliding_len: 20 35 | shaking_type: cat 36 | dist_emb_size: 512 37 | tok_pair_sample_rate: 1 38 | 39 | # CosineAnnealingWarmRestarts 40 | scheduler: CAWR # Step 41 | T_mult: 1 42 | rewarm_epoch_num: 2 43 | 44 | # # StepLR 45 | # scheduler: Step 46 | # decay_rate: 0.99 47 | # decay_steps: 100 48 | 49 | # scheduler: ReduceLROnPlateau 50 | 51 | # # for BiLSTM 52 | # enc_hidden_size: 128 53 | # dec_hidden_size: 256 54 | # emb_dropout: 0.1 55 | # rnn_dropout: 0.1 56 | # word_embedding_dim: 100 57 | 58 | # whether train from scratch 59 | fr_scratch: true 60 | note: start from scratch 61 | # when to save the model state dict 62 | f1_2_save: 0.0 63 | 64 | # if not, give a model_state_dict 65 | model_state_dict_path: stake -------------------------------------------------------------------------------- /experiments/event_extraction/sujianlin/eval_model.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: EventExtraction 5 | File Name: eval_model 6 | Author: czh 7 | Create Date: 2021/9/16 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | from nlp.event_extractor.event_extractor import EventExtractor 13 | from nlp.utils.ee_arguments import DataAndTrainArguments 14 | 15 | config = { 16 | 'task_name': 'ee', # ee 17 | 'data_dir': '../data/normal_data/news2', 18 | 'model_type': 'bert', # bert, nezha 19 | 'model_name_or_path': 'hfl/chinese-roberta-wwm-ext', # nezha-base-wwm 20 | 'output_dir': '../data/output/', # 模型训练中保存的中间结果,模型,日志等文件的主目录 21 | 'do_lower_case': False, # 主要是tokenize时是否将大写转为小写 22 | 'use_lstm': False, # 默认为False, 表示模型结构为bert_crf 23 | 'no_cuda': False, # 是否使用GPU。默认为False, 表示只使用CPU 24 | 'eval_max_seq_length': 128, # 默认为512 25 | 'per_gpu_eval_batch_size': 8, 26 | 'cuda_number': '0', # '0,1,2,3' 使用GPU时需指定GPU卡号 27 | } 28 | 29 | args = DataAndTrainArguments(**config) # noqa 30 | extractor = EventExtractor(args) 31 | 32 | # evaluate all checkpoints file for the dev datasets 33 | # extractor.evaluate(eval_all_checkpoints=True) 34 | 35 | # only evaluate best model for the dev datasets 36 | # extractor.evaluate() 37 | 38 | # evaluate all checkpoints file for the test datasets, and the test datasets sample must labeled 39 | # extractor.evaluate(data_type='test', eval_all_checkpoints=True) 40 | 41 | # only evaluate best model for the test datasets, and the test datasets sample must labeled 42 | extractor.evaluate(data_type='test') 43 | -------------------------------------------------------------------------------- /experiments/event_extraction/sujianlin/predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: EventExtraction 5 | File Name: predict_raw_text 6 | Author: czh 7 | Create Date: 2021/9/16 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | import json 13 | 14 | from nlp.event_extractor.event_extractor import EventExtractor 15 | from nlp.utils.ee_arguments import DataAndTrainArguments 16 | 17 | config = { 18 | 'task_name': 'ee', 19 | 'model_type': 'bert', 20 | 'use_lstm': True, # 默认是False 21 | 'eval_max_seq_length': 512, 22 | } 23 | 24 | args = DataAndTrainArguments(**config) # noqa 25 | extractor = EventExtractor(args, state='pred', model_path='../data/model') 26 | 27 | # data_type: 只能是'test',或者None。若为test则表示在测试数据集上预测 28 | # input_texts: 若不为空,则表示是预测新的数据 29 | # pred_output_dir: 若不为空,则表示将预测结果写入指定位置保存,可以是目录,也可以是文件 30 | 31 | # 表示在测试数据集上预测, 不保存预测结果 32 | # for res in extractor.predict(data_type='test'): 33 | # print(res) 34 | 35 | # 表示在测试数据集上预测, 保存预测结果 36 | # for res in extractor.predict(data_type='test', pred_output_dir="../data/output/bert"): 37 | # print(res) 38 | 39 | # 表示预测raw text, raw text可以是str, List[str] 40 | # texts = "博盛医疗完成Pre-A轮融资澳银资本重点参与" 41 | texts = ["博盛医疗完成Pre-A轮融资澳银资本重点参与", 42 | "百炼智能完成A轮一亿元融资,由今日头条领投"] 43 | for res in extractor.predict(input_texts=texts): 44 | print(json.dumps(res, ensure_ascii=False, indent=2)) 45 | -------------------------------------------------------------------------------- /experiments/event_extraction/sujianlin/train_model.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: EventExtraction 5 | File Name: train_model 6 | Author: czh 7 | Create Date: 2021/9/15 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | from nlp.event_extractor.event_extractor import EventExtractor 13 | from nlp.utils.ee_arguments import DataAndTrainArguments 14 | 15 | config = { 16 | 'task_name': 'ner', # ner 17 | 'data_dir': '../data/normal_data/ner', 18 | 'model_type': 'bert', # bert, nezha 19 | 'model_name_or_path': 'hfl/chinese-roberta-wwm-ext', # '/data/chenzhihao/nezha-base-www' 20 | 'model_sate_dict_path': '../data/output/bert/best_model', # 保存的checkpoint文件地址用于继续训练 21 | 'output_dir': '../data/output/', # 模型训练中保存的中间结果,模型,日志等文件的主目录False 22 | 'do_lower_case': False, # 主要是tokenize时是否将大写转为小写 23 | 'cache_dir': '', # 指定下载的预训练模型保存地址 24 | 'evaluate_during_training': True, # 是否在训练过程中验证模型, 默认为True 25 | 'use_lstm': False, # 默认为False, 表示模型结构为bert_crf 26 | 'from_scratch': True, # 是否从头开始训练,默认为True 27 | 'from_last_checkpoint': False, # 是否从最新的checkpoint模型继续训练,默认为False 28 | 'early_stop': False, 29 | 'overwrite_output_dir': True, 30 | 'overwrite_cache': True, # 是否重写特征,默认为True,若为False表示从特征文件中加载特征 31 | 'no_cuda': False, # 是否使用GPU。默认为False, 表示只使用CPU 32 | 'fp16': True, 33 | 'train_max_seq_length': 32, # 默认为512 34 | 'eval_max_seq_length': 32, # 默认为512 35 | 'per_gpu_train_batch_size': 16, 36 | 'per_gpu_eval_batch_size': 16, 37 | 'gradient_accumulation_steps': 1, 38 | 'learning_rate': 5e-05, # bert和lstm的学习率 39 | 'crf_learning_rate': 5e-05, 40 | 'weight_decay': 0.01, 41 | 'adam_epsilon': 1e-08, 42 | 'warmup_proportion': 0.1, 43 | 'num_train_epochs': 3.0, 44 | 'max_steps': -1, # 当指定了该字段值后,'num_train_epochs'就不起作用了 45 | 'tolerance': 5, # 指定early stop容忍的epoch数量 46 | 'logging_steps': 500, # 指定tensorboard日志在哪个阶段记录 47 | 'save_steps': 500, # 指定哪些步骤保存中间训练结果 48 | # ["linear","cosine","cosine_with_restarts","polynomial","constant","constant_with_warmup"] 49 | 'scheduler_type': 'linear', 50 | 'cuda_number': '3', # '0,1,2,3' 使用GPU时需指定GPU卡号 51 | 'seed': 2333, 52 | 'dropout_rate': 0.3 53 | } 54 | 55 | args = DataAndTrainArguments(**config) # noqa 56 | extractor = EventExtractor(args) 57 | 58 | # training from scratch, set config['from_scratch'] = True 59 | extractor.train_and_valid() 60 | 61 | # continue train from 'model_sate_dict_path', set config['from_scratch'] = False 62 | # extractor.train_and_valid() 63 | 64 | # continue train from last checkpoint file, set config['from_scratch'] = False, config['from_last_checkpoint']=True. 
65 | # And should rise the 'num_train_epochs' 66 | # extractor.train_and_valid() 67 | -------------------------------------------------------------------------------- /experiments/ner/lear_for_ner.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: lear_for_ner 6 | Author: czh 7 | Create Date: 2022/3/11 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | import os 13 | import sys 14 | import json 15 | from tqdm import tqdm 16 | import codecs 17 | from typing import List, Tuple 18 | sys.path.append("/data/chenzhihao/NLP") 19 | 20 | import numpy as np 21 | import torch 22 | import torch.nn as nn 23 | from torch.optim import AdamW 24 | from torch.utils.data import DataLoader, Dataset 25 | from transformers import BertConfig, BertTokenizerFast 26 | 27 | from nlp.models.bert_for_ner import LearForNer 28 | from nlp.tools.path import project_root_path 29 | -------------------------------------------------------------------------------- /experiments/ner/run_ner_tplinker_v2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: run_ner_tplinker_v2 6 | Author: czh 7 | Create Date: 2021/8/24 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | import time 13 | 14 | from nlp.utils.tplinker_plus_utils import DataAndTrainArguments 15 | from nlp.models.tplinker_plus_for_ner import TPLinkerPlusForNER 16 | 17 | root_dir = "/data/chenzhihao/NLP/experiments" 18 | config = { 19 | "bert_name_or_path": "hfl/chinese-roberta-wwm-ext", 20 | "data_dir": root_dir + "/datas/tplinker", 21 | "task_name": "ner", 22 | "model_type": "BERT", 23 | "train_data_name": "train_data.json", 24 | "valid_data_name": "valid_data.json", 25 | "ent2id": "ent2id.json", 26 | "output_dir": root_dir + "/output_file_dir/tplinker_plus_ner_bert/train_results", 27 | "log_dir": root_dir + "/logs/tplinker_plus_ner.log", 28 | "tensorboard_log_dir": root_dir + "/tensorboard/tplinker_plus_ner/", 29 | "path_to_save_model": root_dir + "/output_file_dir/tplinker_plus_ner_bert/train_results/best_model", 30 | "model_state_dict_path": root_dir + "/output_file_dir/tplinker_plus_ner_bert/train_results/best_model", 31 | "save_res_dir": root_dir + "/output_file_dir/tplinker_plus_ner_bert/eval_results", 32 | "score": True, # set true only if test set is tagged 33 | "n_gpu": "0", 34 | "num_workers": 4, 35 | "logger": "default", 36 | "train_batch_size": 16, 37 | "eval_batch_size": 16, 38 | "epochs": 4, 39 | "fp16": True, 40 | "gradient_accumulation_steps": 1, 41 | "shaking_type": "cln_plus", 42 | "match_pattern": "whole_text", 43 | "inner_enc_type": "lstm", 44 | "f1_2_save": 0, 45 | "fr_scratch": True, 46 | "fr_last_checkpoint": False, 47 | "note": "start from scratch", 48 | "log_interval": 10, 49 | "max_seq_len": 512, 50 | "sliding_len": 20, 51 | "last_k_model": 1, 52 | "scheduler": "CAWR", # Step 53 | "ghm": False, 54 | "tok_pair_sample_rate": 1, 55 | "force_split": False, 56 | "lr": 5e-5, 57 | "T_mult": 1, 58 | "rewarm_epoch_num": 2, 59 | "save_steps": 500, 60 | "logging_steps": 500 61 | } 62 | 63 | start = time.time() 64 | args = DataAndTrainArguments(**config) 65 | # print(args.__dict__) 66 | trainer = TPLinkerPlusForNER(args) 67 | 
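# The TPLinkerPlusForNER wrapper drives the whole pipeline below: init_env() prepares the run
# environment, train_and_valid() fine-tunes on train_data.json/valid_data.json, evaluate() scores
# the saved model, and init_others()/init_model()/restore() reload the checkpoint before predict()
# tags raw text. (Step descriptions are inferred from the calls made in this script; the
# implementation lives in nlp/models/tplinker_plus_for_ner.py.)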
trainer.init_env() 68 | 69 | # training 70 | trainer.train_and_valid() 71 | print(time.time()-start) 72 | 73 | start = time.time() 74 | # evaluating 75 | trainer.evaluate() 76 | print(time.time()-start) 77 | 78 | # predicting 79 | start = time.time() 80 | text = "百炼智能是一家人工智能科技公司,公司CEO是冯是聪" 81 | trainer.init_others(len(text)+2) 82 | model = trainer.init_model(16) 83 | trainer.restore(model) 84 | test_data, ori_test_data, max_seq_len = trainer.process_predict_data(text, max_seq_len=len(text)+2) 85 | result = trainer.predict(test_data=test_data, 86 | ori_test_data=ori_test_data, 87 | model=model, 88 | max_seq_len=max_seq_len, 89 | batch_size=1) 90 | print(result) 91 | print(time.time()-start) 92 | -------------------------------------------------------------------------------- /experiments/relation_extraction/train_relation_extraction_by_globalpointer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: train_relation_extraction_by_globalpointer 6 | Author: czh 7 | Create Date: 2022/2/9 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | import os 13 | import regex 14 | import glob 15 | import json 16 | import time 17 | from tqdm import tqdm 18 | import typing 19 | import codecs 20 | from collections import defaultdict 21 | 22 | import torch 23 | import torch.nn as nn 24 | from torch.optim import AdamW 25 | from torch.utils.data import DataLoader 26 | from transformers import BertConfig, BertTokenizerFast, HfArgumentParser, get_scheduler 27 | 28 | from nlp.models.bert_for_relation_extraction import GlobalPointerForRel 29 | 30 | -------------------------------------------------------------------------------- /experiments/scripts/decode_yunwen_unilm_for_se2seq.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/root/work2/work2/chenzhihao/NLP 2 | DATA_PATH=/root/work2/work2/chenzhihao/datasets/腾讯对话NaturalConv_Release_20210318 3 | MODEL_PATH=/root/work2/work2/chenzhihao/pretrained_models/torch_unilm_model 4 | MODEL_RECOVER_PATH=$PROJECT_PATH/datas/output_dir/unilm/yunwen_unilm/seq2seq_on_natural_conv/ 5 | OUTPUT_FILE=$PROJECT_PATH/datas/output_dir/unilm/yunwen_unilm/seq2seq_on_natural_conv/predict_.json 6 | 7 | MODEL_TYPE="unilm" 8 | 9 | export CUDA_VISIBLE_DEVICES=7 10 | python $PROJECT_PATH/experiments/single_test/decode_yunwen_unilm_for_seq2seq.py \ 11 | --model_type=$MODEL_TYPE \ 12 | --model_name_or_path $MODEL_PATH \ 13 | --model_recover_path=$MODEL_RECOVER_PATH \ 14 | --input_file=$DATA_PATH/test.json \ 15 | --split="test" \ 16 | --max_seq_length=512 \ 17 | --do_lower_case \ 18 | --batch_size=32 \ 19 | --beam_size=5 \ 20 | --max_tgt_length=128 -------------------------------------------------------------------------------- /experiments/scripts/finetune_cpm_large_2.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/root/work2/work2/chenzhihao/NLP 2 | DATA_PATH=/root/work2/work2/chenzhihao/datasets/腾讯对话NaturalConv_Release_20210318 3 | OUTPUT_DIR=$PROJECT_PATH/datas/output_dir/CPM-natural_conv/ 4 | LOGGING_DIR=$OUTPUT_DIR/logs 5 | MODEL_PATH="/root/work2/work2/chenzhihao/pretrained_models/CPM-generate" 6 | 7 | PROJECT_NAME='nlp' 8 | EXPERIMENT_NAME='cpm1-natural_conv' 9 | GROUP_NAME='cpm1_generate' 10 | SPEAKER1="用户:" 11 | SPEAKER2="\n机器人:" 12 | 13 | #export 
CUDA_VISIBLE_DEVICES=2,3 14 | #accelerate config 15 | accelerate launch $PROJECT_PATH/experiments/qa_and_text_generation/finetune_cpm_large_2.py \ 16 | --pretrained \ 17 | --model_checkpoint=$MODEL_PATH \ 18 | --config_path=$MODEL_PATH/config.json \ 19 | --tokenizer_path=$MODEL_PATH \ 20 | --data_path=$DATA_PATH \ 21 | --data_type="natural_conv" \ 22 | --output_dir=$OUTPUT_DIR \ 23 | --logging_dir=$LOGGING_DIR \ 24 | --project_name=$PROJECT_NAME \ 25 | --experiment_name=$EXPERIMENT_NAME \ 26 | --group_name=$GROUP_NAME \ 27 | --speaker1=$SPEAKER1 \ 28 | --speaker2=$SPEAKER2 \ 29 | --scheduler="linear" \ 30 | --num_epochs=15 \ 31 | --do_train \ 32 | --do_valid \ 33 | --do_test \ 34 | --train_path="train.txt" \ 35 | --valid_path="dev.txt" \ 36 | --test_path="test.txt" \ 37 | --train_batch_size=4 \ 38 | --valid_batch_size=4 \ 39 | --lr=2e-5 \ 40 | --warmup_steps=2000 \ 41 | --valid_steps=500 \ 42 | --gradient_accumulation_steps=32 \ 43 | --local_rank=0 \ 44 | --mixed_precision='fp16' \ 45 | --seed=2333 \ 46 | --with_tracking \ 47 | --max_seq_length=512 \ 48 | --max_history=10 \ 49 | --do_sample \ 50 | --top_k=0 \ 51 | --top_p=0.9 \ 52 | --temperature=0.75 \ 53 | --output_max_length=256 \ 54 | --output_min_length=2 -------------------------------------------------------------------------------- /experiments/scripts/finetune_cpm_large_accelerate.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/data2/work2/chenzhihao/NLP 2 | DATA_PATH=$PROJECT_PATH/datas/raw_datas/ 3 | OUTPUT_DIR=$PROJECT_PATH/datas/output_dir/CPM-large2/ 4 | LOGGING_DIR=$OUTPUT_DIR/logs 5 | MODEL_PATH="/data2/work2/chenzhihao/pretrained_models/CPM-generate" 6 | 7 | #export CUDA_VISIBLE_DEVICES=2,3 8 | #accelerate configs 9 | accelerate launch $PROJECT_PATH/examples/qa_and_text_generation/finetune_cpm_large_accelerate.py \ 10 | --pretrained \ 11 | --model_checkpoint=$MODEL_PATH \ 12 | --config_path=$MODEL_PATH/config.json \ 13 | --tokenizer_path=$MODEL_PATH \ 14 | --data_path=$DATA_PATH \ 15 | --output_dir=$OUTPUT_DIR \ 16 | --logging_dir=$LOGGING_DIR \ 17 | --scheduler="linear" \ 18 | --num_epochs=15 \ 19 | --train_batch_size=4 \ 20 | --valid_batch_size=4 \ 21 | --lr=2e-5 \ 22 | --warmup_steps=2000 \ 23 | --valid_steps=2000 \ 24 | --gradient_accumulation_steps=32 \ 25 | --local_rank=0 \ 26 | --mixed_precision='fp16' \ 27 | --seed=2333 \ 28 | --do_train \ 29 | --do_valid \ 30 | --do_test \ 31 | --with_tracking \ 32 | --max_seq_length=512 \ 33 | --do_sample \ 34 | --top_k=0 \ 35 | --top_p=0.0 \ 36 | --temperature=1.0 \ 37 | --output_max_length=256 \ 38 | --output_min_length=5 -------------------------------------------------------------------------------- /experiments/scripts/finetune_liadrinz_unilm.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/root/work2/work2/chenzhihao/NLP 2 | DATA_PATH=/root/work2/work2/chenzhihao/datasets/腾讯对话NaturalConv_Release_20210318/corpus.txt 3 | MODEL_TYPE=unilm 4 | MODEL_NAME=/root/work2/work2/chenzhihao/pretrained_models/unilm-chinese-base 5 | #MODEL_NAME=peterchou/unilm-chinese-base 6 | OUTPUT_DIR=$PROJECT_PATH/datas/output_dir/unilm/liadrinz_unilm/seq2seq_on_natural_conv 7 | 8 | export WANDB_DISABLED=true 9 | export CUDA_VISIBLE_DEVICES=7 10 | export OMP_NUM_THREADS=1 11 | python3 -u $PROJECT_PATH/experiments/qa_and_text_generation/finetune_unilm_for_seq2seq_liadrinz.py \ 12 | train \ 13 | --model_type ${MODEL_TYPE} \ 14 | --model_name_or_path ${MODEL_NAME} \ 15 | --batch_size 32 \ 16 | 
--corpus_file $DATA_PATH \ 17 | --max_seq_len 512 \ 18 | --seed 42 \ 19 | --output_dir ${OUTPUT_DIR} \ 20 | --gradient_accumulation_steps 2 \ 21 | --lr=2e-5 \ 22 | --num_train_epochs 5 \ 23 | --mask_prob 0.2 \ 24 | --local_rank=-1 \ 25 | --fp16 -------------------------------------------------------------------------------- /experiments/scripts/finetune_qwen_7b_qlora.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | ROOT_PATH=/home/aiteam/work2/NLP 4 | MODEL_NAME_OR_PATH=/home/aiteam/work/pretrained_models/Qwen-7B-Chat 5 | MODEL_TYPE=qwen-7b-chat #bloom,llama 6 | 7 | DATA_DIR=${ROOT_PATH}/datas/firefly 8 | OUTPUT_DIR=${ROOT_PATH}/datas/output_dir/${MODEL_TYPE}/instruct_qlora 9 | mkdir -p ${OUTPUT_DIR} 10 | 11 | CACHE_DIR=${ROOT_PATH}/datas/hf_cache_dir/${MODEL_TYPE}/instruct_qlora 12 | mkdir -p ${CACHE_DIR} 13 | 14 | CUTOFF_LEN=1024 15 | SYSTEM_PROMPT="" 16 | 17 | # V100不支持lora+fp16 18 | CUDA_VISIBLE_DEVICES="6" torchrun --nproc_per_node 1 ${ROOT_PATH}/experiments/qa_and_txt_generation/finetune_llm_chat.py \ 19 | --model_name_or_path ${MODEL_NAME_OR_PATH} \ 20 | --model_type ${MODEL_TYPE} \ 21 | --use_qlora True \ 22 | --bits 4 \ 23 | --lora_config ${ROOT_PATH}/config/lora_config_llama.json \ 24 | --train_file ${DATA_DIR}/train.json \ 25 | --validation_file ${DATA_DIR}/dev.json \ 26 | --chat_format 'chatml' \ 27 | --source_prefix "human" \ 28 | --target_prefix "assistant" \ 29 | --system_prompt $SYSTEM_PROMPT \ 30 | --per_device_train_batch_size 2 \ 31 | --per_device_eval_batch_size 2 \ 32 | --gradient_accumulation_steps 8 \ 33 | --num_train_epochs 3 \ 34 | --model_max_length ${CUTOFF_LEN} \ 35 | --save_strategy "steps" \ 36 | --save_steps 100 \ 37 | --learning_rate 8e-6 \ 38 | --weight_decay 0.00001 \ 39 | --warmup_ratio 0.05 \ 40 | --lr_scheduler_type "cosine" \ 41 | --logging_steps 50 \ 42 | --logging_strategy "steps" \ 43 | --evaluation_strategy "steps" \ 44 | --eval_steps 100 \ 45 | --fp16 True \ 46 | --seed 1234 \ 47 | --gradient_checkpointing True \ 48 | --cache_dir ${CACHE_DIR} \ 49 | --report_to "all" \ 50 | --output_dir ${OUTPUT_DIR} 51 | # --save_total_limit 5 \ 52 | # --metric_for_best_model "rouge-l" \ 53 | # --predict_with_generate True 54 | # --optim paged_adamw_32bit 55 | -------------------------------------------------------------------------------- /experiments/scripts/finetune_yunwen_unilm.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/root/work2/work2/chenzhihao/NLP 2 | DATA_PATH=/root/work2/work2/chenzhihao/datasets/腾讯对话NaturalConv_Release_20210318 3 | MODEL_PATH=/root/work2/work2/chenzhihao/pretrained_models/torch_unilm_model 4 | OUTPUT_DIR=$PROJECT_PATH/datas/output_dir/unilm/yunwen_unilm/seq2seq_on_natural_conv 5 | LOGGING_DIR=$OUTPUT_DIR/logs 6 | 7 | MODEL_TYPE="unilm" 8 | SOURCE_NAME="source" 9 | TARGET_NAME="target" 10 | 11 | export CUDA_VISIBLE_DEVICES=7 12 | python $PROJECT_PATH/experiments/qa_and_text_generation/finetune_unilm_for_seq2seq_yunwen.py \ 13 | --data_dir $DATA_PATH \ 14 | --model_type=$MODEL_TYPE \ 15 | --model_name_or_path $MODEL_PATH \ 16 | --output_dir $OUTPUT_DIR \ 17 | --log_dir $LOGGING_DIR \ 18 | --src_file="source.json" \ 19 | --source=$SOURCE_NAME \ 20 | --target=$TARGET_NAME \ 21 | --max_seq_length=512 \ 22 | --max_position_embeddings=512 \ 23 | --do_train \ 24 | --do_lower_case \ 25 | --train_batch_size=32 \ 26 | --learning_rate=1e-5 \ 27 | --num_train_epochs=10 \ 28 | --scheduler="linear" \ 29 | --local_rank=-1 \ 30 | 
--gradient_accumulation_steps=1 \ 31 | --seed=2333 \ 32 | --fp16 \ 33 | --fp16_opt_level='O1' 34 | 35 | -------------------------------------------------------------------------------- /experiments/scripts/run_child_tuning_on_ner.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/data/chenzhihao/NLP 2 | export PYTHONPATH=$PYTHONPATH:$PROJECT_PATH/ 3 | CURRENT_DIR=$PROJECT_PATH/experiments 4 | DATA_DIR=$PROJECT_PATH/datas 5 | OUTPUR_DIR=$CURRENT_DIR/output_file_dir 6 | LOG_DIR=$CURRENT_DIR/logs 7 | BERT_BASE_DIR=/data/chenzhihao/chinese-roberta-ext 8 | TASK_NAME="cluener" 9 | MODEL_TYPE="bert" 10 | 11 | python /data/chenzhihao/NLP/experiments/others/child_tuning_on_ner.py \ 12 | --model_name_or_path=$BERT_BASE_DIR \ 13 | --model_type=$MODEL_TYPE \ 14 | --task_name=$TASK_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict_no_tag \ 18 | --data_dir=$DATA_DIR/${TASK_NAME}/ \ 19 | --max_seq_length=256 \ 20 | --per_gpu_train_batch_size=32 \ 21 | --per_gpu_eval_batch_size=32 \ 22 | --learning_rate=3e-5 \ 23 | --crf_learning_rate=3e-3 \ 24 | --num_train_epochs=30.0 \ 25 | --fp16 \ 26 | --fp16_backend=amp \ 27 | --gradient_accumulation_steps=1 \ 28 | --warmup_ratio=0.1 \ 29 | --logging_steps=500 \ 30 | --save_steps=500 \ 31 | --eval_steps=1000 \ 32 | --save_total_limit=10 \ 33 | --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ 34 | --logging_dir=$LOG_DIR/ \ 35 | --overwrite_output_dir \ 36 | --overwrite_cache \ 37 | --seed=42 \ 38 | --cuda_number=0 \ 39 | --markup=bios \ 40 | --reserve_p=0.3 \ 41 | --mode=ChildTuning-D \ 42 | --metric_for_best_model=f1 \ 43 | --greater_is_better -------------------------------------------------------------------------------- /experiments/scripts/run_duee1_sujianlin.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/data/chenzhihao/NLP 2 | export PYTHONPATH=$PYTHONPATH:$PROJECT_PATH/ 3 | CURRENT_DIR=$PROJECT_PATH/experiments 4 | DATA_DIR=$PROJECT_PATH/datas 5 | OUTPUR_DIR=$CURRENT_DIR/output_file_dir 6 | LOG_DIR=$CURRENT_DIR/logs 7 | BERT_BASE_DIR=/data/chenzhihao/chinese-roberta-ext 8 | DATA_FORMAT='duee1' 9 | TASK_NAME="news2" 10 | MODEL_TYPE="bert" # bert, nezha, roformer 11 | CUDA_NUMBERS='1' # '0,1,2,3' 12 | SCHEDULER_TYPE='linear' # ["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"] 13 | 14 | python $CURRENT_DIR/relation_extraction/train_ee.py \ 15 | --model_type=$MODEL_TYPE \ 16 | --model_name_or_path=$BERT_BASE_DIR \ 17 | --data_format=$DATA_FORMAT \ 18 | --task_name=$TASK_NAME \ 19 | --do_train \ 20 | --do_eval \ 21 | --evaluate_during_training \ 22 | --do_eval_per_epoch \ 23 | --do_predict_tag \ 24 | --do_eval_per_epoch \ 25 | --use_lstm \ 26 | --data_dir=$DATA_DIR/${TASK_NAME}/ \ 27 | --train_max_seq_length=128 \ 28 | --eval_max_seq_length=128 \ 29 | --per_gpu_train_batch_size=32 \ 30 | --per_gpu_eval_batch_size=32 \ 31 | --learning_rate=3e-5 \ 32 | --crf_learning_rate=2e-3 \ 33 | --num_train_epochs=30.0 \ 34 | --fp16 \ 35 | --local_rank -1 \ 36 | --gradient_accumulation_steps=1 \ 37 | --logging_steps=500 \ 38 | --save_steps=500 \ 39 | --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ 40 | --logging_dir=$LOG_DIR/ \ 41 | --overwrite_output_dir \ 42 | --overwrite_cache \ 43 | --seed=2333 \ 44 | --cuda_number=$CUDA_NUMBERS \ 45 | --scheduler_type=$SCHEDULER_TYPE -------------------------------------------------------------------------------- /experiments/scripts/run_finetune_cdail_gpt.sh: 
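Before the next script listing, a note on the LoRA JSON files shown near the top of this dump: conf/lora_config_bloom.json and conf/lora_config_llama.json are the kind of file passed via --lora_config in finetune_qwen_7b_qlora.sh above (that script points at config/lora_config_llama.json, while the tree keeps them under conf/). How experiments/qa_and_text_generation/finetune_llm_chat.py actually consumes those keys is not shown here, so the key-to-argument mapping below is an assumption; only the peft API calls themselves are standard.

import json
from peft import LoraConfig, TaskType

# Hypothetical mapping of the repo's lora_config_*.json keys onto a peft LoraConfig.
with open("conf/lora_config_llama.json", encoding="utf-8") as f:
    cfg = json.load(f)

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=cfg["lora_r"],                            # LoRA rank (assumed mapping)
    lora_alpha=cfg["lora_alpha"],               # scaling factor
    lora_dropout=cfg["lora_dropout"],
    target_modules=cfg["lora_target_modules"],  # q_proj/k_proj/... for LLaMA-style models
    bias="none",
)
# A base causal LM would then be wrapped with peft.get_peft_model(base_model, lora_config).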
-------------------------------------------------------------------------------- 1 | PROJECT_PATH=/root/work2/work2/chenzhihao/NLP 2 | DATA_PATH=$PROJECT_PATH/datas/raw_datas/ 3 | OUTPUT_DIR=$PROJECT_PATH/datas/output_dir/CDail-GPT-QA 4 | MODEL_CHECKPOINT=$PROJECT_PATH/datas/output_dir/CDail-GPT-QA 5 | 6 | python $PROJECT_PATH/experiments/qa_and_text_generation/finetune_cdail_gpt.py \ 7 | --pretrained \ 8 | --model_checkpoint=$MODEL_CHECKPOINT \ 9 | --data_path=$DATA_PATH \ 10 | --output_dir=$OUTPUT_DIR \ 11 | --scheduler="linear" \ 12 | --n_epochs=30 \ 13 | --train_batch_size=12 \ 14 | --valid_batch_size=12 \ 15 | --lr=5e-5 \ 16 | --warmup_steps=5000 \ 17 | --valid_steps=5000 \ 18 | --gradient_accumulation_steps=64 \ 19 | --local_rank=-1 \ 20 | --fp16='01' \ 21 | --fp16_backend='amp' \ 22 | --device='cuda:0' \ 23 | --do_train \ 24 | --do_valid -------------------------------------------------------------------------------- /experiments/scripts/run_finetune_cdail_gpt_2.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/root/work2/work2/chenzhihao/NLP 2 | DATA_PATH=/root/work2/work2/chenzhihao/datasets/腾讯对话NaturalConv_Release_20210318 3 | OUTPUT_DIR=$PROJECT_PATH/datas/output_dir/CDail-GPT 4 | MODEL_CHECKPOINT=/root/work2/work2/chenzhihao/pretrained_models/CDial-GPT_LCCC-large 5 | 6 | python $PROJECT_PATH/experiments/qa_and_text_generation/finetune_cdail_gpt_2.py \ 7 | --pretrained \ 8 | --model_checkpoint=$MODEL_CHECKPOINT \ 9 | --data_path=$DATA_PATH \ 10 | --data_type="natural_conv" \ 11 | --output_dir=$OUTPUT_DIR \ 12 | --scheduler="linear" \ 13 | --n_epochs=30 \ 14 | --max_history=10 \ 15 | --do_train \ 16 | --do_valid \ 17 | --do_test \ 18 | --train_path="train.txt" \ 19 | --valid_path="dev.txt" \ 20 | --test_path="test.txt" \ 21 | --train_batch_size=16 \ 22 | --valid_batch_size=16 \ 23 | --lr=5e-5 \ 24 | --warmup_steps=5000 \ 25 | --valid_steps=500 \ 26 | --gradient_accumulation_steps=1 \ 27 | --local_rank=-1 \ 28 | --fp16='01' \ 29 | --fp16_backend='amp' \ 30 | --device='cuda:1' -------------------------------------------------------------------------------- /experiments/scripts/run_global_pointer_for_ner.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/data/chenzhihao/NLP 2 | export PYTHONPATH=$PYTHONPATH:$PROJECT_PATH/ 3 | CURRENT_DIR=$PROJECT_PATH/experiments 4 | DATA_DIR=$PROJECT_PATH/datas 5 | OUTPUR_DIR=$DATA_DIR/output_file_dir 6 | LOG_DIR=$DATA_DIR/logs 7 | BERT_BASE_DIR=/data/chenzhihao/chinese-roberta-ext 8 | TASK_NAME="cluener" 9 | MODEL_TYPE="bert" 10 | 11 | python $CURRENT_DIR/ner/train_globalpointer.py \ 12 | --model_type=$MODEL_TYPE \ 13 | --model_name_or_path=$BERT_BASE_DIR \ 14 | --task_name=$TASK_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict_no_tag \ 18 | --do_lower_case \ 19 | --data_dir=$DATA_DIR/${TASK_NAME}/ \ 20 | --max_seq_length=256 \ 21 | --sliding_len=100 \ 22 | --per_gpu_train_batch_size=64 \ 23 | --per_gpu_eval_batch_size=64 \ 24 | --learning_rate=2e-5 \ 25 | --num_train_epochs=100 \ 26 | --fp16 \ 27 | --fp16_backend=amp \ 28 | --warmup_ratio=0.1 \ 29 | --local_rank -1 \ 30 | --gradient_accumulation_steps=1 \ 31 | --logging_steps=500 \ 32 | --save_steps=500 \ 33 | --eval_steps=500 \ 34 | --save_total_limit=5 \ 35 | --output_dir=$OUTPUR_DIR/${TASK_NAME}/ \ 36 | --logging_dir=$LOG_DIR/ \ 37 | --overwrite_output_dir \ 38 | --overwrite_cache \ 39 | --seed=2333 \ 40 | --cuda_number=0 \ 41 | --dataloader_num_workers=2 \ 42 | 
--scheduler_type=linear \ 43 | --metric_for_best_model=f1 \ 44 | --greater_is_better \ 45 | --rope \ 46 | --reserve_p=0.3 \ 47 | --mode=ChildTuning-D -------------------------------------------------------------------------------- /experiments/scripts/run_ner_crf.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/data/chenzhihao/NLP 2 | export PYTHONPATH=$PYTHONPATH:$PROJECT_PATH/ 3 | CURRENT_DIR=$PROJECT_PATH/experiments 4 | DATA_DIR=$PROJECT_PATH/datas 5 | OUTPUR_DIR=$CURRENT_DIR/output_file_dir 6 | LOG_DIR=$CURRENT_DIR/logs 7 | BERT_BASE_DIR=/data/chenzhihao/chinese-roberta-ext 8 | TASK_NAME="cluener" 9 | MODEL_TYPE="bert" # bert,nezha,albert,roformer 10 | 11 | python /data/chenzhihao/NLP/experiments/ner/run_ner_crf.py \ 12 | --model_type=$MODEL_TYPE \ 13 | --model_name_or_path=$BERT_BASE_DIR \ 14 | --task_name=$TASK_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict_no_tag \ 18 | --do_adv \ 19 | --data_dir=$DATA_DIR/${TASK_NAME}/ \ 20 | --max_seq_length=256 \ 21 | --per_gpu_train_batch_size=32 \ 22 | --per_gpu_eval_batch_size=32 \ 23 | --learning_rate=3e-5 \ 24 | --crf_learning_rate=2e-3 \ 25 | --num_train_epochs=30.0 \ 26 | --fp16 \ 27 | --fp16_backend=amp \ 28 | --warmup_ratio=0.1 \ 29 | --local_rank -1 \ 30 | --gradient_accumulation_steps=1 \ 31 | --logging_steps=500 \ 32 | --save_steps=500 \ 33 | --eval_steps=1000 \ 34 | --output_dir=$OUTPUR_DIR/${TASK_NAME}/ \ 35 | --logging_dir=$LOG_DIR/ \ 36 | --overwrite_output_dir \ 37 | --overwrite_cache \ 38 | --seed=42 \ 39 | --cuda_number=0 \ 40 | --markup=bios \ 41 | --metric_for_best_model=f1 \ 42 | --greater_is_better -------------------------------------------------------------------------------- /experiments/scripts/run_ner_softmax.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/data/chenzhihao/NLP 2 | export PYTHONPATH=$PYTHONPATH:$PROJECT_PATH/ 3 | CURRENT_DIR=$PROJECT_PATH/experiments 4 | DATA_DIR=$PROJECT_PATH/datas 5 | OUTPUR_DIR=$CURRENT_DIR/output_file_dir 6 | LOG_DIR=$CURRENT_DIR/logs 7 | BERT_BASE_DIR=/data/chenzhihao/chinese-roberta-ext 8 | TASK_NAME="cluener" 9 | MODEL_TYPE="bert" # bert,nezha,albert 10 | 11 | python /data/chenzhihao/NLP/experiments/ner/run_ner_softmax.py \ 12 | --model_type=$MODEL_TYPE \ 13 | --model_name_or_path=$BERT_BASE_DIR \ 14 | --task_name=$TASK_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict_no_tag \ 18 | --data_dir=$DATA_DIR/${TASK_NAME}/ \ 19 | --max_seq_length=256 \ 20 | --per_gpu_train_batch_size=32 \ 21 | --per_gpu_eval_batch_size=32 \ 22 | --learning_rate=3e-5 \ 23 | --crf_learning_rate=2e-3 \ 24 | --num_train_epochs=30.0 \ 25 | --fp16 \ 26 | --fp16_backend=amp \ 27 | --warmup_ratio=0.1 \ 28 | --local_rank -1 \ 29 | --gradient_accumulation_steps=1 \ 30 | --logging_steps=500 \ 31 | --save_steps=500 \ 32 | --eval_steps=1000 \ 33 | --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ 34 | --logging_dir=$LOG_DIR/ \ 35 | --overwrite_output_dir \ 36 | --overwrite_cache \ 37 | --seed=42 \ 38 | --cuda_number=0 \ 39 | --markup=bios \ 40 | --metric_for_best_model=f1 \ 41 | --greater_is_better -------------------------------------------------------------------------------- /experiments/scripts/run_ner_span.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/data/chenzhihao/NLP 2 | export PYTHONPATH=$PYTHONPATH:$PROJECT_PATH/ 3 | CURRENT_DIR=$PROJECT_PATH/experiments 4 | DATA_DIR=$PROJECT_PATH/datas 5 | 
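# Output, log and pretrained-model locations below follow the same layout as the other run_ner_*.sh scripts;
# the span-based NER settings themselves are passed to experiments/ner/run_ner_span.py further down
# (--do_adv below presumably toggles adversarial training).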
OUTPUR_DIR=$CURRENT_DIR/output_file_dir 6 | LOG_DIR=$CURRENT_DIR/logs 7 | BERT_BASE_DIR=/data/chenzhihao/chinese-roberta-ext 8 | #BERT_BASE_DIR="hfl/chinese-roberta-wwm-ext" 9 | TASK_NAME="cluener" 10 | MODEL_TYPE="bert" # bert,nezha,albert 11 | 12 | python /data/chenzhihao/NLP/experiments/ner/run_ner_span.py \ 13 | --model_type=$MODEL_TYPE \ 14 | --model_name_or_path=$BERT_BASE_DIR \ 15 | --task_name=$TASK_NAME \ 16 | --do_train \ 17 | --do_eval \ 18 | --do_predict_no_tag \ 19 | --do_adv \ 20 | --data_dir=$DATA_DIR/${TASK_NAME}/ \ 21 | --max_seq_length=256 \ 22 | --per_gpu_train_batch_size=32 \ 23 | --per_gpu_eval_batch_size=32 \ 24 | --learning_rate=3e-5 \ 25 | --crf_learning_rate=2e-3 \ 26 | --num_train_epochs=30.0 \ 27 | --fp16 \ 28 | --fp16_backend=amp \ 29 | --warmup_ratio=0.1 \ 30 | --local_rank -1 \ 31 | --gradient_accumulation_steps=1 \ 32 | --logging_steps=500 \ 33 | --save_steps=500 \ 34 | --eval_steps=1000 \ 35 | --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ 36 | --logging_dir=$LOG_DIR/ \ 37 | --overwrite_output_dir \ 38 | --overwrite_cache \ 39 | --seed=42 \ 40 | --cuda_number=0 \ 41 | --markup=bios \ 42 | --metric_for_best_model=f1 \ 43 | --greater_is_better -------------------------------------------------------------------------------- /experiments/scripts/run_pair_sup_con.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/root/work2/work2/chenzhihao/NLP 2 | DATA_PATH=/root/work2/work2/chenzhihao/datasets/chinese-semantics-match-dataset/ 3 | OUTPUT_DIR=$PROJECT_PATH/experiments/output_file_dir/semantic_match 4 | MODEL_PATH="/root/work2/work2/chenzhihao/pretrained_models/chinese-roberta-wwm-ext" 5 | 6 | PROJECT_NAME='semantic_match' 7 | EXPERIMENT_NAME='atec-pairsupcon-roberta-wwm-ext' 8 | GROUP_NAME='nlp' 9 | MODEL_TYPE='roberta-wwm-ext' 10 | DATA_TYPE='ATEC' 11 | 12 | 13 | python $PROJECT_PATH/experiments/sentence_embedding/run_pair_sup_con.py \ 14 | --model_type=$MODEL_TYPE \ 15 | --model_name_or_path=$MODEL_PATH \ 16 | --output_dir=$OUTPUT_DIR \ 17 | --project_name=$PROJECT_NAME \ 18 | --group_name=$GROUP_NAME \ 19 | --experiment_name=$EXPERIMENT_NAME \ 20 | --data_dir=$DATA_PATH \ 21 | --data_type=$DATA_TYPE \ 22 | --do_train \ 23 | --do_valid \ 24 | --do_test \ 25 | --train_dataset='train.txt' \ 26 | --valid_dataset='dev.txt' \ 27 | --test_dataset='test.txt' \ 28 | --max_seq_length=128 \ 29 | --lr_rate=5e-05 \ 30 | --lr_scale 100 \ 31 | --gradient_accumulation_steps=1 \ 32 | --scheduler_type='linear' \ 33 | --train_batch_size=64 \ 34 | --valid_batch_size=64 \ 35 | --num_train_epochs=200 \ 36 | --gpuid=0 \ 37 | --seed=2333 \ 38 | --num_worker=0 \ 39 | --num_labels=2 \ 40 | --temperature=0.05 \ 41 | --task_type='pairsupcon' \ 42 | --contrast_type="HardNeg" \ 43 | --beta=1 \ 44 | --fp16 45 | -------------------------------------------------------------------------------- /experiments/scripts/run_sbert_training_dynamics.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/root/work2/work2/chenzhihao/NLP 2 | export PYTHONPATH=$PYTHONPATH:$PROJECT_PATH/ 3 | CURRENT_DIR=$PROJECT_PATH/experiments 4 | DATA_DIR=/root/work2/work2/chenzhihao/datasets/chinese-semantics-match-dataset/ 5 | OUTPUT_DIR=$CURRENT_DIR/output_file_dir/semantic_match 6 | LOG_PATH=$OUTPUT_DIR/dy_logs/ 7 | 8 | MODLE_TYPE='roberta-wwm' 9 | MODLE_NAME_OR_PATH=/root/work2/work2/chenzhihao/pretrained_models/chinese-roberta-wwm-ext 10 | DATA_TYPE='BQ' 11 | OBJECT_TYPE='classification' 12 | TASK_TYPE='match' 13 | 
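# $LOG_PATH is where the per-example training-dynamics records are expected to land;
# single_test/data_selection_for_training_dynamics.py later reads them from a
# dy_logs/<DATA_TYPE>/<MODLE_TYPE>/ folder to carve the data into easy/hard/ambiguous subsets.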
POOLING_STRATEGY='first-last-avg' 14 | PROJECT_NAME='sup-sbert' 15 | EXPERIMENT_NAME='sbert-training-dynamics' 16 | GROUP_NAME='semantic_match' 17 | 18 | python $CURRENT_DIR/sentence_embedding/train_sentence_bert_training_dynamics.py \ 19 | --model_type $MODLE_TYPE \ 20 | --model_name_or_path $MODLE_NAME_OR_PATH \ 21 | --data_dir $DATA_DIR \ 22 | --output_dir $OUTPUT_DIR \ 23 | --dy_log_path $LOG_PATH \ 24 | --data_type $DATA_TYPE \ 25 | --task_type $TASK_TYPE \ 26 | --object_type $OBJECT_TYPE \ 27 | --pooling_strategy $POOLING_STRATEGY \ 28 | --project_name $PROJECT_NAME \ 29 | --experiment_name $EXPERIMENT_NAME \ 30 | --group_name $GROUP_NAME \ 31 | --do_train \ 32 | --do_valid \ 33 | --do_test \ 34 | --do_recording \ 35 | --max_seq_length=128 \ 36 | --num_train_epochs=32 \ 37 | --valid_batch_size=32 \ 38 | --test_batch_size=32 \ 39 | --valid_steps=500 \ 40 | --num_labels=2 \ 41 | --lr_rate=2e-5 \ 42 | --gradient_accumulation_steps=1 \ 43 | --scheduler_type='linear' \ 44 | --num_workers=0 \ 45 | --cuda_number=7 46 | -------------------------------------------------------------------------------- /experiments/scripts/run_spn4re.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/data/chenzhihao/NLP 2 | export PYTHONPATH=$PYTHONPATH:$PROJECT_PATH/ 3 | CURRENT_DIR=$PROJECT_PATH/experiments 4 | DATA_DIR=$PROJECT_PATH/datas 5 | OUTPUR_DIR=$CURRENT_DIR/output_file_dir 6 | LOG_DIR=$CURRENT_DIR/logs 7 | BERT_BASE_DIR=/data/chenzhihao/chinese-roberta-ext 8 | TASK_NAME="spnre_business_chance" 9 | MODEL_TYPE="bert" # bert,nezha,albert,roformer 10 | 11 | python /data/chenzhihao/NLP/experiments/relation_extraction/train_spn4re.py \ 12 | --model_type=$MODEL_TYPE \ 13 | --model_name_or_path=$BERT_BASE_DIR \ 14 | --task_name=$TASK_NAME \ 15 | --do_train \ 16 | --do_eval \ 17 | --do_predict_no_tag \ 18 | --data_dir=$DATA_DIR/${TASK_NAME}/ \ 19 | --max_seq_length=512 \ 20 | --per_gpu_train_batch_size=16 \ 21 | --per_gpu_eval_batch_size=16 \ 22 | --learning_rate=3e-5 \ 23 | --crf_learning_rate=2e-3 \ 24 | --num_train_epochs=100.0 \ 25 | --fp16 \ 26 | --fp16_backend=amp \ 27 | --warmup_ratio=0.1 \ 28 | --local_rank -1 \ 29 | --gradient_accumulation_steps=1 \ 30 | --logging_steps=500 \ 31 | --save_steps=500 \ 32 | --eval_steps=500 \ 33 | --output_dir=$OUTPUR_DIR/${TASK_NAME}/ \ 34 | --logging_dir=$LOG_DIR/ \ 35 | --overwrite_output_dir \ 36 | --overwrite_cache \ 37 | --seed=2333 \ 38 | --cuda_number=0 \ 39 | --markup=bios \ 40 | --metric_for_best_model=f1 \ 41 | --greater_is_better \ 42 | --sliding_len=20 \ 43 | --relation_labels="BUSEXP,ORGFIN,PERUP,STRCOO" \ 44 | --num_generated_tuples=10 \ 45 | --num_entities_in_tuple=8 \ 46 | --allow_null_entities_in_tuple="0,0,1,1,1,1,1,1" \ 47 | --entity_loss_weight="2,2,2,2,2,2,2,2" 48 | -------------------------------------------------------------------------------- /experiments/scripts/run_unsup_vascl.sh: -------------------------------------------------------------------------------- 1 | PROJECT_PATH=/root/work2/work2/chenzhihao/NLP 2 | DATA_PATH=/root/work2/work2/chenzhihao/datasets/chinese-semantics-match-dataset/ 3 | OUTPUT_DIR=$PROJECT_PATH/experiments/output_file_dir/semantic_match 4 | MODEL_PATH="/root/work2/work2/chenzhihao/pretrained_models/chinese-roberta-wwm-ext" 5 | 6 | PROJECT_NAME='semantic_match' 7 | EXPERIMENT_NAME='sts-b-unsup_vascl-roberta-wwm-ext' 8 | GROUP_NAME='nlp' 9 | MODEL_TYPE='roberta-wwm-ext' 10 | DATA_TYPE='STS-B' 11 | 12 | python 
$PROJECT_PATH/experiments/sentence_embedding/run_unsup_vascl.py \ 13 | --model_type=$MODEL_TYPE \ 14 | --model_name_or_path=$MODEL_PATH \ 15 | --output_dir=$OUTPUT_DIR \ 16 | --project_name=$PROJECT_NAME \ 17 | --group_name=$GROUP_NAME \ 18 | --experiment_name=$EXPERIMENT_NAME \ 19 | --data_dir=$DATA_PATH \ 20 | --data_type=$DATA_TYPE \ 21 | --do_train \ 22 | --do_valid \ 23 | --do_test \ 24 | --max_seq_length=64 \ 25 | --lr_rate=2e-5 \ 26 | --lr_scale=1000 \ 27 | --gradient_accumulation_steps=1 \ 28 | --scheduler_type='linear' \ 29 | --train_batch_size=256 \ 30 | --valid_batch_size=128 \ 31 | --num_train_epochs=100 \ 32 | --gpuid=2 \ 33 | --seed=2333 \ 34 | --num_worker=0 \ 35 | --temperature=0.05 \ 36 | --topk=16 \ 37 | --eps=15 38 | -------------------------------------------------------------------------------- /experiments/single_test/argument_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: argument_test 6 | Author: czh 7 | Create Date: 2021/11/11 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | from dataclasses import dataclass, field 13 | from transformers import HfArgumentParser 14 | 15 | from nlp.arguments import TrainingArguments, ModelArguments, DataArguments 16 | 17 | 18 | @dataclass 19 | class MyArgument: 20 | early_stop: bool = field(default=False) 21 | patience: int = field(default=5, metadata={"help": "早停的轮数"}) 22 | 23 | 24 | parser = HfArgumentParser((DataArguments, ModelArguments, TrainingArguments, MyArgument)) 25 | parser.print_help() 26 | args = parser.parse_args() 27 | # print(args.patience) 28 | args.device = 'cpu' 29 | print(args.device) 30 | -------------------------------------------------------------------------------- /experiments/single_test/bart_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: bart_test 6 | Author: czh 7 | Create Date: 2022/1/25 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | import torch 13 | from transformers import MBart50TokenizerFast,MBartForConditionalGeneration 14 | 15 | device = torch.device('cpu') 16 | long_texts = """《"十四五"现代综合交通运输体系发展规划》(以下简称《规划》)日前向社会公布。《规划》明确,到2025年,综合交通运输基本实现一体化融合发展,智能化、绿色化取得实质性突破,综合能力、服务品质、运行效率和整体效益显著提升,交通运输发展向世界一流水平迈进。 17 | 展望2035年,"全国123出行交通圈"(都市区1小时通勤、城市群2小时通达、全国主要城市3小时覆盖)和"全球123快货物流圈"(快货国内1天送达、周边国家2天送达、全球主要城市3天送达)基本形成,基本建成交通强国。 18 | 交通建设更提升 19 | "十四五"时期,将新增铁路营业里程1.9万公里、公路通车里程30.2万公里,民用运输机场将超270个 20 | 交通运输是国民经济中的基础性、先导性、战略性产业,是重要的服务性行业和现代化经济体系的重要组成部分,是构建新发展格局的重要支撑和服务人民美好生活、促进共同富裕的坚实保障。 21 | "十三五"时期,我国综合交通网络总里程突破600万公里,"十纵十横"综合运输大通道基本贯通,高速铁路运营里程翻一番,高速公路对20万人口以上城市覆盖率超过98%,民用运输机场覆盖92%左右的地级市,超大特大城市轨道交通加快成网……综合交通运输体系建设取得了历史性成就。 22 | "‘十四五’时期是加快推进交通强国建设、构建现代综合交通运输体系的关键五年。"国家发展改革委基础设施发展司司长罗国三说。 23 | 《规划》明确,"十四五"时期,我国将新增铁路营业里程1.9万公里、公路通车里程30.2万公里、内河高等级航道里程2400公里,民用运输机场达到270个以上,城市轨道交通运营里程达到1万公里左右,高速铁路网对50万人口以上城市覆盖率达到95%以上,普速铁路瓶颈路段基本消除,"71118"国家高速公路主线基本贯通,现代化机场体系基本形成,综合立体交通网的规模、能力、质量和运行效率持续提升。 24 | "现代化高质量综合立体交通网是经济社会发展的基础支撑。《规划》提出,要构建完善以‘十纵十横’综合运输大通道为骨干,以综合交通枢纽为支点,以快速网、干线网、基础网多层次网络为依托的综合交通网络,勾画好美丽中国‘交通工笔画’。"罗国三说。 25 | 为完善综合运输大通道,《规划》提出,打造沿海、沿江、沿边、出疆、入藏和西部陆海新通道等6条战略骨干通道,建设多层级一体化综合交通枢纽。 26 | 百姓出行更便利 27 | 着力填补西部铁路"留白",推动西部地区普通国道二级及以上公路比重达70% 28 | 
人民交通为人民。"十四五"时期,百姓出行有望享受更多新便利。 29 | 均衡性提升。根据《规划》,将推动西部地区普通国道二级及以上公路比重达70%,推动较大人口规模自然村(组)通硬化路比例超过85%;加快城乡客运一体化发展,持续巩固拓展具备条件的乡镇和建制村通客车成果;进一步改善轮渡通行条件,方便边远地区群众日常出行;提升农村物流服务水平,到2025年,在全国推广100个左右农村物流服务品牌。 30 | 西部铁路"留白"也将被着力填补。国铁集团发展改革部副主任赵长江介绍,"十四五"期间,将统筹推进中西部地区铁路网建设,加强边疆地区铁路网建设,提高革命老区、民族地区和欠发达地区铁路网络密度。将着力构建多向入藏通道、完善出疆对外通道,加快推动新藏铁路前期工作,创造条件启动重点路段建设。推进青藏铁路升级改造,开展滇藏铁路重点路段前期研究。推动疆内铁路环起来,进出疆铁路畅起来。提高重点边境地区铁路覆盖,提升东北边境地区路网质量,完善西南边境地区路网布局。 31 | 便利性提升。《规划》提出,要完善枢纽集疏运体系,到2025年,实现沿海港口重要港区铁路进港率超过70%,枢纽机场轨道交通接入率达80%。加快发展旅客联程运输,将在50个城市组织开展旅客联程运输试点。普及道路客运电子客票应用,努力推进一站购票、一票通行。 32 | "预计2025年底,全国铁路营业里程将达16.5万公里左右,其中高速铁路(含部分城际铁路)5万公里左右、覆盖95%以上的50万人口以上城市,基本形成‘全国123高铁出行圈’,更好满足人们美好出行需要。"赵长江说。 33 | 经济性提升。《规划》指出,要持续优化运输结构,提高综合交通运输网络效率。规范交通运输新业态、新模式价格管理,健全巡游出租汽车价格形成机制,深化道路客运价格市场化改革。落实物流减税降费措施。 34 | 多样性提升。《规划》要求,引导和规范网约车、共享单车、汽车分时租赁健康发展。加快运输旅游融合发展,鼓励道路客运站拓展旅游集散服务功能。""" 35 | 36 | tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt") 37 | model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt") 38 | model.to(device) 39 | 40 | article_input_ids = tokenizer.batch_encode_plus([long_texts], return_tensors='pt', 41 | max_length=1024)['input_ids'].to(device) 42 | summary_ids = model.generate(article_input_ids, num_beams=4, length_penalty=2.0, max_length=142, min_length=56, 43 | no_repeat_ngram_size=3) 44 | 45 | summary_txt = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True) 46 | print(summary_txt) 47 | -------------------------------------------------------------------------------- /experiments/single_test/bert_crop_model_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: bert_crop_model_test 6 | Author: czh 7 | Create Date: 2021/8/10 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | from transformers import BertConfig, BertPreTrainedModel, BertModel 13 | from nlp.models.bertcrop import BertCropModel, save_specify_num_hidden_layers_state 14 | 15 | 16 | bert_model_path = "/Users/czh/Downloads/chinese_bert_wwm" 17 | bert_config = BertConfig.from_json_file(bert_model_path+'/config.json') 18 | bert_config.num_hidden_layers = 1 19 | 20 | # state_dict = torch.load(bert_model_path+'/pytorch_model.bin') 21 | # bert_model.init_from_pretrained(state_dict) 22 | 23 | 24 | class MyModel(BertPreTrainedModel): 25 | def __init__(self, config, pretrained_bert_path): 26 | super(MyModel, self).__init__(config) 27 | 28 | self.bert = BertCropModel(config) 29 | # state_dict = torch.load(pretrained_bert_path+'/pytorch_model.bin') 30 | # init_from_pretrained(self.bert, state_dict, True) 31 | 32 | 33 | bert_model_ = BertModel.from_pretrained(bert_model_path) 34 | save_specify_num_hidden_layers_state(bert_model_, [1], "./pytorch_model_0_layer.bin") 35 | 36 | bert_model = MyModel.from_pretrained("./pytorch_model_0_layer.bin", config=bert_config, pretrained_bert_path=bert_model_path) 37 | 38 | for n, p in bert_model.named_parameters(): 39 | print(n) 40 | -------------------------------------------------------------------------------- /experiments/single_test/chatglm_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | -------------------------------- 5 | Author:czh 6 | date:2023/3/24 7 | -------------------------------- 8 | """ 9 | from 
transformers import AutoModel, AutoTokenizer 10 | 11 | device = "cuda:3" 12 | model_path = "/root/work2/work2/chenzhihao/pretrained_models/chatglm-6b" 13 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, revision=True) 14 | # model = AutoModel.from_pretrained(model_path, trust_remote_code=True, revision=True).half().to('mps') 15 | model = AutoModel.from_pretrained(model_path, trust_remote_code=True, revision=True).half().to(device) 16 | model = model.eval() 17 | 18 | history = [] 19 | while True: 20 | query = input("\nuser(q to stop): ") 21 | if query.strip() == 'q': 22 | break 23 | 24 | response, history = model.chat(tokenizer, query.strip(), history=history) 25 | print("\nresponse: ", response) 26 | -------------------------------------------------------------------------------- /experiments/single_test/data_selection_for_training_dynamics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: zhihao.chen@kuwo.cn 6 | @date: 2022/9/21 19:22 7 | """ 8 | # 根据metric来选择不同的子训练集 9 | # 参考自:https://github.com/beyondguo/TrainingDynamics/blob/master/data_selection.py 10 | # Only applied to training set 11 | # python data_selection.py --task_name qnli --model_name bert-base-cased --proportion 0.5 --burn_out 4 12 | import json 13 | import random 14 | 15 | random.seed(1) 16 | import argparse 17 | 18 | from train_dynamics_filtering import read_training_dynamics, compute_train_dy_metrics 19 | 20 | 21 | class Config: 22 | task_name = 'BQ' 23 | model_name = "roberta-wwm" 24 | proportion = 0.33 25 | burn_out = 5 26 | 27 | 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument("--task_name", type=str) 30 | parser.add_argument("--model_name", type=str) 31 | parser.add_argument("--proportion", type=float, default=0.33) 32 | parser.add_argument("--burn_out", type=int) 33 | # args = parser.parse_args() 34 | args = Config() 35 | 36 | TASK_NAME = args.task_name 37 | MODEL = args.model_name 38 | PROPORTION = args.proportion 39 | LOG_PATH = '/root/work2/work2/chenzhihao/NLP/output_file_dir/semantic_match' 40 | 41 | # 读取并合并到一个文件 42 | td = read_training_dynamics(LOG_PATH + f'dy_logs/{TASK_NAME}/{MODEL}/') 43 | # 计算 metrics,转化成一个 dataframe 44 | td_df, _ = compute_train_dy_metrics(td, burn_out=args.burn_out) 45 | 46 | 47 | def consider_ascending_order(filtering_metric: str) -> bool: 48 | """ 49 | Determine if the metric values' sorting order to get the most `valuable` examples for training. 
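    True means sort ascending, i.e. smaller values mark the more `valuable` examples
    (e.g. low confidence -> hard-to-learn); False keeps the largest values first
    (e.g. high variability -> ambiguous).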
50 | """ 51 | if filtering_metric == "variability": 52 | return False 53 | elif filtering_metric == "confidence": 54 | return True 55 | elif filtering_metric == "threshold_closeness": 56 | return False 57 | elif filtering_metric == "forgetfulness": 58 | return False 59 | elif filtering_metric == "correctness": 60 | return True 61 | else: 62 | raise NotImplementedError(f"Filtering based on {filtering_metric} not implemented!") 63 | 64 | 65 | def data_selection(metric, select_worst, proportion, shuffle=True): 66 | ascending = consider_ascending_order(metric) 67 | if select_worst: 68 | ascending = not consider_ascending_order(metric) 69 | sorted_df = td_df.sort_values(by=metric, ascending=ascending) 70 | selected_df = sorted_df.head(n=int(proportion * len(sorted_df))) 71 | indices = list(selected_df['guid']) 72 | if shuffle: 73 | random.shuffle(indices) 74 | return {'indices': indices, 'df': selected_df} 75 | 76 | 77 | """ 78 | 选择hard-to-learn的数据,设置METRIC = 'confidence' 79 | 选择easy-to-learn的数据,设置METRIC = 'confidence', SELECT_WORST = True 80 | 选择ambiguoug的数据,设置METRIC = 'variability' 81 | """ 82 | 83 | three_regions_data_indices = {'hard': data_selection('confidence', False, PROPORTION)['indices'], 84 | 'easy': data_selection('confidence', True, PROPORTION)['indices'], 85 | 'ambiguous': data_selection('variability', False, PROPORTION)['indices']} 86 | 87 | with open(LOG_PATH + f'dy_logs/{TASK_NAME}/{MODEL}/three_regions_data_indices.json', 'w', encoding='utf8') as f: 88 | f.write(json.dumps(three_regions_data_indices, ensure_ascii=False)) 89 | 90 | # 然后可以直接跑glue任务,在选择训练集的时候,使用select函数来指定对应样本即可: 91 | """ e.g. 92 | from datasets import load_dataset 93 | raw_datasets = load_dataset('glue','sst2') 94 | easy_train_set = raw_datasets['train'].select(three_regions_data_indices['easy']) 95 | """ 96 | -------------------------------------------------------------------------------- /experiments/single_test/imagen_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: zhihao.chen@kuwo.cn 6 | @date: 2022/6/17 14:53 7 | """ 8 | import torch 9 | from imagen_pytorch import Unet, Imagen 10 | 11 | device = torch.device('cpu') 12 | # unet for imagen 13 | 14 | unet1 = Unet( 15 | dim=32, 16 | cond_dim=512, 17 | dim_mults=(1, 2, 4, 8), 18 | num_resnet_blocks=3, 19 | layer_attns=(False, True, True, True), 20 | layer_cross_attns=(False, True, True, True) 21 | ) 22 | 23 | unet2 = Unet( 24 | dim=32, 25 | cond_dim=512, 26 | dim_mults=(1, 2, 4, 8), 27 | num_resnet_blocks=(2, 4, 8, 8), 28 | layer_attns=(False, False, False, True), 29 | layer_cross_attns=(False, False, False, True) 30 | ) 31 | 32 | # imagen, which contains the unets above (base unet and super resoluting ones) 33 | 34 | imagen = Imagen( 35 | unets=(unet1, unet2), 36 | image_sizes=(64, 256), 37 | timesteps=1000, 38 | cond_drop_prob=0.1 39 | ).to(device) 40 | 41 | # mock images (get a lot of this) and text encodings from large T5 42 | 43 | text_embeds = torch.randn(4, 256, 768).to(device) 44 | text_masks = torch.ones(4, 256).bool().to(device) 45 | images = torch.randn(4, 3, 256, 256).to(device) 46 | 47 | # feed images into imagen, training each unet in the cascade 48 | 49 | for i in (1, 2): 50 | loss = imagen(images, text_embeds=text_embeds, text_masks=text_masks, unet_number=i) 51 | loss.backward() 52 | 53 | # do the above for many many many many steps 54 | # now you can sample an image based on the text embeddings from the cascading 
ddpm 55 | 56 | images = imagen.sample(texts=[ 57 | 'a whale breaching from afar', 58 | 'young girl blowing out candles on her birthday cake', 59 | 'fireworks with blue and green sparkles' 60 | ], cond_scale=3.) 61 | 62 | print(images.shape) # (3, 3, 256, 256) 63 | -------------------------------------------------------------------------------- /experiments/single_test/inference_liadrinz_unilm_for_seq2seq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: 6 | @date: 2023/2/7 18:29 7 | """ 8 | import os 9 | import sys 10 | 11 | import torch 12 | dirname = os.path.dirname(os.path.abspath(__file__)) 13 | print(dirname) 14 | sys.path.append(os.path.join('/'.join(dirname.split('/')[:-2]))) 15 | from nlp.models.unilm_model_liadrinz import UniLMForConditionalGeneration 16 | from nlp.processors.unilm_liadrinz_processor import DataCollatorForUniLMSeq2Seq, CorpusDataset 17 | from nlp.utils.tokenization_unilm import UniLMTokenizerLiadrinz 18 | 19 | project_path = "/root/work2/work2/chenzhihao/NLP/" 20 | model_name_or_path = project_path + "datas/output_dir/unilm/liadrinz_unilm/seq2seq_on_natural_conv/checkpoint-1500" 21 | device = "cuda:7" 22 | TOP_K = 0 23 | TOP_P = 0.9 24 | TEMPERATURE = 0.7 25 | DO_SAMPLE = True 26 | OUTPUT_MAX_LENGTH = 32 27 | OUTPUT_MIN_LENGTH = 1 28 | PREFIX = "用户:" 29 | POSTFIX = " 机器人:" 30 | 31 | 32 | def interact(tokenizer: UniLMTokenizerLiadrinz, model: UniLMForConditionalGeneration): 33 | history = "" 34 | while True: 35 | raw_text = input("\n输入:") 36 | while not raw_text: 37 | print('Prompt should not be empty!') 38 | raw_text = input("\n输入:") 39 | raw_text = raw_text.strip() 40 | if raw_text == "stop": 41 | break 42 | history += PREFIX + raw_text + "。" + POSTFIX 43 | inputs = tokenizer(history, return_tensors='pt') 44 | for k in inputs: 45 | inputs[k] = inputs[k].to(device) 46 | with torch.no_grad(): 47 | if DO_SAMPLE: 48 | output_ids = model.generate(**inputs, 49 | max_new_tokens=OUTPUT_MAX_LENGTH, 50 | min_length=OUTPUT_MIN_LENGTH, 51 | top_k=TOP_K, 52 | top_p=TOP_P, 53 | temperature=TEMPERATURE, 54 | do_sample=True, 55 | no_repeat_ngram_size=3) 56 | else: 57 | output_ids = model.generate(**inputs, max_new_tokens=OUTPUT_MAX_LENGTH, num_beams=1, length_penalty=0.6) 58 | output_text = tokenizer.decode(output_ids[0]) 59 | result = output_text.split("[SEP]")[1].strip() 60 | print(result) 61 | result = "".join(result.split()) 62 | result = result.split(PREFIX.replace(":", ":"))[0] 63 | print("\n回复:", result) 64 | history += result 65 | print(history) 66 | if len(history) > 512: 67 | history = "" 68 | 69 | 70 | def main(): 71 | tokenizer = UniLMTokenizerLiadrinz.from_pretrained(model_name_or_path) 72 | model = UniLMForConditionalGeneration.from_pretrained(model_name_or_path) 73 | model.to(device) 74 | 75 | interact(tokenizer, model) 76 | 77 | 78 | if __name__ == "__main__": 79 | main() 80 | -------------------------------------------------------------------------------- /experiments/single_test/paddlenlp_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: zhihao.chen@kuwo.cn 6 | @date: 2022/6/7 19:27 7 | """ 8 | from pprint import pprint 9 | from paddlenlp import Taskflow 10 | 11 | schema = ['时间', '选手', '赛事名称'] # Define the schema for entity extraction 12 | ie = Taskflow('information_extraction', schema=schema) 13 | 
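# The Taskflow extractor returns one dict per input text, keyed by the schema labels; with this
# schema the output is expected to look roughly like (field names may vary across paddlenlp versions):
#   [{'时间': [{'text': '2月8日上午', 'start': ..., 'end': ..., 'probability': ...}],
#     '选手': [...], '赛事名称': [...]}]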
pprint(ie("2月8日上午北京冬奥会自由式滑雪女子大跳台决赛中中国选手谷爱凌以188.25分获得金牌!")) 14 | -------------------------------------------------------------------------------- /experiments/single_test/position_embedding_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: position_embedding_test 6 | Author: czh 7 | Create Date: 2021/8/6 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | from transformers import BertTokenizerFast 13 | from nlp.models.model_util import _generate_relative_positions_embeddings 14 | 15 | bert_model_name_or_path = "/data/chenzhihao/chinese-roberta-ext" 16 | tokenizer = BertTokenizerFast.from_pretrained(bert_model_name_or_path) 17 | text = ["俄罗斯卫星网刚刚消息称,美军在喀布尔机场向阿富汗平民开火,已致数人死亡。"] 18 | ids = tokenizer.batch_encode_plus(text, return_tensors='pt', max_length=128, padding="max_length") 19 | # print(ids) 20 | input_ids = ids["input_ids"] 21 | token_type_ids = ids["token_type_ids"] 22 | print(token_type_ids) 23 | 24 | # embedding = PositionEmbedding(128, 768, merge_mode='zero', hierarchical=True, embeddings_initializer='xavier_uniform') 25 | # embedding = SinusoidalPositionEmbedding(output_dim=768, merge_mode='zero') 26 | # embedding = RoFormerSinusoidalPositionalEmbedding(128, 768) 27 | # embedding = RelativePositionEmbedding(128*2+1, 768) 28 | # embedding = RelativePositionEmbeddingT5(input_dim=128*2+1, output_dim=768) 29 | # pos = embedding(input_ids) 30 | 31 | # lm = LM_Mask() 32 | # pos = lm.lm_mask(64, 64) 33 | 34 | # ulm = UniLM_Mask() 35 | # pos = ulm.unilm_mask(token_type_ids, 128-token_type_ids.size(1)) 36 | 37 | pos = _generate_relative_positions_embeddings(seq_length=128, embed_dim=64, max_relative_position=128) 38 | print(pos) 39 | print(pos.size()) 40 | 41 | -------------------------------------------------------------------------------- /experiments/single_test/roformer_sim_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: 6 | @date: 2022/11/18 16:32 7 | """ 8 | # https://github.com/JunnYu/RoFormer_pytorch 9 | import torch 10 | import numpy as np 11 | from roformer import RoFormerForCausalLM, RoFormerConfig 12 | from transformers import BertTokenizer 13 | 14 | 15 | device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') 16 | # 可选以下几个。 17 | # junnyu/roformer_chinese_sim_char_small, junnyu/roformer_chinese_sim_char_base 18 | # junnyu/roformer_chinese_sim_char_ft_small, roformer_chinese_sim_char_ft_base 19 | pretrained_model = "junnyu/roformer_chinese_sim_char_base" 20 | tokenizer = BertTokenizer.from_pretrained(pretrained_model) 21 | config = RoFormerConfig.from_pretrained(pretrained_model) 22 | config.is_decoder = True 23 | config.eos_token_id = tokenizer.sep_token_id 24 | config.pooler_activation = "linear" 25 | model = RoFormerForCausalLM.from_pretrained(pretrained_model, config=config) 26 | model.to(device) 27 | model.eval() 28 | 29 | 30 | def gen_synonyms(text, n=100, k=20): 31 | """ 32 | 含义: 产生sent的n个相似句,然后返回最相似的k个。 33 | 做法:用seq2seq生成,并用encoder算相似度并排序。 34 | """ 35 | # 寻找所有相似的句子 36 | r = [] 37 | inputs1 = tokenizer(text, return_tensors="pt") 38 | for _ in range(n): 39 | inputs1.to(device) 40 | output = tokenizer.batch_decode(model.generate(**inputs1, 41 | top_p=0.95, 42 | do_sample=True, 43 | max_length=128), 44 | 
skip_special_tokens=True)[0].replace(" ", "").replace(text, 45 | "") # 去除空格,去除原始text文本。 46 | r.append(output) 47 | 48 | # 对相似的句子进行排序 49 | r = [i for i in set(r) if i != text and len(i) > 0] 50 | r = [text] + r 51 | inputs2 = tokenizer(r, padding=True, return_tensors="pt") 52 | with torch.no_grad(): 53 | inputs2.to(device) 54 | outputs = model(**inputs2) 55 | Z = outputs.pooler_output.cpu().numpy() 56 | Z /= (Z ** 2).sum(axis=1, keepdims=True) ** 0.5 57 | argsort = np.dot(Z[1:], -Z[0]).argsort() 58 | 59 | return [r[i + 1] for i in argsort[:k]] 60 | 61 | 62 | out = gen_synonyms("已经扫码支付,会员季度费18元") 63 | print(out) 64 | -------------------------------------------------------------------------------- /experiments/single_test/roformer_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: roformer_test 6 | Author: czh 7 | Create Date: 2021/9/3 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | import torch 13 | from transformers import RoFormerModel, RoFormerForMaskedLM, RoFormerTokenizer 14 | 15 | 16 | text = "时光流逝,但有些日子注定被永久铭记。在抗战胜利76周年纪念日到来之际,各地举办了形式多样的活动,让人们在回望历史中收获心灵的洗礼、得到思想的升华。河南南阳市当地媒体推出“纪念抗战胜利76周年 山河同在”系列报道,与读者一起回望气壮山河的抗日史诗,凝聚兴我中华的磅礴伟力;甘肃永昌县中小学的“开学第一课”以牢记历史为切入点,通过观看抗战专题视频、抗战知识问答等形式,回顾中国人民艰苦抗战的峥嵘岁月;广西桂林市举办纪念抗战胜利76周年文艺演出活动,百余位党员群众追忆革命先辈的艰辛历程,讴歌永远跟党走的坚定誓言。历史是最好的教科书,也是最好的清醒剂。从1931年日本军国主义的铁蹄蹂躏中国东北的白山黑水,到1945年9月2日,日本代表在无条件投降书上签字,十四年抗战的血与火背后,是3500多万同胞伤亡,930余座城市先后被占,4200万难民无家可归。于民族危难之际,中国共产党支撑起救亡图存的希望。从打破“日军不可战胜”神话的平型关大捷,到粉碎侵略者“囚笼政策”的百团大战;从让日军“名将之花”凋谢在太行山上的黄土岭战斗,到打响华中反攻第一枪的车桥战役,——在中国共产党的领导下,无数不甘屈辱的中华儿女前赴后继,以血肉之躯筑起新的长城,赢得了自1840年鸦片战争以来抗击外敌入侵的第一次完全胜利!为争取世界和平的伟大事业,作出了永载史册的重大贡献!战争硝烟早已散去,苦难岁月还需铭记,并非是要背着包袱前行,而是只有牢记来时的路,才能走向更远的前方。正如联合国的呼吁:“我们有责任见证苦难永远不再重演,受难者的记忆被永久尊重。”我们永远不会忘记,“名将以身殉国家,愿拼热血卫吾华”的左权,“未惜头颅新故国,甘将热血沃中华”的赵一曼,弹尽后毅然投江的八名抗联女兵,打完最后一粒子弹后壮烈跳崖的狼牙山五壮士……岁月长河,历史足迹不容磨灭;时代变迁,英雄精神熠熠发光。当76年前的历史场景在这一天再次重现,当战争创伤在和平年代只能靠记忆的方式还原,每一个中华儿女都已然在心中默默地葆有一份肃穆与庄重。" 17 | model_name = "junnyu/roformer_chinese_base" 18 | tokenizer = RoFormerTokenizer.from_pretrained(model_name) 19 | pt_model = RoFormerForMaskedLM.from_pretrained(model_name) 20 | input_ids = tokenizer(text, return_tensors='pt') 21 | 22 | with torch.no_grad(): 23 | pt_outputs = pt_model(**input_ids).logits[0] 24 | pt_outputs_sentence = "pytorch: " 25 | for i, id in enumerate(tokenizer.encode(text)): 26 | if id == tokenizer.mask_token_id: 27 | tokens = tokenizer.convert_ids_to_tokens(pt_outputs[i].topk(k=5)[1]) 28 | pt_outputs_sentence += "[" + "||".join(tokens) + "]" 29 | else: 30 | pt_outputs_sentence += "".join( 31 | tokenizer.convert_ids_to_tokens([id], skip_special_tokens=True)) 32 | print(pt_outputs_sentence) 33 | 34 | -------------------------------------------------------------------------------- /experiments/single_test/unlim_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: 6 | @date: 2023/2/2 18:58 7 | """ 8 | 9 | # https://github.com/Liadrinz/transformers-unilm 10 | from tqdm import tqdm 11 | 12 | from transformers.trainer_seq2seq import Seq2SeqTrainer 13 | from transformers.training_args import TrainingArguments 14 | 15 | from nlp.utils.tokenization_unilm import UniLMTokenizerLiadrinz as UniLMTokenizer 16 | from nlp.models.unilm_model_liadrinz import UniLMForConditionalGeneration 17 | from 
nlp.processors.unilm_liadrinz_processor import DataCollatorForUniLMSeq2Seq, Seq2SeqDataset 18 | 19 | 20 | # 中文摘要任务生成 21 | news_article = ( 22 | "12月23日,河北石家庄。8岁哥哥轻车熟路哄睡弟弟,姿势标准动作熟练。" 23 | "妈妈杨女士表示:哥哥很喜欢弟弟,因为心思比较细,自己平时带孩子的习惯他都会跟着学习," 24 | "哄睡孩子也都会争着来,技巧很娴熟,两人在一块很有爱,自己感到很幸福,平时帮了自己很大的忙,感恩有这么乖的宝宝。" 25 | ) 26 | 27 | tokenizer = UniLMTokenizer.from_pretrained("Yuang/unilm-base-chinese-news-sum") 28 | model = UniLMForConditionalGeneration.from_pretrained("Yuang/unilm-base-chinese-news-sum") # 在微博新闻摘要数据上fine-tune过的模型 29 | 30 | inputs = tokenizer(news_article, return_tensors="pt") 31 | output_ids = model.generate(**inputs, max_new_tokens=16) 32 | output_text = tokenizer.decode(output_ids[0]) 33 | print(output_text) # "[CLS] [SEP] [SEP]" 34 | news_summary = output_text.split("[SEP]")[1].strip() 35 | print(news_summary) 36 | 37 | # 训练 38 | tokenizer = UniLMTokenizer.from_pretrained("microsoft/unilm-base-cased") 39 | dataset = Seq2SeqDataset(tokenizer, "train.src", "train.tgt", max_src_len=448, max_tgt_len=64) 40 | collator = DataCollatorForUniLMSeq2Seq(tokenizer, mlm=True, mlm_probability=0.7) 41 | model = UniLMForConditionalGeneration.from_pretrained("microsoft/unilm-base-cased") 42 | training_args = TrainingArguments( 43 | output_dir="output_dir", 44 | do_train=True, 45 | per_device_train_batch_size=4, 46 | gradient_accumulation_steps=2, 47 | learning_rate=1e-4, 48 | num_train_epochs=3, 49 | ) 50 | trainer = Seq2SeqTrainer( 51 | model, 52 | args=training_args, 53 | data_collator=collator, 54 | train_dataset=dataset, 55 | tokenizer=tokenizer, 56 | ) 57 | trainer.train() 58 | -------------------------------------------------------------------------------- /license.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Emelyanov Anton 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /nlp/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: __init__.py 6 | Author: czh 7 | Create Date: 2021/9/10 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | -------------------------------------------------------------------------------- /nlp/arguments/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: __init__.py 6 | Author: czh 7 | Create Date: 2021/11/11 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | from nlp.arguments.train_arguments import TrainingArguments 13 | from nlp.arguments.data_arguments import DataArguments 14 | from nlp.arguments.model_arguments import ModelArguments 15 | -------------------------------------------------------------------------------- /nlp/arguments/model_arguments.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: model_arguments 6 | Author: czh 7 | Create Date: 2021/11/11 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | from dataclasses import dataclass, field 13 | from typing import Optional 14 | 15 | 16 | @dataclass 17 | class ModelArguments: 18 | """ 19 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 
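    These are typically parsed together with DataArguments and TrainingArguments via
    HfArgumentParser (see experiments/single_test/argument_test.py for a usage example).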
20 | """ 21 | 22 | model_name_or_path: str = field( 23 | default="hfl/chinese-roberta-wwm-ext", 24 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} 25 | ) 26 | 27 | config_name: Optional[str] = field( 28 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 29 | ) 30 | tokenizer_name: Optional[str] = field( 31 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 32 | ) 33 | cache_dir: Optional[str] = field( 34 | default=None, 35 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 36 | ) 37 | use_fast_tokenizer: bool = field( 38 | default=True, 39 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, 40 | ) 41 | 42 | use_lstm: bool = field( 43 | default=False, 44 | metadata={"help": "Whether or not to use lstm behind lm model"} 45 | ) 46 | dropout_rate: float = field(default=0.5) 47 | crf_learning_rate: float = field(default=3e-3) 48 | model_type: str = field( 49 | default="bert", 50 | metadata={"help": "Specify the encoder type.", "choices": ["bert", "nezha", "roformer", "albert"]} 51 | ) 52 | do_adv: bool = field( 53 | default=False, 54 | metadata={"help": "Whether to adversarial training."} 55 | ) 56 | adv_epsilon: float = field( 57 | default=1.0, 58 | metadata={"help": "Epsilon for adversarial."} 59 | ) 60 | adv_name: str = field(default='word_embeddings', metadata={"help": "name for adversarial layer."}) 61 | soft_label: bool = field(default=False) 62 | loss_type: str = field(default="ce", metadata={"help": "Loss function", "choices": ['lsr', 'focal', 'ce']}) 63 | 64 | # myparams 65 | reserve_p: float = field( 66 | default=1.0, 67 | metadata={"help": "Will use when use child-tuning"} 68 | ) 69 | mode: str = field( 70 | default=None, 71 | metadata={"help": "Specify what mode will be used for Child-Tuning. eg:'ChildTuning-D', 'ChildTuning-F'"} 72 | ) 73 | rdrop_alpha: int = field(default=5, metadata={"help": "Rdrop alpha value, only when use rdrop"}) 74 | rope: bool = field(default=False, metadata={"help": "Whether use RoPositionEmbedding or not"}) 75 | -------------------------------------------------------------------------------- /nlp/callback/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhihao-chen/NLP-experiments/24265c645fcadbc8a3117391823f8cf3b88128c8/nlp/callback/__init__.py -------------------------------------------------------------------------------- /nlp/callback/optimizers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhihao-chen/NLP-experiments/24265c645fcadbc8a3117391823f8cf3b88128c8/nlp/callback/optimizers/__init__.py -------------------------------------------------------------------------------- /nlp/callback/optimizers/novograd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | from torch.optim.optimizer import Optimizer 4 | 5 | 6 | class NovoGrad(Optimizer): 7 | """Implements NovoGrad algorithm. 
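    Reference: Ginsburg et al., "Stochastic Gradient Methods with Layer-wise Adaptive
    Moments for Training of Deep Networks" (arXiv:1905.11286).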
8 | Arguments: 9 | params (iterable): iterable of parameters to optimize or dicts defining 10 | parameter groups 11 | lr (float, optional): learning rate (default: 1e-2) 12 | betas (Tuple[float, float], optional): coefficients used for computing 13 | running averages of gradient and its square (default: (0.95, 0.98)) 14 | eps (float, optional): term added to the denominator to improve 15 | numerical stability (default: 1e-8) 16 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 17 | Example: 18 | model = ResNet() 19 | optimizer = NovoGrad(model.parameters(), lr=1e-2, weight_decay=1e-5) 20 | """ 21 | 22 | def __init__(self, params, lr=0.01, betas=(0.95, 0.98), eps=1e-8, 23 | weight_decay=0, grad_averaging=False): 24 | if lr < 0.0: 25 | raise ValueError("Invalid learning rate: {}".format(lr)) 26 | if not 0.0 <= betas[0] < 1.0: 27 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 28 | if not 0.0 <= betas[1] < 1.0: 29 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 30 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, grad_averaging=grad_averaging) 31 | super().__init__(params, defaults) 32 | 33 | def step(self, closure=None): 34 | loss = None 35 | if closure is not None: 36 | loss = closure() 37 | for group in self.param_groups: 38 | for p in group['params']: 39 | if p.grad is None: 40 | continue 41 | grad = p.grad.data 42 | if grad.is_sparse: 43 | raise RuntimeError('NovoGrad does not support sparse gradients') 44 | state = self.state[p] 45 | g_2 = torch.sum(grad ** 2) 46 | if len(state) == 0: 47 | state['step'] = 0 48 | state['moments'] = grad.div(g_2.sqrt() + group['eps']) + group['weight_decay'] * p.data 49 | state['grads_ema'] = g_2 50 | moments = state['moments'] 51 | grads_ema = state['grads_ema'] 52 | beta1, beta2 = group['betas'] 53 | state['step'] += 1 54 | grads_ema.mul_(beta2).add_(1 - beta2, g_2) 55 | 56 | denom = grads_ema.sqrt().add_(group['eps']) 57 | grad.div_(denom) 58 | # weight decay 59 | if group['weight_decay'] != 0: 60 | decayed_weights = torch.mul(p.data, group['weight_decay']) 61 | grad.add_(decayed_weights) 62 | 63 | # Momentum --> SAG 64 | if group['grad_averaging']: 65 | grad.mul_(1.0 - beta1) 66 | 67 | moments.mul_(beta1).add_(grad) # velocity 68 | 69 | bias_correction1 = 1 - beta1 ** state['step'] 70 | bias_correction2 = 1 - beta2 ** state['step'] 71 | step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 72 | p.data.add_(-step_size, moments) 73 | 74 | return loss 75 | -------------------------------------------------------------------------------- /nlp/callback/optimizers/planradam.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | from torch.optim.optimizer import Optimizer 4 | 5 | 6 | class PlainRAdam(Optimizer): 7 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): 8 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 9 | 10 | super(PlainRAdam, self).__init__(params, defaults) 11 | 12 | def __setstate__(self, state): 13 | super(PlainRAdam, self).__setstate__(state) 14 | 15 | def step(self, closure=None): 16 | 17 | loss = None 18 | if closure is not None: 19 | loss = closure() 20 | 21 | for group in self.param_groups: 22 | 23 | for p in group['params']: 24 | if p.grad is None: 25 | continue 26 | grad = p.grad.data.float() 27 | if grad.is_sparse: 28 | raise RuntimeError('RAdam does not support sparse gradients') 29 | 30 
| p_data_fp32 = p.data.float() 31 | 32 | state = self.state[p] 33 | 34 | if len(state) == 0: 35 | state['step'] = 0 36 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 37 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 38 | else: 39 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 40 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 41 | 42 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 43 | beta1, beta2 = group['betas'] 44 | 45 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 46 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 47 | 48 | state['step'] += 1 49 | beta2_t = beta2 ** state['step'] 50 | n_sma_max = 2 / (1 - beta2) - 1 51 | n_sma = n_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 52 | 53 | if group['weight_decay'] != 0: 54 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 55 | 56 | # more conservative since it's an approximated value 57 | if n_sma >= 5: 58 | step_size = group['lr'] * math.sqrt((1 - beta2_t) * (n_sma - 4) / (n_sma_max - 4) * 59 | (n_sma - 2) / n_sma * 60 | n_sma_max / (n_sma_max - 2)) / (1 - beta1 ** state['step']) 61 | denom = exp_avg_sq.sqrt().add_(group['eps']) 62 | p_data_fp32.addcdiv_(-step_size, exp_avg, denom) 63 | else: 64 | step_size = group['lr'] / (1 - beta1 ** state['step']) 65 | p_data_fp32.add_(-step_size, exp_avg) 66 | 67 | p.data.copy_(p_data_fp32) 68 | 69 | return loss 70 | -------------------------------------------------------------------------------- /nlp/callback/progressbar.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import Dict 3 | 4 | 5 | class ProgressBar(object): 6 | """ 7 | custom progress bar 8 | Example: 9 | pbar = ProgressBar(n_total=30,desc='training') 10 | step = 2 11 | pbar(step=step) 12 | """ 13 | def __init__(self, n_total, width=30, desc='Training'): 14 | self.width = width 15 | self.n_total = n_total 16 | self.start_time = time.time() 17 | self.desc = desc 18 | 19 | def __call__(self, step, info: Dict = None): 20 | now = time.time() 21 | current = step + 1 22 | recv_per = current / self.n_total 23 | bar = f'[{self.desc}] {current}/{self.n_total} [' 24 | if recv_per >= 1: 25 | recv_per = 1 26 | prog_width = int(self.width * recv_per) 27 | if prog_width > 0: 28 | bar += '=' * (prog_width - 1) 29 | if current < self.n_total: 30 | bar += ">" 31 | else: 32 | bar += '=' 33 | bar += '.' 
* (self.width - prog_width) 34 | bar += ']' 35 | show_bar = f"\r{bar}" 36 | time_per_unit = (now - self.start_time) / current 37 | if current < self.n_total: 38 | eta = time_per_unit * (self.n_total - current) 39 | if eta > 3600: 40 | eta_format = ('%d:%02d:%02d' % 41 | (eta // 3600, (eta % 3600) // 60, eta % 60)) 42 | elif eta > 60: 43 | eta_format = '%d:%02d' % (eta // 60, eta % 60) 44 | else: 45 | eta_format = '%ds' % eta 46 | time_info = f' - ETA: {eta_format}' 47 | else: 48 | if time_per_unit >= 1: 49 | time_info = f' {time_per_unit:.1f}s/step' 50 | elif time_per_unit >= 1e-3: 51 | time_info = f' {time_per_unit * 1e3:.1f}ms/step' 52 | else: 53 | time_info = f' {time_per_unit * 1e6:.1f}us/step' 54 | 55 | show_bar += time_info 56 | if len(info) != 0: 57 | show_info = f'{show_bar} ' + \ 58 | "-".join([f' {key}: {value:.4f} ' for key, value in info.items()]) 59 | print(show_info, end='') 60 | else: 61 | print(show_bar, end='') 62 | -------------------------------------------------------------------------------- /nlp/callback/trainingmonitor.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | from typing import Dict 3 | import numpy as np 4 | from pathlib import Path 5 | import matplotlib.pyplot as plt 6 | from nlp.tools.common import load_json 7 | from nlp.tools.common import save_json 8 | 9 | plt.switch_backend('agg') 10 | 11 | 12 | class TrainingMonitor(object): 13 | def __init__(self, file_dir, arch, add_test=False): 14 | """ 15 | 重新开始训练的epoch点 16 | """ 17 | if isinstance(file_dir, Path): 18 | pass 19 | else: 20 | file_dir = Path(file_dir) 21 | file_dir.mkdir(parents=True, exist_ok=True) 22 | 23 | self.arch = arch 24 | self.file_dir = file_dir 25 | self.H = {} 26 | self.add_test = add_test 27 | self.json_path = file_dir / (arch + "_training_monitor.json") 28 | self.paths = {} 29 | 30 | def reset(self, start_at): 31 | if start_at > 0: 32 | if self.json_path is not None: 33 | if self.json_path.exists(): 34 | self.H = load_json(self.json_path) 35 | for k in self.H.keys(): 36 | self.H[k] = self.H[k][:start_at] 37 | 38 | def epoch_step(self, logs: Dict): 39 | for (k, v) in logs.items(): 40 | alist = self.H.get(k, []) 41 | # np.float32会报错 42 | if not isinstance(v, np.float): 43 | v = round(float(v), 4) 44 | alist.append(v) 45 | self.H[k] = alist 46 | 47 | # 写入文件 48 | if self.json_path is not None: 49 | save_json(data=self.H, file_path=self.json_path) 50 | 51 | # 保存train图像 52 | if len(self.H["loss"]) == 1: 53 | self.paths = {key: self.file_dir / (self.arch + f'_{key.upper()}') for key in self.H.keys()} 54 | 55 | if len(self.H["loss"]) > 1: 56 | # 指标变化 57 | # 曲线 58 | # 需要成对出现 59 | keys = [key for key, _ in self.H.items() if '_' not in key] 60 | for key in keys: 61 | array = np.arange(0, len(self.H[key])) 62 | plt.style.use("ggplot") 63 | plt.figure() 64 | plt.plot(array, self.H[key], label=f"train_{key}") 65 | plt.plot(array, self.H[f"valid_{key}"], label=f"valid_{key}") 66 | if self.add_test: 67 | plt.plot(array, self.H[f"test_{key}"], label=f"test_{key}") 68 | plt.legend() 69 | plt.xlabel("Epoch #") 70 | plt.ylabel(key) 71 | plt.title(f"Training {key} [Epoch {len(self.H[key])}]") 72 | plt.savefig(str(self.paths[key])) 73 | plt.close() 74 | -------------------------------------------------------------------------------- /nlp/event_extractor/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | 
File Name: __init__.py 6 | Author: czh 7 | Create Date: 2021/9/23 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | -------------------------------------------------------------------------------- /nlp/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: __init__.py 6 | Author: czh 7 | Create Date: 2021/11/15 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | -------------------------------------------------------------------------------- /nlp/losses/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: __init__ 6 | Author: czh 7 | Create Date: 2021/8/12 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | -------------------------------------------------------------------------------- /nlp/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: __init__ 6 | Author: czh 7 | Create Date: 2021/9/10 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | -------------------------------------------------------------------------------- /nlp/metrics/sematic_match_metric.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: 6 | @date: 2022/6/30 10:54 7 | """ 8 | import numpy as np 9 | import scipy.stats as sta 10 | 11 | 12 | def l2_normalize(vecs): 13 | """标准化 14 | """ 15 | norms = (vecs**2).sum(axis=1, keepdims=True)**0.5 16 | return vecs / np.clip(norms, 1e-8, np.inf) 17 | 18 | 19 | def compute_corrcoef(x, y): 20 | """Spearman相关系数 21 | """ 22 | return sta.spearmanr(x, y).correlation 23 | 24 | 25 | def compute_pearsonr(x, y): 26 | # 输出:(r, p) 27 | # r:相关系数[-1,1]之间 28 | # p:相关系数显著性 29 | # 所有下面的数据选第零位 30 | return sta.pearsonr(x, y)[0] 31 | -------------------------------------------------------------------------------- /nlp/metrics/triplet_distance_metric.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: 6 | @date: 2022/6/30 11:01 7 | """ 8 | # The metric for the triplet loss 9 | from torch.nn import functional as nnf 10 | 11 | 12 | def cosin(x, y): 13 | return 1 - nnf.cosine_similarity(x, y) 14 | 15 | 16 | def euclidean(x, y): 17 | return nnf.pairwise_distance(x, y, p=2) 18 | 19 | 20 | def manhattan(x, y): 21 | return nnf.pairwise_distance(x, y, p=1) 22 | -------------------------------------------------------------------------------- /nlp/models/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: __init__ 6 | Author: czh 7 | Create Date: 2021/8/12 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 
11 | """ 12 | -------------------------------------------------------------------------------- /nlp/models/bert_for_ee.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: bert_for_ee 6 | Author: czh 7 | Create Date: 2021/9/8 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | import torch.nn as nn 13 | 14 | from transformers import BertModel, BertPreTrainedModel, BertConfig, BertTokenizer 15 | 16 | from nlp.models.nezha import NeZhaModel, NeZhaConfig 17 | from nlp.layers.crf import CRF 18 | 19 | 20 | # 参考苏剑林的方法,https://github.com/bojone/lic2020_baselines/blob/master/ee.py 21 | class BertCRFForDuEE1Su(BertPreTrainedModel): 22 | def __init__(self, config, train_config): 23 | super(BertCRFForDuEE1Su, self).__init__(config) 24 | if train_config.model_type == "bert": 25 | self.bert = BertModel(config) 26 | elif train_config.model_type == "nezha": 27 | self.bert = NeZhaModel(config) 28 | else: 29 | raise ValueError("'model_type' must be 'bert' or 'nezha'") 30 | 31 | self.use_lstm = train_config.use_lstm 32 | self.dropout = nn.Dropout(train_config.dropout_rate) 33 | self.classifier = nn.Linear(config.hidden_size, config.num_labels) 34 | self.crf = CRF(num_tags=config.num_labels, batch_first=True) 35 | if self.use_lstm: 36 | self.lstm = nn.LSTM(input_size=config.hidden_size, hidden_size=config.hidden_size // 2, 37 | num_layers=1, bidirectional=True, batch_first=True) 38 | self.init_weights() 39 | 40 | def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): 41 | outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids) 42 | sequence_output = outputs[0] 43 | if self.use_lstm: 44 | sequence_output, _ = self.lstm(sequence_output) 45 | sequence_output = self.dropout(sequence_output) 46 | logits = self.classifier(sequence_output) 47 | outputs = (logits,) 48 | if labels is not None: 49 | loss = -1 * self.crf(emissions=logits, tags=labels, mask=attention_mask) 50 | outputs = (loss,)+outputs 51 | return outputs # (loss), scores 52 | 53 | 54 | MODEL_TYPE_CLASSES = { 55 | "bert": (BertConfig, BertTokenizer, BertCRFForDuEE1Su), 56 | "nezha": (NeZhaConfig, BertTokenizer, BertCRFForDuEE1Su) 57 | } 58 | -------------------------------------------------------------------------------- /nlp/models/idcnn_for_crf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: idcnn_for_crf 6 | Author: czh 7 | Create Date: 2022/2/22 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | import numpy as np 13 | import torch 14 | import torch.nn as nn 15 | from nlp.layers.cnn import IDCNN 16 | from nlp.layers.crf import CRF 17 | from torch.nn import functional as func 18 | 19 | 20 | class IDCNNForCRF(nn.Module): 21 | def __init__(self, 22 | vocab_size, 23 | word_embedding_dim, 24 | word2id, 25 | num_tag, 26 | embedding_file=None, 27 | dropout_rate=0.5, 28 | nil=True): 29 | super(IDCNNForCRF, self).__init__() 30 | self.embedding = nn.Embedding(vocab_size, word_embedding_dim) 31 | self.embedding_file = embedding_file 32 | 33 | self.embedding.weight.data.copy_( 34 | torch.from_numpy( 35 | self.get_embedding(vocab_size, 36 | 
word_embedding_dim, 37 | word2id, 38 | nil))) 39 | self.idcnn = IDCNN(input_size=word_embedding_dim, filters=64) 40 | self.linear = nn.Linear(64, 256) 41 | self.out = nn.Linear(256, num_tag) 42 | 43 | self.crf = CRF(num_tags=num_tag) 44 | self.dropout_layer = nn.Dropout(dropout_rate) 45 | 46 | def forward(self, inputs, length, labels=None): 47 | embeddings = self.embedding(inputs) 48 | embeddings = self.dropout_layer(embeddings) 49 | out = self.idcnn(embeddings, length) 50 | out = self.linear(out) 51 | out = self.out(out) 52 | logits = func.dropout(out, p=0.1, training=self.training) 53 | output = {'logits': logits} 54 | if labels is not None: 55 | loss = -1 * self.crf(emissions=logits, tags=labels) 56 | output["loss"] = loss 57 | return output 58 | 59 | def parse_word_vector(self, word_index, embedding_dim): 60 | pre_trained_wordvector = {} 61 | f = open(self.embedding_file, encoding='utf-8') 62 | fr = f.readlines() 63 | for line in fr[1:]: 64 | lines = line.strip().split(' ') 65 | word = lines[0] 66 | if len(word) == 1: 67 | if word_index.get(word) is not None: 68 | vector = [float(f) for f in lines[1:embedding_dim + 1]] 69 | pre_trained_wordvector[word] = vector 70 | else: 71 | continue 72 | else: 73 | continue 74 | return pre_trained_wordvector 75 | 76 | def get_embedding(self, vocab_size, embedding_dim, word2id, nil=True): 77 | print('Get embedding...') 78 | embedding_matrix = np.zeros((vocab_size, embedding_dim), dtype=np.float32) 79 | if not nil: 80 | pre_trained_wordector = self.parse_word_vector(word2id, embedding_dim) 81 | for word, idx in word2id.items(): 82 | try: 83 | word_vector = pre_trained_wordector[word] 84 | embedding_matrix[id] = word_vector 85 | except: 86 | continue 87 | print('Get embedding done!') 88 | return embedding_matrix 89 | -------------------------------------------------------------------------------- /nlp/processors/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: __init__ 6 | Author: czh 7 | Create Date: 2021/8/17 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | -------------------------------------------------------------------------------- /nlp/processors/predict_process.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: predict_process 6 | Author: czh 7 | Create Date: 2022/2/9 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | # 处理预测结果,例如提取实体和关系 13 | 14 | from typing import Dict, List 15 | 16 | import torch 17 | import numpy as np 18 | 19 | 20 | def global_pointer_entity_extract(pred_logits: torch.Tensor, 21 | id2entity: Dict[int, str], 22 | entity_type_names: dict) -> List[List[dict]]: 23 | batch_size = pred_logits.size(0) 24 | pred_logits = pred_logits.cpu().numpy() 25 | 26 | pred_list = [[] for i in range(batch_size)] 27 | for bs, label_id, start, end in zip(*np.where(pred_logits > 0)): 28 | label = id2entity[label_id] 29 | label_name = entity_type_names[label] 30 | res = {'label': label, 'label_name': label_name, 'start': start, 'end': end} 31 | pred_list[bs].append(res) 32 | 33 | return pred_list 34 | -------------------------------------------------------------------------------- 
/nlp/sentence_transformers/LoggingHandler.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import tqdm 3 | 4 | class LoggingHandler(logging.Handler): 5 | def __init__(self, level=logging.NOTSET): 6 | super().__init__(level) 7 | 8 | def emit(self, record): 9 | try: 10 | msg = self.format(record) 11 | tqdm.tqdm.write(msg) 12 | self.flush() 13 | except (KeyboardInterrupt, SystemExit): 14 | raise 15 | except: 16 | self.handleError(record) -------------------------------------------------------------------------------- /nlp/sentence_transformers/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.3.9" 2 | __DOWNLOAD_SERVER__ = 'https://sbert.net/models/' 3 | from .datasets import SentencesDataset, SentenceLabelDataset, ParallelSentencesDataset 4 | from .LoggingHandler import LoggingHandler 5 | from .SentenceTransformer import SentenceTransformer 6 | from .readers import InputExample 7 | from .cross_encoder.CrossEncoder import CrossEncoder 8 | 9 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/cross_encoder/__init__.py: -------------------------------------------------------------------------------- 1 | from .CrossEncoder import CrossEncoder -------------------------------------------------------------------------------- /nlp/sentence_transformers/cross_encoder/evaluation/CEBinaryClassificationEvaluator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from sklearn.metrics import average_precision_score 3 | from typing import List 4 | import numpy as np 5 | import os 6 | import csv 7 | 8 | from ... import InputExample 9 | from ...evaluation import BinaryClassificationEvaluator 10 | 11 | class CEBinaryClassificationEvaluator: 12 | """ 13 | This evaluator can be used with the CrossEncoder class. 
Given sentence pairs and binary labels (0 and 1), 14 | it compute the average precision and the best possible f1 score 15 | """ 16 | def __init__(self, sentence_pairs: List[List[str]], labels: List[int], name: str=''): 17 | assert len(sentence_pairs) == len(labels) 18 | for label in labels: 19 | assert (label == 0 or label == 1) 20 | 21 | self.sentence_pairs = sentence_pairs 22 | self.labels = np.asarray(labels) 23 | self.name = name 24 | 25 | self.csv_file = "CEBinaryClassificationEvaluator" + ("_" + name if name else '') + "_results.csv" 26 | self.csv_headers = ["epoch", "steps", "Accuracy", "Accuracy_Threshold", "F1", "F1_Threshold", "Precision", "Recall", "Average_Precision"] 27 | 28 | @classmethod 29 | def from_input_examples(cls, examples: List[InputExample], **kwargs): 30 | sentence_pairs = [] 31 | labels = [] 32 | 33 | for example in examples: 34 | sentence_pairs.append(example.texts) 35 | labels.append(example.label) 36 | return cls(sentence_pairs, labels, **kwargs) 37 | 38 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 39 | if epoch != -1: 40 | if steps == -1: 41 | out_txt = " after epoch {}:".format(epoch) 42 | else: 43 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 44 | else: 45 | out_txt = ":" 46 | 47 | logging.info("CEBinaryClassificationEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt) 48 | pred_scores = model.predict(self.sentence_pairs, convert_to_numpy=True, show_progress_bar=False) 49 | 50 | acc, acc_threshold = BinaryClassificationEvaluator.find_best_acc_and_threshold(pred_scores, self.labels, True) 51 | f1, precision, recall, f1_threshold = BinaryClassificationEvaluator.find_best_f1_and_threshold(pred_scores, self.labels, True) 52 | ap = average_precision_score(self.labels, pred_scores) 53 | 54 | logging.info("Accuracy: {:.2f}\t(Threshold: {:.4f})".format(acc * 100, acc_threshold)) 55 | logging.info("F1: {:.2f}\t(Threshold: {:.4f})".format(f1 * 100, f1_threshold)) 56 | logging.info("Precision: {:.2f}".format(precision * 100)) 57 | logging.info("Recall: {:.2f}".format(recall * 100)) 58 | logging.info("Average Precision: {:.2f}\n".format(ap * 100)) 59 | 60 | if output_path is not None: 61 | csv_path = os.path.join(output_path, self.csv_file) 62 | output_file_exists = os.path.isfile(csv_path) 63 | with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 64 | writer = csv.writer(f) 65 | if not output_file_exists: 66 | writer.writerow(self.csv_headers) 67 | 68 | writer.writerow([epoch, steps, acc, acc_threshold, f1, f1_threshold, precision, recall, ap]) 69 | 70 | 71 | return ap -------------------------------------------------------------------------------- /nlp/sentence_transformers/cross_encoder/evaluation/CECorrelationEvaluator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from scipy.stats import pearsonr, spearmanr 3 | from typing import List 4 | import os 5 | import csv 6 | from ... import InputExample 7 | 8 | class CECorrelationEvaluator: 9 | """ 10 | This evaluator can be used with the CrossEncoder class. Given sentence pairs and continuous scores, 11 | it compute the pearson & spearman correlation between the predicted score for the sentence pair 12 | and the gold score. 
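    Example (an illustrative sketch; the model name, the sentence pairs and the scores below are assumptions, not something this class prescribes)::

        from nlp.sentence_transformers.cross_encoder import CrossEncoder
        from nlp.sentence_transformers.readers import InputExample
        from nlp.sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator

        # Pairs annotated with continuous similarity scores (e.g. STS scores scaled to [0, 1])
        dev_examples = [InputExample(texts=['A man is eating.', 'Someone is eating food.'], label=0.9),
                        InputExample(texts=['A man is eating.', 'A plane is landing.'], label=0.1)]

        model = CrossEncoder('bert-base-uncased', num_labels=1)  # single regression output
        evaluator = CECorrelationEvaluator.from_input_examples(dev_examples, name='sts-dev')
        spearman = evaluator(model)  # returns Spearman correlation; pass output_path= to also write the CSV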
13 | """ 14 | def __init__(self, sentence_pairs: List[List[str]], scores: List[float], name: str=''): 15 | self.sentence_pairs = sentence_pairs 16 | self.scores = scores 17 | self.name = name 18 | 19 | self.csv_file = "CECorrelationEvaluator" + ("_" + name if name else '') + "_results.csv" 20 | self.csv_headers = ["epoch", "steps", "Pearson_Correlation", "Spearman_Correlation"] 21 | 22 | @classmethod 23 | def from_input_examples(cls, examples: List[InputExample], **kwargs): 24 | sentence_pairs = [] 25 | scores = [] 26 | 27 | for example in examples: 28 | sentence_pairs.append(example.texts) 29 | scores.append(example.label) 30 | return cls(sentence_pairs, scores, **kwargs) 31 | 32 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 33 | if epoch != -1: 34 | if steps == -1: 35 | out_txt = " after epoch {}:".format(epoch) 36 | else: 37 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 38 | else: 39 | out_txt = ":" 40 | 41 | logging.info("CECorrelationEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt) 42 | pred_scores = model.predict(self.sentence_pairs, convert_to_numpy=True, show_progress_bar=False) 43 | 44 | 45 | eval_pearson, _ = pearsonr(self.scores, pred_scores) 46 | eval_spearman, _ = spearmanr(self.scores, pred_scores) 47 | 48 | logging.info("Correlation:\tPearson: {:.4f}\tSpearman: {:.4f}".format(eval_pearson, eval_spearman)) 49 | 50 | if output_path is not None: 51 | csv_path = os.path.join(output_path, self.csv_file) 52 | output_file_exists = os.path.isfile(csv_path) 53 | with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 54 | writer = csv.writer(f) 55 | if not output_file_exists: 56 | writer.writerow(self.csv_headers) 57 | 58 | writer.writerow([epoch, steps, eval_pearson, eval_spearman]) 59 | 60 | return eval_spearman -------------------------------------------------------------------------------- /nlp/sentence_transformers/cross_encoder/evaluation/CESoftmaxAccuracyEvaluator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import csv 4 | from typing import List 5 | from ... import InputExample 6 | import numpy as np 7 | 8 | class CESoftmaxAccuracyEvaluator: 9 | """ 10 | This evaluator can be used with the CrossEncoder class. 11 | 12 | It is designed for CrossEncoders with 2 or more outputs. It measure the 13 | accuracy of the predict class vs. the gold labels. 
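    Example (an illustrative sketch; the model name, the label scheme and the toy pairs are assumptions, not something this class prescribes)::

        from nlp.sentence_transformers.cross_encoder import CrossEncoder
        from nlp.sentence_transformers.readers import InputExample
        from nlp.sentence_transformers.cross_encoder.evaluation import CESoftmaxAccuracyEvaluator

        # NLI-style pairs with integer class labels, e.g. 0 = contradiction, 1 = entailment, 2 = neutral
        dev_examples = [InputExample(texts=['A man is sleeping.', 'The man is wide awake.'], label=0),
                        InputExample(texts=['A man is eating.', 'Someone is eating.'], label=1)]

        model = CrossEncoder('bert-base-uncased', num_labels=3)  # one logit per class
        evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(dev_examples, name='nli-dev')
        accuracy = evaluator(model)  # argmax over the logits, compared against the gold labels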
14 | """ 15 | def __init__(self, sentence_pairs: List[List[str]], labels: List[int], name: str=''): 16 | self.sentence_pairs = sentence_pairs 17 | self.labels = labels 18 | self.name = name 19 | 20 | self.csv_file = "CESoftmaxAccuracyEvaluator" + ("_" + name if name else '') + "_results.csv" 21 | self.csv_headers = ["epoch", "steps", "Accuracy"] 22 | 23 | @classmethod 24 | def from_input_examples(cls, examples: List[InputExample], **kwargs): 25 | sentence_pairs = [] 26 | labels = [] 27 | 28 | for example in examples: 29 | sentence_pairs.append(example.texts) 30 | labels.append(example.label) 31 | return cls(sentence_pairs, labels, **kwargs) 32 | 33 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 34 | if epoch != -1: 35 | if steps == -1: 36 | out_txt = " after epoch {}:".format(epoch) 37 | else: 38 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 39 | else: 40 | out_txt = ":" 41 | 42 | logging.info("CESoftmaxAccuracyEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt) 43 | pred_scores = model.predict(self.sentence_pairs, convert_to_numpy=True, show_progress_bar=False) 44 | pred_labels = np.argmax(pred_scores, axis=1) 45 | 46 | assert len(pred_labels) == len(self.labels) 47 | 48 | acc = np.sum(pred_labels == self.labels) / len(self.labels) 49 | 50 | logging.info("Accuracy: {:.2f}".format(acc*100)) 51 | 52 | if output_path is not None: 53 | csv_path = os.path.join(output_path, self.csv_file) 54 | output_file_exists = os.path.isfile(csv_path) 55 | with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 56 | writer = csv.writer(f) 57 | if not output_file_exists: 58 | writer.writerow(self.csv_headers) 59 | 60 | writer.writerow([epoch, steps, acc]) 61 | 62 | return acc -------------------------------------------------------------------------------- /nlp/sentence_transformers/cross_encoder/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .CEBinaryClassificationEvaluator import CEBinaryClassificationEvaluator 2 | from .CECorrelationEvaluator import CECorrelationEvaluator 3 | from .CESoftmaxAccuracyEvaluator import CESoftmaxAccuracyEvaluator 4 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/datasets/EncodeDataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | from typing import List, Union 3 | from .. import SentenceTransformer 4 | 5 | 6 | class EncodeDataset(Dataset): 7 | def __init__(self, 8 | sentences: Union[List[str], List[int]], 9 | model: SentenceTransformer, 10 | is_tokenized: bool = True): 11 | """ 12 | EncodeDataset is used by SentenceTransformer.encode method. It just stores 13 | the input texts and returns a tokenized version of it. 14 | """ 15 | self.model = model 16 | self.sentences = sentences 17 | self.is_tokenized = is_tokenized 18 | 19 | 20 | def __getitem__(self, item): 21 | return self.sentences[item] if self.is_tokenized else self.model.tokenize(self.sentences[item]) 22 | 23 | 24 | def __len__(self): 25 | return len(self.sentences) 26 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/datasets/SentencesDataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | from typing import List 3 | import torch 4 | from .. 
import SentenceTransformer 5 | from ..readers.InputExample import InputExample 6 | 7 | class SentencesDataset(Dataset): 8 | """ 9 | Dataset for smart batching, that is each batch is only padded to its longest sequence instead of padding all 10 | sequences to the max length. 11 | The SentenceBertEncoder.smart_batching_collate is required for this to work. 12 | SmartBatchingDataset does *not* work without it. 13 | """ 14 | def __init__(self, 15 | examples: List[InputExample], 16 | model: SentenceTransformer 17 | ): 18 | """ 19 | Create a new SentencesDataset with the tokenized texts and the labels as Tensor 20 | 21 | :param examples 22 | A list of sentence.transformers.readers.InputExample 23 | :param model: 24 | SentenceTransformerModel 25 | """ 26 | self.model = model 27 | self.examples = examples 28 | self.label_type = torch.long if isinstance(self.examples[0].label, int) else torch.float 29 | 30 | 31 | def __getitem__(self, item): 32 | label = torch.tensor(self.examples[item].label, dtype=self.label_type) 33 | if self.examples[item].texts_tokenized is None: 34 | self.examples[item].texts_tokenized = [self.model.tokenize(text) for text in self.examples[item].texts] 35 | 36 | return self.examples[item].texts_tokenized, label 37 | 38 | 39 | def __len__(self): 40 | return len(self.examples) 41 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .sampler import * 2 | from .ParallelSentencesDataset import ParallelSentencesDataset 3 | from .SentenceLabelDataset import SentenceLabelDataset 4 | from .SentencesDataset import SentencesDataset 5 | from .EncodeDataset import EncodeDataset 6 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/datasets/sampler/LabelSampler.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains sampler functions, that can be used to sample mini-batches with specific properties. 3 | """ 4 | from torch.utils.data import Sampler 5 | import numpy as np 6 | from ...datasets import SentenceLabelDataset 7 | 8 | 9 | class LabelSampler(Sampler): 10 | """ 11 | This sampler is used for some specific Triplet Losses like BATCH_HARD_TRIPLET_LOSS 12 | or MULTIPLE_NEGATIVES_RANKING_LOSS which require multiple or only one sample from one label per batch. 13 | 14 | It draws n consecutive, random and unique samples from one label at a time. This is repeated for each label. 15 | 16 | Labels with fewer than n unique samples are ignored. 17 | This also applied to drawing without replacement, once less than n samples remain for a label, it is skipped. 18 | 19 | This *DOES NOT* check if there are more labels than the batch is large or if the batch size is divisible 20 | by the samples drawn per label. 21 | 22 | 23 | """ 24 | def __init__(self, data_source: SentenceLabelDataset, samples_per_label: int = 5, 25 | with_replacement: bool = False): 26 | """ 27 | Creates a LabelSampler for a SentenceLabelDataset. 28 | 29 | :param data_source: 30 | the dataset from which samples are drawn 31 | :param samples_per_label: 32 | the number of consecutive, random and unique samples drawn per label 33 | :param with_replacement: 34 | if this is True, then each sample is drawn at most once (depending on the total number of samples per label). 
35 | if this is False, then one sample can be drawn in multiple draws, but still not multiple times in the same 36 | drawing. 37 | """ 38 | super().__init__(data_source) 39 | self.data_source = data_source 40 | self.samples_per_label = samples_per_label 41 | self.label_range = np.arange(data_source.num_labels) 42 | self.borders = data_source.groups_right_border 43 | self.with_replacement = with_replacement 44 | np.random.shuffle(self.label_range) 45 | 46 | def __iter__(self): 47 | label_idx = 0 48 | count = 0 49 | already_seen = {} 50 | while count < len(self.data_source): 51 | label = self.label_range[label_idx] 52 | if label not in already_seen: 53 | already_seen[label] = set() 54 | 55 | left_border = 0 if label == 0 else self.borders[label-1] 56 | right_border = self.borders[label] 57 | 58 | if self.with_replacement: 59 | selection = np.arange(left_border, right_border) 60 | else: 61 | selection = [i for i in np.arange(left_border, right_border) if i not in already_seen[label]] 62 | 63 | if len(selection) >= self.samples_per_label: 64 | for element_idx in np.random.choice(selection, self.samples_per_label, replace=False): 65 | count += 1 66 | already_seen[label].add(element_idx) 67 | yield element_idx 68 | 69 | label_idx += 1 70 | if label_idx >= len(self.label_range): 71 | label_idx = 0 72 | already_seen = {} 73 | np.random.shuffle(self.label_range) 74 | 75 | def __len__(self): 76 | return len(self.data_source) -------------------------------------------------------------------------------- /nlp/sentence_transformers/datasets/sampler/__init__.py: -------------------------------------------------------------------------------- 1 | from .LabelSampler import * -------------------------------------------------------------------------------- /nlp/sentence_transformers/evaluation/LabelAccuracyEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator 2 | import torch 3 | from torch.utils.data import DataLoader 4 | import logging 5 | from tqdm import tqdm 6 | from ..util import batch_to_device 7 | import os 8 | import csv 9 | 10 | class LabelAccuracyEvaluator(SentenceEvaluator): 11 | """ 12 | Evaluate a model based on its accuracy on a labeled dataset 13 | 14 | This requires a model with LossFunction.SOFTMAX 15 | 16 | The results are written in a CSV. If a CSV already exists, then values are appended. 
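    Example (an illustrative sketch; the model name, the toy pairs and the SoftmaxLoss wiring are assumptions, not something this class prescribes)::

        from torch.utils.data import DataLoader
        from nlp.sentence_transformers import SentenceTransformer, SentencesDataset, losses
        from nlp.sentence_transformers.readers import InputExample
        from nlp.sentence_transformers.evaluation import LabelAccuracyEvaluator

        model = SentenceTransformer('distilbert-base-nli-mean-tokens')
        dev_examples = [InputExample(texts=['A man is eating.', 'Someone is eating.'], label=1),
                        InputExample(texts=['A man is eating.', 'A plane is landing.'], label=0)]
        dev_dataloader = DataLoader(SentencesDataset(dev_examples, model), batch_size=16)

        # Accuracy is computed through the classification head, so the evaluator
        # needs the same SoftmaxLoss module that is used during training.
        train_loss = losses.SoftmaxLoss(model,
                                        sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
                                        num_labels=2)
        evaluator = LabelAccuracyEvaluator(dev_dataloader, name='dev', softmax_model=train_loss)
        accuracy = evaluator(model)  # pass output_path= to also append the result to a CSV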
17 | """ 18 | 19 | def __init__(self, dataloader: DataLoader, name: str = "", softmax_model = None): 20 | """ 21 | Constructs an evaluator for the given dataset 22 | 23 | :param dataloader: 24 | the data for the evaluation 25 | """ 26 | self.dataloader = dataloader 27 | self.name = name 28 | self.softmax_model = softmax_model 29 | 30 | if name: 31 | name = "_"+name 32 | 33 | self.csv_file = "accuracy_evaluation"+name+"_results.csv" 34 | self.csv_headers = ["epoch", "steps", "accuracy"] 35 | 36 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 37 | model.eval() 38 | total = 0 39 | correct = 0 40 | 41 | if epoch != -1: 42 | if steps == -1: 43 | out_txt = " after epoch {}:".format(epoch) 44 | else: 45 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 46 | else: 47 | out_txt = ":" 48 | 49 | logging.info("Evaluation on the "+self.name+" dataset"+out_txt) 50 | self.dataloader.collate_fn = model.smart_batching_collate 51 | for step, batch in enumerate(tqdm(self.dataloader, desc="Evaluating")): 52 | features, label_ids = batch_to_device(batch, model.device) 53 | with torch.no_grad(): 54 | _, prediction = self.softmax_model(features, labels=None) 55 | 56 | total += prediction.size(0) 57 | correct += torch.argmax(prediction, dim=1).eq(label_ids).sum().item() 58 | accuracy = correct/total 59 | 60 | logging.info("Accuracy: {:.4f} ({}/{})\n".format(accuracy, correct, total)) 61 | 62 | if output_path is not None: 63 | csv_path = os.path.join(output_path, self.csv_file) 64 | if not os.path.isfile(csv_path): 65 | with open(csv_path, mode="w", encoding="utf-8") as f: 66 | writer = csv.writer(f) 67 | writer.writerow(self.csv_headers) 68 | writer.writerow([epoch, steps, accuracy]) 69 | else: 70 | with open(csv_path, mode="a", encoding="utf-8") as f: 71 | writer = csv.writer(f) 72 | writer.writerow([epoch, steps, accuracy]) 73 | 74 | return accuracy 75 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/evaluation/MSEEvaluator.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers.evaluation import SentenceEvaluator 2 | import numpy as np 3 | import logging 4 | import os 5 | import csv 6 | from typing import List 7 | 8 | class MSEEvaluator(SentenceEvaluator): 9 | """ 10 | Computes the mean squared error (x100) between the computed sentence embedding 11 | and some target sentence embedding. 12 | 13 | The MSE is computed between ||teacher.encode(source_sentences) - student.encode(target_sentences)||. 14 | 15 | For multilingual knowledge distillation (https://arxiv.org/abs/2004.09813), source_sentences are in English 16 | and target_sentences are in a different language like German, Chinese, Spanish... 17 | 18 | :param source_sentences: Source sentences are embedded with the teacher model 19 | :param target_sentences: Target sentences are ambedding with the student model. 
20 | :param show_progress_bar: Show progress bar when computing embeddings 21 | :param batch_size: Batch size to compute sentence embeddings 22 | :param name: Name of the evaluator 23 | """ 24 | def __init__(self, source_sentences: List[str], target_sentences: List[str], teacher_model = None, show_progress_bar: bool = False, batch_size: int = 32, name: str = ''): 25 | self.source_embeddings = teacher_model.encode(source_sentences, show_progress_bar=show_progress_bar, batch_size=batch_size, convert_to_numpy=True) 26 | 27 | self.target_sentences = target_sentences 28 | self.show_progress_bar = show_progress_bar 29 | self.batch_size = batch_size 30 | self.name = name 31 | 32 | self.csv_file = "mse_evaluation_" + name + "_results.csv" 33 | self.csv_headers = ["epoch", "steps", "MSE"] 34 | 35 | def __call__(self, model, output_path, epoch = -1, steps = -1): 36 | if epoch != -1: 37 | if steps == -1: 38 | out_txt = " after epoch {}:".format(epoch) 39 | else: 40 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 41 | else: 42 | out_txt = ":" 43 | 44 | target_embeddings = model.encode(self.target_sentences, show_progress_bar=self.show_progress_bar, batch_size=self.batch_size, convert_to_numpy=True) 45 | 46 | mse = ((self.source_embeddings - target_embeddings)**2).mean() 47 | mse *= 100 48 | 49 | logging.info("MSE evaluation (lower = better) on "+self.name+" dataset"+out_txt) 50 | logging.info("MSE (*100):\t{:4f}".format(mse)) 51 | 52 | if output_path is not None: 53 | csv_path = os.path.join(output_path, self.csv_file) 54 | output_file_exists = os.path.isfile(csv_path) 55 | with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 56 | writer = csv.writer(f) 57 | if not output_file_exists: 58 | writer.writerow(self.csv_headers) 59 | 60 | writer.writerow([epoch, steps, mse]) 61 | 62 | return -mse #Return negative score as SentenceTransformers maximizes the performance -------------------------------------------------------------------------------- /nlp/sentence_transformers/evaluation/SentenceEvaluator.py: -------------------------------------------------------------------------------- 1 | class SentenceEvaluator: 2 | """ 3 | Base class for all evaluators 4 | 5 | Extend this class and implement __call__ for custom evaluators. 6 | """ 7 | 8 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 9 | """ 10 | This is called during training to evaluate the model. 11 | It returns a score for the evaluation with a higher score indicating a better result. 12 | 13 | :param model: 14 | the model to evaluate 15 | :param output_path: 16 | path where predictions and metrics are written to 17 | :param epoch 18 | the epoch where the evaluation takes place. 19 | This is used for the file prefixes. 20 | If this is -1, then we assume evaluation on test data. 21 | :param steps 22 | the steps in the current epoch at time of the evaluation. 23 | This is used for the file prefixes. 24 | If this is -1, then we assume evaluation at the end of the epoch. 25 | :return: a score for the evaluation with a higher score indicating a better result 26 | """ 27 | pass 28 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/evaluation/SequentialEvaluator.py: -------------------------------------------------------------------------------- 1 | from . 
import SentenceEvaluator 2 | from typing import Iterable 3 | 4 | class SequentialEvaluator(SentenceEvaluator): 5 | """ 6 | This evaluator allows that multiple sub-evaluators are passed. When the model is evaluated, 7 | the data is passed sequentially to all sub-evaluators. 8 | 9 | All scores are passed to 'main_score_function', which derives one final score value 10 | """ 11 | def __init__(self, evaluators: Iterable[SentenceEvaluator], main_score_function = lambda scores: scores[-1]): 12 | self.evaluators = evaluators 13 | self.main_score_function = main_score_function 14 | 15 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 16 | scores = [] 17 | for evaluator in self.evaluators: 18 | scores.append(evaluator(model, output_path, epoch, steps)) 19 | 20 | return self.main_score_function(scores) 21 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/evaluation/SimilarityFunction.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | class SimilarityFunction(Enum): 4 | COSINE = 0 5 | EUCLIDEAN = 1 6 | MANHATTAN = 2 7 | DOT_PRODUCT = 3 8 | 9 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .SentenceEvaluator import SentenceEvaluator 2 | from .SimilarityFunction import SimilarityFunction 3 | 4 | from .BinaryClassificationEvaluator import BinaryClassificationEvaluator 5 | from .EmbeddingSimilarityEvaluator import EmbeddingSimilarityEvaluator 6 | 7 | from .InformationRetrievalEvaluator import InformationRetrievalEvaluator 8 | from .LabelAccuracyEvaluator import LabelAccuracyEvaluator 9 | from .MSEEvaluator import MSEEvaluator 10 | from .MSEEvaluatorFromDataFrame import MSEEvaluatorFromDataFrame 11 | from .ParaphraseMiningEvaluator import ParaphraseMiningEvaluator 12 | from .SequentialEvaluator import SequentialEvaluator 13 | from .TranslationEvaluator import TranslationEvaluator 14 | from .TripletEvaluator import TripletEvaluator 15 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/losses/ContrastiveLoss.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import Iterable, Dict 3 | 4 | import torch.nn.functional as F 5 | from torch import nn, Tensor 6 | 7 | from sentence_transformers.SentenceTransformer import SentenceTransformer 8 | 9 | 10 | class SiameseDistanceMetric(Enum): 11 | """ 12 | The metric for the contrastive loss 13 | """ 14 | EUCLIDEAN = lambda x, y: F.pairwise_distance(x, y, p=2) 15 | MANHATTAN = lambda x, y: F.pairwise_distance(x, y, p=1) 16 | COSINE_DISTANCE = lambda x, y: 1-F.cosine_similarity(x, y) 17 | 18 | 19 | class ContrastiveLoss(nn.Module): 20 | """ 21 | Contrastive loss. Expects as input two texts and a label of either 0 or 1. If the label == 1, then the distance between the 22 | two embeddings is reduced. If the label == 0, then the distance between the embeddings is increased. 23 | 24 | Further information: http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf 25 | 26 | :param model: SentenceTransformer model 27 | :param distance_metric: Function that returns a distance between two emeddings. 
The class SiameseDistanceMetric contains pre-defined metrices that can be used 28 | :param margin: Negative samples (label == 0) should have a distance of at least the margin value. 29 | :param size_average: Average by the size of the mini-batch. 30 | 31 | Example:: 32 | 33 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses 34 | from sentence_transformers.readers import InputExample 35 | 36 | model = SentenceTransformer('distilbert-base-nli-mean-tokens') 37 | train_examples = [InputExample(texts=['This is a positive pair', 'Where the distance will be minimized'], label=1), 38 | InputExample(texts=['This is a negative pair', 'Their distance will be increased'], label=0)] 39 | train_dataset = SentencesDataset(train_examples, model) 40 | train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size) 41 | train_loss = losses.ContrastiveLoss(model=model) 42 | 43 | """ 44 | 45 | def __init__(self, model: SentenceTransformer, distance_metric=SiameseDistanceMetric.COSINE_DISTANCE, margin: float = 0.5, size_average:bool = True): 46 | super(ContrastiveLoss, self).__init__() 47 | self.distance_metric = distance_metric 48 | self.margin = margin 49 | self.model = model 50 | self.size_average = size_average 51 | 52 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 53 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 54 | assert len(reps) == 2 55 | rep_anchor, rep_other = reps 56 | distances = self.distance_metric(rep_anchor, rep_other) 57 | losses = 0.5 * (labels.float() * distances.pow(2) + (1 - labels).float() * F.relu(self.margin - distances).pow(2)) 58 | return losses.mean() if self.size_average else losses.sum() 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/losses/CosineSimilarityLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Iterable, Dict 4 | from ..SentenceTransformer import SentenceTransformer 5 | 6 | 7 | class CosineSimilarityLoss(nn.Module): 8 | """ 9 | CosineSimilarityLoss expects, that the InputExamples consists of two texts and a float label. 10 | 11 | It computes the vectors u = model(input_text[0]) and v = model(input_text[1]) and measures the cosine-similarity between the two. 12 | By default, it minimizes the following loss: ||input_label - cos_score_transformation(cosine_sim(u,v))||_2. 13 | 14 | :param model: SentenceTranformer model 15 | :param loss_fct: Which pytorch loss function should be used to compare the cosine_similartiy(u,v) with the input_label? By default, MSE: ||input_label - cosine_sim(u,v)||_2 16 | :param cos_score_transformation: The cos_score_transformation function is applied on top of cosine_similarity. By default, the identify function is used (i.e. no change). 
17 | 18 | Example:: 19 | 20 | from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses 21 | 22 | model = SentenceTransformer('distilbert-base-nli-mean-tokens') 23 | train_examples = [InputExample(texts=['My first sentence', 'My second sentence'], label=0.8), 24 | InputExample(texts=['Another pair', 'Unrelated sentence'], label=0.3)] 25 | train_dataset = SentencesDataset(train_examples, model) 26 | train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size) 27 | train_loss = losses.CosineSimilarityLoss(model=model) 28 | 29 | 30 | """ 31 | def __init__(self, model: SentenceTransformer, loss_fct = nn.MSELoss(), cos_score_transformation=nn.Identity()): 32 | super(CosineSimilarityLoss, self).__init__() 33 | self.model = model 34 | self.loss_fct = loss_fct 35 | self.cos_score_transformation = cos_score_transformation 36 | 37 | 38 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 39 | embeddings = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 40 | output = self.cos_score_transformation(torch.cosine_similarity(embeddings[0], embeddings[1])) 41 | return self.loss_fct(output, labels.view(-1)) 42 | 43 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/losses/MSELoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | 5 | 6 | class MSELoss(nn.Module): 7 | """ 8 | Computes the MSE loss between the computed sentence embedding and a target sentence embedding. This loss 9 | is used when extending sentence embeddings to new languages as described in our publication 10 | Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation: https://arxiv.org/abs/2004.09813 11 | 12 | For an example, see the documentation on extending language models to new languages. 13 | """ 14 | def __init__(self, model): 15 | super(MSELoss, self).__init__() 16 | self.model = model 17 | self.loss_fct = nn.MSELoss() 18 | 19 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 20 | rep = self.model(sentence_features[0])['sentence_embedding'] 21 | return self.loss_fct(rep, labels) 22 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/losses/OnlineContrastiveLoss.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, Dict 2 | import torch.nn.functional as F 3 | from torch import nn, Tensor 4 | from .ContrastiveLoss import SiameseDistanceMetric 5 | from sentence_transformers.SentenceTransformer import SentenceTransformer 6 | 7 | 8 | class OnlineContrastiveLoss(nn.Module): 9 | """ 10 | Online Contrastive loss. Similar to ConstrativeLoss, but it selects hard positive (positives that are far apart) 11 | and hard negative pairs (negatives that are close) and computes the loss only for these pairs. Often yields 12 | better performances than ConstrativeLoss. 13 | 14 | :param model: SentenceTransformer model 15 | :param distance_metric: Function that returns a distance between two emeddings. The class SiameseDistanceMetric contains pre-defined metrices that can be used 16 | :param margin: Negative samples (label == 0) should have a distance of at least the margin value. 
17 | :param size_average: Average by the size of the mini-batch. 18 | 19 | Example:: 20 | 21 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses 22 | from sentence_transformers.readers import InputExample 23 | 24 | model = SentenceTransformer('distilbert-base-nli-mean-tokens') 25 | train_examples = [InputExample(texts=['This is a positive pair', 'Where the distance will be minimized'], label=1), 26 | InputExample(texts=['This is a negative pair', 'Their distance will be increased'], label=0)] 27 | train_dataset = SentencesDataset(train_examples, model) 28 | train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size) 29 | train_loss = losses.OnlineContrastiveLoss(model=model) 30 | """ 31 | 32 | def __init__(self, model: SentenceTransformer, distance_metric=SiameseDistanceMetric.COSINE_DISTANCE, margin: float = 0.5): 33 | super(OnlineContrastiveLoss, self).__init__() 34 | self.model = model 35 | self.margin = margin 36 | self.distance_metric = distance_metric 37 | 38 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor, size_average=False): 39 | embeddings = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 40 | 41 | distance_matrix = self.distance_metric(embeddings[0], embeddings[1]) 42 | negs = distance_matrix[labels == 0] 43 | poss = distance_matrix[labels == 1] 44 | 45 | # select hard positive and hard negative pairs 46 | negative_pairs = negs[negs < (poss.max() if len(poss) > 1 else negs.mean())] 47 | positive_pairs = poss[poss > (negs.min() if len(negs) > 1 else poss.mean())] 48 | 49 | positive_loss = positive_pairs.pow(2).sum() 50 | negative_loss = F.relu(self.margin - negative_pairs).pow(2).sum() 51 | loss = positive_loss + negative_loss 52 | return loss -------------------------------------------------------------------------------- /nlp/sentence_transformers/losses/TripletLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import torch.nn.functional as F 5 | from enum import Enum 6 | from ..SentenceTransformer import SentenceTransformer 7 | 8 | class TripletDistanceMetric(Enum): 9 | """ 10 | The metric for the triplet loss 11 | """ 12 | COSINE = lambda x, y: 1 - F.cosine_similarity(x, y) 13 | EUCLIDEAN = lambda x, y: F.pairwise_distance(x, y, p=2) 14 | MANHATTAN = lambda x, y: F.pairwise_distance(x, y, p=1) 15 | 16 | class TripletLoss(nn.Module): 17 | """ 18 | This class implements triplet loss. Given a triplet of (anchor, positive, negative), 19 | the loss minimizes the distance between anchor and positive while it maximizes the distance 20 | between anchor and negative. It compute the following loss function: 21 | 22 | loss = max(||anchor - positive|| - ||anchor - negative|| + margin, 0). 23 | 24 | Margin is an important hyperparameter and needs to be tuned respectively. 25 | 26 | For further details, see: https://en.wikipedia.org/wiki/Triplet_loss 27 | 28 | :param model: SentenceTransformerModel 29 | :param distance_metric: Function to compute distance between two embeddings. The class TripletDistanceMetric contains common distance metrices that can be used. 30 | :param triplet_margin: The negative should be at least this much further away from the anchor than the positive. 
31 | 32 | Example:: 33 | 34 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses 35 | from sentence_transformers.readers import InputExample 36 | 37 | model = SentenceTransformer('distilbert-base-nli-mean-tokens') 38 | train_examples = [InputExample(texts=['Anchor 1', 'Positive 1', 'Negative 1']), 39 | InputExample(texts=['Anchor 2', 'Positive 2', 'Negative 2'])] 40 | train_dataset = SentencesDataset(train_examples, model) 41 | train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size) 42 | train_loss = losses.TripletLoss(model=model) 43 | """ 44 | def __init__(self, model: SentenceTransformer, distance_metric=TripletDistanceMetric.EUCLIDEAN, triplet_margin: float = 5): 45 | super(TripletLoss, self).__init__() 46 | self.model = model 47 | self.distance_metric = distance_metric 48 | self.triplet_margin = triplet_margin 49 | 50 | 51 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 52 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 53 | 54 | rep_anchor, rep_pos, rep_neg = reps 55 | distance_pos = self.distance_metric(rep_anchor, rep_pos) 56 | distance_neg = self.distance_metric(rep_anchor, rep_neg) 57 | 58 | losses = F.relu(distance_pos - distance_neg + self.triplet_margin) 59 | return losses.mean() -------------------------------------------------------------------------------- /nlp/sentence_transformers/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .CosineSimilarityLoss import * 2 | from .SoftmaxLoss import * 3 | from .AdvCLSoftmaxLoss import * 4 | from .MultipleNegativesRankingLoss import * 5 | from .TripletLoss import * 6 | from .MSELoss import * 7 | from .ContrastiveLoss import * 8 | from .OnlineContrastiveLoss import * 9 | from .MegaBatchMarginLoss import * 10 | 11 | # Triplet losses 12 | from .BatchHardTripletLoss import * 13 | from .BatchHardSoftMarginTripletLoss import * 14 | from .BatchSemiHardTripletLoss import * 15 | from .BatchAllTripletLoss import * 16 | from .SimSiamLoss import * 17 | from .SimCLRLoss import * -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/ALBERT.py: -------------------------------------------------------------------------------- 1 | from . import Transformer 2 | 3 | class ALBERT(Transformer): 4 | """ 5 | DEPRECATED: Please use models.Transformer instead. 6 | """ 7 | pass 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/BERT.py: -------------------------------------------------------------------------------- 1 | from . import Transformer 2 | 3 | class BERT(Transformer): 4 | """ 5 | DEPRECATED: Please use models.Transformer instead. 6 | """ 7 | pass 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/BoW.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | import logging 8 | import numpy as np 9 | from .tokenizer import WhitespaceTokenizer 10 | 11 | class BoW(nn.Module): 12 | """Implements a Bag-of-Words (BoW) model to derive sentence embeddings. 
13 | 14 | A weighting can be added to allow the generation of tf-idf vectors. The output vector has the size of the vocab. 15 | """ 16 | 17 | def __init__(self, vocab: List[str], word_weights: Dict[str, float] = {}, unknown_word_weight: float = 1, cumulative_term_frequency: bool = True): 18 | super(BoW, self).__init__() 19 | vocab = list(set(vocab)) #Ensure vocab is unique 20 | self.config_keys = ['vocab', 'word_weights', 'unknown_word_weight', 'cumulative_term_frequency'] 21 | self.vocab = vocab 22 | self.word_weights = word_weights 23 | self.unknown_word_weight = unknown_word_weight 24 | self.cumulative_term_frequency = cumulative_term_frequency 25 | 26 | #Maps wordIdx -> word weight 27 | self.weights = [] 28 | num_unknown_words = 0 29 | for word in vocab: 30 | weight = unknown_word_weight 31 | if word in word_weights: 32 | weight = word_weights[word] 33 | elif word.lower() in word_weights: 34 | weight = word_weights[word.lower()] 35 | else: 36 | num_unknown_words += 1 37 | self.weights.append(weight) 38 | 39 | logging.info("{} out of {} words without a weighting value. Set weight to {}".format(num_unknown_words, len(vocab), unknown_word_weight)) 40 | 41 | self.tokenizer = WhitespaceTokenizer(vocab, stop_words=set(), do_lower_case=False) 42 | self.sentence_embedding_dimension = len(vocab) 43 | 44 | 45 | def forward(self, features: Dict[str, Tensor]): 46 | #Nothing to do, everything is done in get_sentence_features 47 | return features 48 | 49 | def tokenize(self, text: str) -> List[int]: 50 | return self.tokenizer.tokenize(text) 51 | 52 | def get_sentence_embedding_dimension(self): 53 | return self.sentence_embedding_dimension 54 | 55 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 56 | vector = np.zeros(self.get_sentence_embedding_dimension(), dtype=np.float32) 57 | for token in tokens: 58 | if self.cumulative_term_frequency: 59 | vector[token] += self.weights[token] 60 | else: 61 | vector[token] = self.weights[token] 62 | 63 | return {'sentence_embedding': torch.tensor([vector], dtype=torch.float)} 64 | 65 | def get_config_dict(self): 66 | return {key: self.__dict__[key] for key in self.config_keys} 67 | 68 | def save(self, output_path): 69 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 70 | json.dump(self.get_config_dict(), fOut, indent=2) 71 | 72 | @staticmethod 73 | def load(input_path): 74 | with open(os.path.join(input_path, 'config.json')) as fIn: 75 | config = json.load(fIn) 76 | 77 | return BoW(**config) -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/CNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import logging 5 | import gzip 6 | from tqdm import tqdm 7 | import numpy as np 8 | import os 9 | import json 10 | from ..util import import_from_string, fullname, http_get 11 | from .tokenizer import WordTokenizer, WhitespaceTokenizer 12 | 13 | 14 | class CNN(nn.Module): 15 | """CNN-layer with multiple kernel-sizes over the word embeddings""" 16 | 17 | def __init__(self, in_word_embedding_dimension: int, out_channels: int = 256, kernel_sizes: List[int] = [1, 3, 5]): 18 | nn.Module.__init__(self) 19 | self.config_keys = ['in_word_embedding_dimension', 'out_channels', 'kernel_sizes'] 20 | self.in_word_embedding_dimension = in_word_embedding_dimension 21 | self.out_channels = out_channels 22 | self.kernel_sizes = 
kernel_sizes 23 | 24 | self.embeddings_dimension = out_channels*len(kernel_sizes) 25 | self.convs = nn.ModuleList() 26 | 27 | in_channels = in_word_embedding_dimension 28 | for kernel_size in kernel_sizes: 29 | padding_size = int((kernel_size - 1) / 2) 30 | conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, 31 | padding=padding_size) 32 | self.convs.append(conv) 33 | 34 | def forward(self, features): 35 | token_embeddings = features['token_embeddings'] 36 | 37 | token_embeddings = token_embeddings.transpose(1, -1) 38 | vectors = [conv(token_embeddings) for conv in self.convs] 39 | out = torch.cat(vectors, 1).transpose(1, -1) 40 | 41 | features.update({'token_embeddings': out}) 42 | return features 43 | 44 | def get_word_embedding_dimension(self) -> int: 45 | return self.embeddings_dimension 46 | 47 | def tokenize(self, text: str) -> List[int]: 48 | raise NotImplementedError() 49 | 50 | def save(self, output_path: str): 51 | with open(os.path.join(output_path, 'cnn_config.json'), 'w') as fOut: 52 | json.dump(self.get_config_dict(), fOut, indent=2) 53 | 54 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 55 | 56 | def get_config_dict(self): 57 | return {key: self.__dict__[key] for key in self.config_keys} 58 | 59 | @staticmethod 60 | def load(input_path: str): 61 | with open(os.path.join(input_path, 'cnn_config.json'), 'r') as fIn: 62 | config = json.load(fIn) 63 | 64 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin')) 65 | model = CNN(**config) 66 | model.load_state_dict(weights) 67 | return model 68 | 69 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/CamemBERT.py: -------------------------------------------------------------------------------- 1 | from . import Transformer 2 | 3 | 4 | class CamemBERT(Transformer): 5 | """ 6 | DEPRECATED: Please use models.Transformer instead. 7 | """ 8 | pass 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/Dense.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from torch import functional as F 5 | from typing import Union, Tuple, List, Iterable, Dict 6 | import os 7 | import json 8 | from ..util import fullname, import_from_string 9 | 10 | 11 | class Dense(nn.Module): 12 | """Feed-forward function with activiation function. 13 | 14 | This layer takes a fixed-sized sentence embedding and passes it through a feed-forward layer. Can be used to generate deep averaging networs (DAN). 
15 | 16 | :param in_features: Size of the input dimension 17 | :param out_features: Output size 18 | :param bias: Add a bias vector 19 | :param activation_function: Pytorch activation function applied on output 20 | """ 21 | def __init__(self, in_features: int, out_features: int, bias: bool = True, activation_function=nn.Tanh()): 22 | super(Dense, self).__init__() 23 | self.in_features = in_features 24 | self.out_features = out_features 25 | self.bias = bias 26 | self.activation_function = activation_function 27 | self.linear = nn.Linear(in_features, out_features, bias=bias) 28 | 29 | def forward(self, features: Dict[str, Tensor]): 30 | features.update({'sentence_embedding': self.activation_function(self.linear(features['sentence_embedding']))}) 31 | return features 32 | 33 | def get_sentence_embedding_dimension(self) -> int: 34 | return self.out_features 35 | 36 | def save(self, output_path): 37 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 38 | json.dump({'in_features': self.in_features, 'out_features': self.out_features, 'bias': self.bias, 'activation_function': fullname(self.activation_function)}, fOut) 39 | 40 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 41 | 42 | @staticmethod 43 | def load(input_path): 44 | with open(os.path.join(input_path, 'config.json')) as fIn: 45 | config = json.load(fIn) 46 | 47 | config['activation_function'] = import_from_string(config['activation_function'])() 48 | model = Dense(**config) 49 | model.load_state_dict(torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu'))) 50 | return model 51 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/DistilBERT.py: -------------------------------------------------------------------------------- 1 | from . import Transformer 2 | 3 | class DistilBERT(Transformer): 4 | """ 5 | DEPRECATED: Please use models.Transformer instead. 6 | """ 7 | pass 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/LSTM.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from typing import List 4 | import os 5 | import json 6 | 7 | 8 | 9 | class LSTM(nn.Module): 10 | """ 11 | Bidirectional LSTM running over word embeddings. 
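    An illustrative sketch of where this layer usually sits, between a word-embedding module and a pooling
    module (the GloVe file path and the dimensions are assumptions, and it assumes the vendored
    WordEmbeddings keeps the upstream from_text_file helper)::

        from nlp.sentence_transformers import SentenceTransformer, models

        word_emb = models.WordEmbeddings.from_text_file('glove.6B.300d.txt.gz')  # hypothetical embedding file
        lstm = models.LSTM(word_embedding_dimension=word_emb.get_word_embedding_dimension(), hidden_dim=1024)
        pooling = models.Pooling(lstm.get_word_embedding_dimension(),
                                 pooling_mode_mean_tokens=False, pooling_mode_max_tokens=True)
        model = SentenceTransformer(modules=[word_emb, lstm, pooling])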
12 | """ 13 | def __init__(self, word_embedding_dimension: int, hidden_dim: int, num_layers: int = 1, dropout: float = 0, bidirectional: bool = True): 14 | nn.Module.__init__(self) 15 | self.config_keys = ['word_embedding_dimension', 'hidden_dim', 'num_layers', 'dropout', 'bidirectional'] 16 | self.word_embedding_dimension = word_embedding_dimension 17 | self.hidden_dim = hidden_dim 18 | self.num_layers = num_layers 19 | self.dropout = dropout 20 | self.bidirectional = bidirectional 21 | 22 | self.embeddings_dimension = hidden_dim 23 | if self.bidirectional: 24 | self.embeddings_dimension *= 2 25 | 26 | self.encoder = nn.LSTM(word_embedding_dimension, hidden_dim, num_layers=num_layers, dropout=dropout, bidirectional=bidirectional, batch_first=True) 27 | 28 | def forward(self, features): 29 | token_embeddings = features['token_embeddings'] 30 | sentence_lengths = torch.clamp(features['sentence_lengths'], min=1) 31 | 32 | packed = nn.utils.rnn.pack_padded_sequence(token_embeddings, sentence_lengths, batch_first=True, enforce_sorted=False) 33 | packed = self.encoder(packed) 34 | unpack = nn.utils.rnn.pad_packed_sequence(packed[0], batch_first=True)[0] 35 | features.update({'token_embeddings': unpack}) 36 | return features 37 | 38 | def get_word_embedding_dimension(self) -> int: 39 | return self.embeddings_dimension 40 | 41 | def tokenize(self, text: str) -> List[int]: 42 | raise NotImplementedError() 43 | 44 | def save(self, output_path: str): 45 | with open(os.path.join(output_path, 'lstm_config.json'), 'w') as fOut: 46 | json.dump(self.get_config_dict(), fOut, indent=2) 47 | 48 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 49 | 50 | def get_config_dict(self): 51 | return {key: self.__dict__[key] for key in self.config_keys} 52 | 53 | @staticmethod 54 | def load(input_path: str): 55 | with open(os.path.join(input_path, 'lstm_config.json'), 'r') as fIn: 56 | config = json.load(fIn) 57 | 58 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin')) 59 | model = LSTM(**config) 60 | model.load_state_dict(weights) 61 | return model 62 | 63 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/MLP3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import os 4 | import json 5 | from typing import Union, Tuple, List, Iterable, Dict 6 | from torch import Tensor 7 | 8 | class MLP3(nn.Module): 9 | def __init__(self, hidden_dim=2048, norm=None, activation='relu'): 10 | super().__init__() 11 | ''' page 3 baseline setting 12 | Projection MLP. The projection MLP (in f) has BN ap- 13 | plied to each fully-connected (fc) layer, including its out- 14 | put fc. Its output fc has no ReLU. The hidden fc is 2048-d. 15 | This MLP has 3 layers. 
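    An illustrative sketch of applying this projection head to a batch of vectors (the batch size and the
    768-d hidden size are assumptions; the module reads and rewrites features['token_embeddings'])::

        import torch
        from nlp.sentence_transformers.models import MLP3

        mlp = MLP3(hidden_dim=768, norm='bn', activation='relu')
        features = {'token_embeddings': torch.randn(32, 768)}  # batch of 768-d vectors
        features = mlp(features)                               # projected, still shape (32, 768)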
16 | ''' 17 | self.config_keys = ['hidden_dim', 'norm', 'activation'] 18 | self.hidden_dim = hidden_dim 19 | self.norm = norm 20 | self.activation = activation 21 | 22 | if activation == "relu": 23 | activation_layer = nn.ReLU() 24 | elif activation == "leakyrelu": 25 | activation_layer = nn.LeakyReLU() 26 | elif activation == "tanh": 27 | activation_layer = nn.Tanh() 28 | elif activation == "sigmoid": 29 | activation_layer = nn.Sigmoid() 30 | else: 31 | raise ValueError(f"Unknown activation function {activation}") 32 | 33 | if norm: 34 | if norm == 'bn': 35 | norm_layer = nn.BatchNorm1d 36 | else: 37 | norm_layer = nn.LayerNorm 38 | 39 | self.layer1 = nn.Sequential( 40 | nn.Linear(hidden_dim, hidden_dim), 41 | norm_layer(hidden_dim), 42 | nn.ReLU(inplace=True) 43 | ) 44 | self.layer2 = nn.Sequential( 45 | nn.Linear(hidden_dim, hidden_dim), 46 | norm_layer(hidden_dim), 47 | nn.ReLU(inplace=True) 48 | ) 49 | self.layer3 = nn.Sequential( 50 | nn.Linear(hidden_dim, hidden_dim), 51 | norm_layer(hidden_dim) 52 | ) 53 | else: 54 | self.layer1 = nn.Sequential( 55 | nn.Linear(hidden_dim, hidden_dim), 56 | nn.ReLU(inplace=True) 57 | ) 58 | self.layer2 = nn.Sequential( 59 | nn.Linear(hidden_dim, hidden_dim), 60 | nn.ReLU(inplace=True) 61 | ) 62 | self.layer3 = nn.Sequential( 63 | nn.Linear(hidden_dim, hidden_dim), 64 | ) 65 | 66 | self.num_layers = 3 67 | 68 | def set_layers(self, num_layers): 69 | self.num_layers = num_layers 70 | 71 | def forward(self, features: Dict[str, Tensor]): 72 | x = features["token_embeddings"] 73 | if self.num_layers == 3: 74 | x = self.layer1(x) 75 | x = self.layer2(x) 76 | x = self.layer3(x) 77 | elif self.num_layers == 2: 78 | x = self.layer1(x) 79 | x = self.layer3(x) 80 | else: 81 | raise ValueError(f"num_layers must be 2 or 3, got {self.num_layers}") 82 | features["token_embeddings"] = x 83 | return features 84 | 85 | def get_config_dict(self): 86 | return {key: self.__dict__[key] for key in self.config_keys} 87 | 88 | def save(self, output_path): 89 | with open(os.path.join(output_path, 'mlp3_config.json'), 'w') as fOut: 90 | json.dump(self.get_config_dict(), fOut, indent=2) 91 | 92 | @staticmethod 93 | def load(input_path): 94 | with open(os.path.join(input_path, 'mlp3_config.json')) as fIn: 95 | config = json.load(fIn) 96 | 97 | return MLP3(**config) -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/Normalize.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from typing import Dict 4 | import torch.nn.functional as F 5 | 6 | class Normalize(nn.Module): 7 | """ 8 | This layer normalizes embeddings to unit length 9 | """ 10 | def __init__(self): 11 | super(Normalize, self).__init__() 12 | 13 | def forward(self, features: Dict[str, Tensor]): 14 | features.update({'sentence_embedding': F.normalize(features['sentence_embedding'], p=2, dim=1)}) 15 | return features 16 | 17 | def save(self, output_path): 18 | pass 19 | 20 | @staticmethod 21 | def load(input_path): 22 | return Normalize() 23 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/RoBERTa.py: -------------------------------------------------------------------------------- 1 | from . import Transformer 2 | 3 | class RoBERTa(Transformer): 4 | """ 5 | DEPRECATED: Please use models.Transformer instead.
6 | """ 7 | pass 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/WeightedLayerPooling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | 8 | 9 | class WeightedLayerPooling(nn.Module): 10 | """ 11 | Token embeddings are weighted mean of their different hidden layer representations 12 | """ 13 | def __init__(self, word_embedding_dimension, num_hidden_layers: int = 12, layer_start: int = 4, layer_weights = None): 14 | super(WeightedLayerPooling, self).__init__() 15 | self.config_keys = ['word_embedding_dimension', 'layer_start', 'num_hidden_layers'] 16 | self.word_embedding_dimension = word_embedding_dimension 17 | self.layer_start = layer_start 18 | self.num_hidden_layers = num_hidden_layers 19 | self.layer_weights = layer_weights if layer_weights is not None else nn.Parameter(torch.tensor([1] * (num_hidden_layers+1 - layer_start), dtype=torch.float)) 20 | 21 | def forward(self, features: Dict[str, Tensor]): 22 | ft_all_layers = features['all_layer_embeddings'] 23 | 24 | all_layer_embedding = torch.stack(ft_all_layers) 25 | all_layer_embedding = all_layer_embedding[self.layer_start:, :, :, :] # Start from 4th layers output 26 | 27 | weight_factor = self.layer_weights.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).expand(all_layer_embedding.size()) 28 | weighted_average = (weight_factor*all_layer_embedding).sum(dim=0) / self.layer_weights.sum() 29 | 30 | features.update({'token_embeddings': weighted_average}) 31 | return features 32 | 33 | def get_word_embedding_dimension(self): 34 | return self.word_embedding_dimension 35 | 36 | def get_config_dict(self): 37 | return {key: self.__dict__[key] for key in self.config_keys} 38 | 39 | def save(self, output_path): 40 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 41 | json.dump(self.get_config_dict(), fOut, indent=2) 42 | 43 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 44 | 45 | 46 | @staticmethod 47 | def load(input_path): 48 | with open(os.path.join(input_path, 'config.json')) as fIn: 49 | config = json.load(fIn) 50 | 51 | model = WeightedLayerPooling(**config) 52 | model.load_state_dict(torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu'))) 53 | return model 54 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/WordWeights.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | import logging 8 | 9 | class WordWeights(nn.Module): 10 | """This model can weight word embeddings, for example, with idf-values.""" 11 | 12 | def __init__(self, vocab: List[str], word_weights: Dict[str, float], unknown_word_weight: float = 1): 13 | """ 14 | 15 | :param vocab: 16 | Vocabulary of the tokenizer 17 | :param word_weights: 18 | Mapping of tokens to a float weight value. Words embeddings are multiplied by this float value. Tokens in word_weights must not be equal to the vocab (can contain more or less values) 19 | :param unknown_word_weight: 20 | Weight for words in vocab, that do not appear in the word_weights lookup. 
These can be for example rare words in the vocab, where no weight exists. 21 | """ 22 | super(WordWeights, self).__init__() 23 | self.config_keys = ['vocab', 'word_weights', 'unknown_word_weight'] 24 | self.vocab = vocab 25 | self.word_weights = word_weights 26 | self.unknown_word_weight = unknown_word_weight 27 | 28 | weights = [] 29 | num_unknown_words = 0 30 | for word in vocab: 31 | weight = unknown_word_weight 32 | if word in word_weights: 33 | weight = word_weights[word] 34 | elif word.lower() in word_weights: 35 | weight = word_weights[word.lower()] 36 | else: 37 | num_unknown_words += 1 38 | weights.append(weight) 39 | 40 | logging.info("{} of {} words without a weighting value. Set weight to {}".format(num_unknown_words, len(vocab), unknown_word_weight)) 41 | 42 | self.emb_layer = nn.Embedding(len(vocab), 1) 43 | self.emb_layer.load_state_dict({'weight': torch.FloatTensor(weights).unsqueeze(1)}) 44 | 45 | 46 | def forward(self, features: Dict[str, Tensor]): 47 | attention_mask = features['attention_mask'] 48 | token_embeddings = features['token_embeddings'] 49 | 50 | #Compute a weight value for each token 51 | token_weights_raw = self.emb_layer(features['input_ids']).squeeze(-1) 52 | token_weights = token_weights_raw * attention_mask.float() 53 | token_weights_sum = torch.sum(token_weights, 1) 54 | 55 | #Multiply embedding by token weight value 56 | token_weights_expanded = token_weights.unsqueeze(-1).expand(token_embeddings.size()) 57 | token_embeddings = token_embeddings * token_weights_expanded 58 | 59 | features.update({'token_embeddings': token_embeddings, 'token_weights_sum': token_weights_sum}) 60 | return features 61 | 62 | def get_config_dict(self): 63 | return {key: self.__dict__[key] for key in self.config_keys} 64 | 65 | def save(self, output_path): 66 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 67 | json.dump(self.get_config_dict(), fOut, indent=2) 68 | 69 | @staticmethod 70 | def load(input_path): 71 | with open(os.path.join(input_path, 'config.json')) as fIn: 72 | config = json.load(fIn) 73 | 74 | return WordWeights(**config) -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/XLMRoBERTa.py: -------------------------------------------------------------------------------- 1 | from . import Transformer 2 | 3 | class XLMRoBERTa(Transformer): 4 | """ 5 | DEPRECATED: Please use models.Transformer instead. 6 | """ 7 | pass 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/XLNet.py: -------------------------------------------------------------------------------- 1 | from . import Transformer 2 | 3 | class XLNet(Transformer): 4 | """ 5 | DEPRECATED: Please use models.Transformer instead. 
6 | """ 7 | pass 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .Transformer import Transformer 2 | from .ALBERT import ALBERT 3 | from .BERT import BERT 4 | from .BoW import BoW 5 | from .CNN import CNN 6 | from .CamemBERT import CamemBERT 7 | from .Dense import Dense 8 | from .DistilBERT import DistilBERT 9 | from .LSTM import LSTM 10 | from .Normalize import Normalize 11 | from .Pooling import Pooling 12 | from .RoBERTa import RoBERTa 13 | from .T5 import T5 14 | from .WKPooling import WKPooling 15 | from .WeightedLayerPooling import WeightedLayerPooling 16 | from .WordEmbeddings import WordEmbeddings 17 | from .WordWeights import WordWeights 18 | from .XLMRoBERTa import XLMRoBERTa 19 | from .XLNet import XLNet 20 | from .MLP3 import MLP3 21 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/tokenizer/WhitespaceTokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Tuple, List, Iterable, Dict 2 | import collections 3 | import string 4 | import os 5 | import json 6 | from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS 7 | 8 | class WhitespaceTokenizer(WordTokenizer): 9 | """ 10 | Simple and fast white-space tokenizer. Splits sentence based on white spaces. 11 | Punctuation are stripped from tokens. 12 | """ 13 | def __init__(self, vocab: Iterable[str] = [], stop_words: Iterable[str] = ENGLISH_STOP_WORDS, do_lower_case: bool = False): 14 | self.stop_words = set(stop_words) 15 | self.do_lower_case = do_lower_case 16 | self.set_vocab(vocab) 17 | 18 | def get_vocab(self): 19 | return self.vocab 20 | 21 | def set_vocab(self, vocab: Iterable[str]): 22 | self.vocab = vocab 23 | self.word2idx = collections.OrderedDict([(word, idx) for idx, word in enumerate(vocab)]) 24 | 25 | def tokenize(self, text: str) -> List[int]: 26 | if self.do_lower_case: 27 | text = text.lower() 28 | 29 | tokens = text.split() 30 | 31 | tokens_filtered = [] 32 | for token in tokens: 33 | if token in self.stop_words: 34 | continue 35 | elif token in self.word2idx: 36 | tokens_filtered.append(self.word2idx[token]) 37 | continue 38 | 39 | token = token.strip(string.punctuation) 40 | if token in self.stop_words: 41 | continue 42 | elif len(token) > 0 and token in self.word2idx: 43 | tokens_filtered.append(self.word2idx[token]) 44 | continue 45 | 46 | token = token.lower() 47 | if token in self.stop_words: 48 | continue 49 | elif token in self.word2idx: 50 | tokens_filtered.append(self.word2idx[token]) 51 | continue 52 | 53 | return tokens_filtered 54 | 55 | def save(self, output_path: str): 56 | with open(os.path.join(output_path, 'whitespacetokenizer_config.json'), 'w') as fOut: 57 | json.dump({'vocab': list(self.word2idx.keys()), 'stop_words': list(self.stop_words), 'do_lower_case': self.do_lower_case}, fOut) 58 | 59 | @staticmethod 60 | def load(input_path: str): 61 | with open(os.path.join(input_path, 'whitespacetokenizer_config.json'), 'r') as fIn: 62 | config = json.load(fIn) 63 | 64 | return WhitespaceTokenizer(**config) 65 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/models/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | from .WordTokenizer import 
WordTokenizer, ENGLISH_STOP_WORDS 2 | from .WhitespaceTokenizer import WhitespaceTokenizer 3 | from .WhitespaceTokenizer import WhitespaceTokenizer -------------------------------------------------------------------------------- /nlp/sentence_transformers/readers/InputExample.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | 3 | 4 | class InputExample: 5 | """ 6 | Structure for one input example with texts, the label and a unique id 7 | """ 8 | def __init__(self, guid: str = '', texts: List[str] = None, texts_tokenized: List[List[int]] = None, label: Union[int, float] = 0): 9 | """ 10 | Creates one InputExample with the given texts, guid and label 11 | 12 | 13 | :param guid 14 | id for the example 15 | :param texts 16 | the texts for the example. Note, str.strip() is called on the texts 17 | :param texts_tokenized 18 | Optional: Texts that are already tokenized. If texts_tokenized is passed, texts must not be passed. 19 | :param label 20 | the label for the example 21 | """ 22 | self.guid = guid 23 | self.texts = [text.strip() for text in texts] if texts is not None else texts 24 | self.texts_tokenized = texts_tokenized 25 | self.label = label 26 | 27 | def __str__(self): 28 | return " label: {}, texts: {}".format(str(self.label), "; ".join(self.texts)) -------------------------------------------------------------------------------- /nlp/sentence_transformers/readers/LabelSentenceReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class LabelSentenceReader: 7 | """Reads in a file that has at least two columns: a label and a sentence. 8 | This reader can for example be used with the BatchHardTripletLoss. 9 | Maps labels automatically to integers""" 10 | def __init__(self, folder, label_col_idx=0, sentence_col_idx=1, separator='\t'): 11 | self.folder = folder 12 | self.label_map = {} 13 | self.label_col_idx = label_col_idx 14 | self.sentence_col_idx = sentence_col_idx 15 | self.separator = separator 16 | 17 | def get_examples(self, filename, max_examples=0): 18 | examples = [] 19 | 20 | id = 0 21 | for line in open(os.path.join(self.folder, filename), encoding="utf-8"): 22 | splits = line.strip().split(self.separator) 23 | label = splits[self.label_col_idx] 24 | sentence = splits[self.sentence_col_idx] 25 | 26 | if label not in self.label_map: 27 | self.label_map[label] = len(self.label_map) 28 | 29 | label_id = self.label_map[label] 30 | guid = "%s-%d" % (filename, id) 31 | id += 1 32 | examples.append(InputExample(guid=guid, texts=[sentence], label=label_id)) 33 | 34 | if 0 < max_examples <= id: 35 | break 36 | 37 | return examples 38 | -------------------------------------------------------------------------------- /nlp/sentence_transformers/readers/NLIDataReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | 7 | class NLIDataReader(object): 8 | """ 9 | Reads in the Stanford NLI dataset and the MultiGenre NLI dataset 10 | """ 11 | def __init__(self, dataset_folder): 12 | self.dataset_folder = dataset_folder 13 | 14 | def get_examples(self, filename, max_examples=0): 15 | """ 16 | data_splits specified which data split to use (train, dev, test). 
17 | Expects that self.dataset_folder contains the files s1.$data_split.gz, s2.$data_split.gz, 18 | labels.$data_split.gz, e.g., for the train split, s1.train.gz, s2.train.gz, labels.train.gz 19 | """ 20 | s1 = gzip.open(os.path.join(self.dataset_folder, 's1.' + filename), 21 | mode="rt", encoding="utf-8").readlines() 22 | s2 = gzip.open(os.path.join(self.dataset_folder, 's2.' + filename), 23 | mode="rt", encoding="utf-8").readlines() 24 | labels = gzip.open(os.path.join(self.dataset_folder, 'labels.' + filename), 25 | mode="rt", encoding="utf-8").readlines() 26 | 27 | examples = [] 28 | id = 0 29 | for sentence_a, sentence_b, label in zip(s1, s2, labels): 30 | guid = "%s-%d" % (filename, id) 31 | id += 1 32 | examples.append(InputExample(guid=guid, texts=[sentence_a, sentence_b], label=self.map_label(label))) 33 | 34 | if 0 < max_examples <= len(examples): 35 | break 36 | 37 | return examples 38 | 39 | @staticmethod 40 | def get_labels(): 41 | return {"contradiction": 0, "entailment": 1, "neutral": 2} 42 | 43 | def get_num_labels(self): 44 | return len(self.get_labels()) 45 | 46 | def map_label(self, label): 47 | return self.get_labels()[label.strip().lower()] -------------------------------------------------------------------------------- /nlp/sentence_transformers/readers/PairedFilesReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | import gzip 6 | 7 | class PairedFilesReader(object): 8 | """ 9 | Reads in the a Pair Dataset, split in two files 10 | """ 11 | def __init__(self, filepaths): 12 | self.filepaths = filepaths 13 | 14 | 15 | def get_examples(self, max_examples=0): 16 | """ 17 | """ 18 | fIns = [] 19 | for filepath in self.filepaths: 20 | fIn = gzip.open(filepath, 'rt', encoding='utf-8') if filepath.endswith('.gz') else open(filepath, encoding='utf-8') 21 | fIns.append(fIn) 22 | 23 | examples = [] 24 | 25 | eof = False 26 | while not eof: 27 | texts = [] 28 | for fIn in fIns: 29 | text = fIn.readline() 30 | 31 | if text == '': 32 | eof = True 33 | break 34 | 35 | texts.append(text) 36 | 37 | if eof: 38 | break; 39 | 40 | examples.append(InputExample(guid=str(len(examples)), texts=texts, label=1)) 41 | if max_examples > 0 and len(examples) >= max_examples: 42 | break 43 | 44 | return examples -------------------------------------------------------------------------------- /nlp/sentence_transformers/readers/STSDataReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class STSDataReader: 7 | """ 8 | Reads in the STS dataset. Each line contains two sentences (s1_col_idx, s2_col_idx) and one label (score_col_idx) 9 | 10 | Default values expects a tab seperated file with the first & second column the sentence pair and third column the score (0...1). 
Default config normalizes scores from 0...5 to 0...1 11 | """ 12 | def __init__(self, dataset_folder, s1_col_idx=0, s2_col_idx=1, score_col_idx=2, delimiter="\t", 13 | quoting=csv.QUOTE_NONE, normalize_scores=True, min_score=0, max_score=5): 14 | self.dataset_folder = dataset_folder 15 | self.score_col_idx = score_col_idx 16 | self.s1_col_idx = s1_col_idx 17 | self.s2_col_idx = s2_col_idx 18 | self.delimiter = delimiter 19 | self.quoting = quoting 20 | self.normalize_scores = normalize_scores 21 | self.min_score = min_score 22 | self.max_score = max_score 23 | 24 | def get_examples(self, filename, max_examples=0): 25 | """ 26 | filename specified which data split to use (train.csv, dev.csv, test.csv). 27 | """ 28 | filepath = os.path.join(self.dataset_folder, filename) 29 | with gzip.open(filepath, 'rt', encoding='utf8') if filename.endswith('.gz') else open(filepath, encoding="utf-8") as fIn: 30 | data = csv.reader(fIn, delimiter=self.delimiter, quoting=self.quoting) 31 | examples = [] 32 | for id, row in enumerate(data): 33 | score = float(row[self.score_col_idx]) 34 | if self.normalize_scores: # Normalize to a 0...1 value 35 | score = (score - self.min_score) / (self.max_score - self.min_score) 36 | 37 | s1 = row[self.s1_col_idx] 38 | s2 = row[self.s2_col_idx] 39 | examples.append(InputExample(guid=filename+str(id), texts=[s1, s2], label=score)) 40 | 41 | if max_examples > 0 and len(examples) >= max_examples: 42 | break 43 | 44 | return examples 45 | 46 | class STSBenchmarkDataReader(STSDataReader): 47 | """ 48 | Reader especially for the STS benchmark dataset. There, the sentences are in column 5 and 6, the score is in column 4. 49 | Scores are normalized from 0...5 to 0...1 50 | """ 51 | def __init__(self, dataset_folder, s1_col_idx=5, s2_col_idx=6, score_col_idx=4, delimiter="\t", 52 | quoting=csv.QUOTE_NONE, normalize_scores=True, min_score=0, max_score=5): 53 | super().__init__(dataset_folder=dataset_folder, s1_col_idx=s1_col_idx, s2_col_idx=s2_col_idx, score_col_idx=score_col_idx, delimiter=delimiter, 54 | quoting=quoting, normalize_scores=normalize_scores, min_score=min_score, max_score=max_score) -------------------------------------------------------------------------------- /nlp/sentence_transformers/readers/TripletReader.py: -------------------------------------------------------------------------------- 1 | from . 
import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class TripletReader(object): 7 | """ 8 | Reads in the a Triplet Dataset: Each line contains (at least) 3 columns, one anchor column (s1), 9 | one positive example (s2) and one negative example (s3) 10 | """ 11 | def __init__(self, dataset_folder, s1_col_idx=0, s2_col_idx=1, s3_col_idx=2, has_header=False, delimiter="\t", 12 | quoting=csv.QUOTE_NONE): 13 | self.dataset_folder = dataset_folder 14 | self.s1_col_idx = s1_col_idx 15 | self.s2_col_idx = s2_col_idx 16 | self.s3_col_idx = s3_col_idx 17 | self.has_header = has_header 18 | self.delimiter = delimiter 19 | self.quoting = quoting 20 | 21 | def get_examples(self, filename, max_examples=0): 22 | """ 23 | 24 | """ 25 | data = csv.reader(open(os.path.join(self.dataset_folder, filename), encoding="utf-8"), delimiter=self.delimiter, 26 | quoting=self.quoting) 27 | examples = [] 28 | if self.has_header: 29 | next(data) 30 | 31 | for id, row in enumerate(data): 32 | s1 = row[self.s1_col_idx] 33 | s2 = row[self.s2_col_idx] 34 | s3 = row[self.s3_col_idx] 35 | 36 | examples.append(InputExample(texts=[s1, s2, s3])) 37 | if max_examples > 0 and len(examples) >= max_examples: 38 | break 39 | 40 | return examples -------------------------------------------------------------------------------- /nlp/sentence_transformers/readers/__init__.py: -------------------------------------------------------------------------------- 1 | from .InputExample import InputExample 2 | from .LabelSentenceReader import LabelSentenceReader 3 | from .NLIDataReader import NLIDataReader 4 | from .STSDataReader import STSDataReader, STSBenchmarkDataReader 5 | from .TripletReader import TripletReader -------------------------------------------------------------------------------- /nlp/tools/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: __init__ 6 | Author: czh 7 | Create Date: 2021/8/12 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | -------------------------------------------------------------------------------- /nlp/tools/accelerate_tracker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: 6 | @date: 2022/9/21 14:20 7 | """ 8 | import wandb 9 | from accelerate.tracking import GeneralTracker 10 | from accelerate.logging import get_logger 11 | from typing import Optional 12 | 13 | logger = get_logger(__name__) 14 | 15 | 16 | class CustomWandbTracker(GeneralTracker): 17 | name = "wandb" 18 | requires_logging_directory = False 19 | 20 | def __init__(self, run_name: str, **kwargs): 21 | self.run_name = run_name 22 | 23 | self.run = wandb.init(name=self.run_name, **kwargs) 24 | logger.info(f"Initialized WandB project {self.run_name}") 25 | logger.info( 26 | "Make sure to log any initial configurations with `self.store_init_configuration` before training!" 27 | ) 28 | 29 | @property 30 | def tracker(self): 31 | return self.run.run 32 | 33 | def store_init_configuration(self, values: dict): 34 | """ 35 | Logs `values` as hyperparameters for the run. Should be run at the beginning of your experiment. 36 | 37 | Args: 38 | values (Dictionary `str` to `bool`, `str`, `float` or `int`): 39 | Values to be stored as initial hyperparameters as key-value pairs. 
The values need to have type `bool`, 40 | `str`, `float`, `int`, or `None`. 41 | """ 42 | wandb.config.update(values) 43 | logger.info("Stored initial configuration hyperparameters to WandB") 44 | 45 | def log(self, values: dict, step: Optional[int], **kwargs): 46 | """ 47 | Logs `values` to the current run. 48 | 49 | Args: 50 | values (Dictionary `str` to `str`, `float`, `int` or `dict` of `str` to `float`/`int`): 51 | Values to be logged as key-value pairs. The values need to have type `str`, `float`, `int` or `dict` of 52 | `str` to `float`/`int`. 53 | step (`int`, *optional*): 54 | The run step. If included, the log will be affiliated with this step. 55 | """ 56 | wandb.log(values, step=step, **kwargs) 57 | 58 | def finish(self): 59 | """ 60 | Closes `wandb` writer 61 | """ 62 | self.run.finish() 63 | logger.info("WandB run closed") 64 | -------------------------------------------------------------------------------- /nlp/tools/convert_nezha_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: convert_nezha_original_tf_checkpoint_to_pytorch 6 | Author: czh 7 | Create Date: 2021/8/18 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | # Convert ALBERT checkpoint. 13 | import argparse 14 | import logging 15 | import torch 16 | from nlp.models.nezha import NeZhaConfig, NeZhaForPreTraining, load_tf_weights_in_nezha 17 | 18 | logging.basicConfig(level=logging.INFO) 19 | 20 | 21 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, nezha_config_file, pytorch_dump_path): 22 | # Initialise PyTorch model 23 | config = NeZhaConfig.from_json_file(nezha_config_file) 24 | print("Building PyTorch model from configuration: {}".format(str(config))) 25 | model = NeZhaForPreTraining(config) 26 | # Load weights from tf checkpoint 27 | load_tf_weights_in_nezha(model, tf_checkpoint_path) 28 | # Save pytorch-model 29 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 30 | state_dict = {k: v for k, v in model.state_dict().items() if 'relative_positions' not in k} 31 | torch.save(state_dict, pytorch_dump_path) 32 | 33 | 34 | if __name__ == "__main__": 35 | parser = argparse.ArgumentParser() 36 | # Required parameters 37 | parser.add_argument( 38 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 39 | ) 40 | parser.add_argument( 41 | "--nezha_config_file", 42 | default=None, 43 | type=str, 44 | required=True, 45 | help="The config json file corresponding to the pre-trained ALBERT model. \n" 46 | "This specifies the model architecture.", 47 | ) 48 | parser.add_argument( 49 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 
50 | ) 51 | args = parser.parse_args() 52 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.nezha_config_file, args.pytorch_dump_path) 53 | 54 | 55 | ''' 56 | python convert_nezha_original_tf_checkpoint_to_pytorch.py \ 57 | --tf_checkpoint_path=./pretrained_models/nezha-large-www \ 58 | --nezha_config_file=./pretrained_models/nezha-large-www/config.json \ 59 | --pytorch_dump_path=./pretrained_models/nezha-large-www/pytorch_model.bin 60 | ''' 61 | -------------------------------------------------------------------------------- /nlp/tools/convert_tf_to_pytorch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: 6 | @date: 2022/11/11 15:12 7 | """ 8 | from transformers.models.bert.convert_bert_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch 9 | 10 | # chinese_wobert_plus 11 | path = "/Users/chenzhihao/Downloads/chinese_wobert_plus_L-12_H-768_A-12" 12 | tf_checkpoint_path = path + "/bert_model.ckpt" 13 | bert_config_file = path + "/bert_config.json" 14 | pytorch_dump_path = "/Users/chenzhihao/Downloads/chinese_wobert_plus/pytorch_model.bin" 15 | 16 | convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, 17 | pytorch_dump_path) 18 | 19 | # chinese_wobert 20 | path = "/Users/chenzhihao/Downloads/chinese_wobert_L-12_H-768_A-12" 21 | tf_checkpoint_path = path + "/bert_model.ckpt" 22 | bert_config_file = path + "/bert_config.json" 23 | pytorch_dump_path = "/Users/chenzhihao/Downloads/chinese_wobert_base/pytorch_model.bin" 24 | 25 | convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, 26 | pytorch_dump_path) 27 | -------------------------------------------------------------------------------- /nlp/tools/dataloader.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: dataloader 6 | Author: czh 7 | Create Date: 2021/9/30 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | from torch.utils.data.dataloader import _SingleProcessDataLoaderIter, _MultiProcessingDataLoaderIter 13 | import random 14 | from torch.utils.data import Dataset, DataLoader 15 | from itertools import chain 16 | 17 | 18 | class BlockShuffleDataLoader(DataLoader): 19 | def __init__(self, dataset: Dataset, sort_key, sort_bs_num=None, is_shuffle=True, **kwargs): 20 | """ 21 | 初始化函数,继承DataLoader类 22 | Args: 23 | dataset: Dataset类的实例,其中中必须包含dataset变量,并且该变量为一个list 24 | sort_key: 排序函数,即使用dataset元素中哪一个变量的长度进行排序 25 | sort_bs_num: 排序范围,即在多少个batch_size大小内进行排序,默认为None,表示对整个序列排序 26 | is_shuffle: 是否对分块后的内容,进行随机打乱,默认为True 27 | **kwargs: 28 | """ 29 | assert isinstance(dataset.data_set, list), "dataset为Dataset类的实例,其中中必须包含dataset变量,并且该变量为一个list" 30 | super().__init__(dataset, **kwargs) 31 | self.sort_bs_num = sort_bs_num 32 | self.sort_key = sort_key 33 | self.is_shuffle = is_shuffle 34 | 35 | def __iter__(self): 36 | self.dataset.data_set = self.block_shuffle(self.dataset.data_set, self.batch_size, self.sort_bs_num, 37 | self.sort_key, self.is_shuffle) 38 | if self.num_workers == 0: 39 | return _SingleProcessDataLoaderIter(self) 40 | else: 41 | return _MultiProcessingDataLoaderIter(self) 42 | 43 | @staticmethod 44 | def block_shuffle(data, batch_size, sort_bs_num, sort_key, is_shuffle): 45 | random.shuffle(data) 46 | # 
将数据按照batch_size大小进行切分 47 | tail_data = [] if len(data) % batch_size == 0 else data[-len(data) % batch_size:] 48 | data = data[:len(data) - len(tail_data)] 49 | assert len(data) % batch_size == 0 50 | # 获取真实排序范围 51 | sort_bs_num = len(data) // batch_size if sort_bs_num is None else sort_bs_num 52 | # 按照排序范围进行数据划分 53 | data = [data[i:i + sort_bs_num * batch_size] for i in range(0, len(data), sort_bs_num * batch_size)] 54 | # 在排序范围,根据排序函数进行降序排列 55 | data = [sorted(i, key=sort_key, reverse=True) for i in data] 56 | # 将数据根据batch_size获取batch_data 57 | data = list(chain(*data)) 58 | data = [data[i:i + batch_size] for i in range(0, len(data), batch_size)] 59 | # 判断是否需要对batch_data序列进行打乱 60 | if is_shuffle: 61 | random.shuffle(data) 62 | # 将tail_data填补回去 63 | data = list(chain(*data)) + tail_data 64 | return data 65 | -------------------------------------------------------------------------------- /nlp/tools/plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.metrics import confusion_matrix 4 | plt.switch_backend('agg') 5 | 6 | 7 | def plot_confusion_matrix(y_true, y_pred, classes, 8 | save_path, normalize=False, title=None, 9 | cmap=plt.cm.Blues): 10 | """ 11 | This function prints and plots the confusion matrix. 12 | Normalization can be applied by setting `normalize=True`. 13 | """ 14 | if not title: 15 | if normalize: 16 | title = 'Normalized confusion matrix' 17 | else: 18 | title = 'Confusion matrix, without normalization' 19 | # Compute confusion matrix 20 | cm = confusion_matrix(y_true=y_true, y_pred=y_pred) 21 | # Only use the labels that appear in the data 22 | if normalize: 23 | cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] 24 | print("Normalized confusion matrix") 25 | else: 26 | print('Confusion matrix, without normalization') 27 | # --- plot--- # 28 | plt.rcParams['savefig.dpi'] = 200 29 | plt.rcParams['figure.dpi'] = 200 30 | plt.rcParams['figure.figsize'] = [20, 20] # plot 31 | plt.rcParams.update({'font.size': 10}) 32 | fig, ax = plt.subplots() 33 | im = ax.imshow(cm, interpolation='nearest', cmap=cmap) 34 | # --- bar --- # 35 | from mpl_toolkits.axes_grid1 import make_axes_locatable 36 | divider = make_axes_locatable(ax) 37 | cax = divider.append_axes("right", size="5%", pad=0.05) 38 | plt.colorbar(im, cax=cax) 39 | # --- bar --- # 40 | # ax.figure.colorbar(im, ax=ax) 41 | # We want to show all ticks... 42 | ax.set(xticks=np.arange(cm.shape[1]), 43 | yticks=np.arange(cm.shape[0]), 44 | # ... and label them with the respective list entries 45 | xticklabels=classes, yticklabels=classes, 46 | title=title, 47 | ylabel='True label', 48 | xlabel='Predicted label') 49 | 50 | # Rotate the tick labels and set their alignment. 51 | plt.setp(ax.get_xticklabels(), rotation=45, ha="right", 52 | rotation_mode="anchor") 53 | # Loop over data dimensions and create text annotations. 54 | fmt = '.2f' if normalize else 'd' 55 | thresh = cm.max() / 2. 
56 | for i in range(cm.shape[0]): 57 | for j in range(cm.shape[1]): 58 | ax.text(j, i, format(cm[i, j], fmt), 59 | ha="center", va="center", 60 | color="white" if cm[i, j] > thresh else "black") 61 | fig.tight_layout() 62 | plt.savefig(save_path) 63 | 64 | 65 | # if __name__ == "__main__": 66 | # y_true = ['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'O'] 67 | # y_pred = ['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O','B-PER', 'I-PER', 'O'] 68 | # classes = ['O','B-MISC', 'I-MISC','B-PER', 'I-PER'] 69 | # save_path = './ner_confusion_matrix.png' 70 | # plot_confusion_matrix(y_true,y_pred,classes,save_path) 71 | -------------------------------------------------------------------------------- /nlp/trainers/ChildTuningF.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: ChildTuningF 6 | Author: czh 7 | Create Date: 2021/11/11 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | # https://github.com/alibaba/AliceMind/tree/main/ChildTuning 13 | from transformers import Trainer 14 | from transformers.optimization import get_scheduler 15 | 16 | from nlp.callback.optimizers.child_tuning_optimizer import ChildTuningAdamW 17 | 18 | 19 | class ChildTuningFTrainer(Trainer): 20 | def __init__(self, **kwargs): 21 | self.reserve_p = kwargs.pop('reserve_p') 22 | self.mode = kwargs.pop('mode') 23 | super().__init__(**kwargs) 24 | 25 | def create_optimizer_and_scheduler(self, num_training_steps: int): 26 | """ 27 | Setup the optimizer and the learning rate scheduler. 28 | 29 | We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the 30 | Trainer's init through :obj:`optimizers`, or subclass and override this method in a subclass. 
31 | """ 32 | if self.optimizer is None: 33 | no_decay = ["bias", "LayerNorm.weight"] 34 | optimizer_grouped_parameters = [ 35 | { 36 | "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)], 37 | "weight_decay": self.args.weight_decay, 38 | }, 39 | { 40 | "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], 41 | "weight_decay": 0.0, 42 | }, 43 | ] 44 | optimizer_cls = ChildTuningAdamW 45 | optimizer_kwargs = {"betas": (self.args.adam_beta1, self.args.adam_beta2), "eps": self.args.adam_epsilon, 46 | "lr": self.args.learning_rate} 47 | self.optimizer = optimizer_cls(optimizer_grouped_parameters, reserve_p=self.reserve_p, # noqa 48 | mode=self.mode, **optimizer_kwargs) 49 | 50 | if self.lr_scheduler is None: 51 | self.lr_scheduler = get_scheduler( # noqa 52 | self.args.lr_scheduler_type, 53 | self.optimizer, 54 | num_warmup_steps=self.args.warmup_steps, 55 | num_training_steps=num_training_steps, 56 | ) 57 | -------------------------------------------------------------------------------- /nlp/trainers/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: __init__.py 6 | Author: czh 7 | Create Date: 2021/11/11 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | -------------------------------------------------------------------------------- /nlp/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: __init__ 6 | Author: czh 7 | Create Date: 2021/8/12 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | -------------------------------------------------------------------------------- /nlp/utils/bert_or_thesues_repalcement_scheduler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: bert_or_thesues_repalcement_scheduler 6 | Author: czh 7 | Create Date: 2021/8/12 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | # bert_of_thesues 的replacement scheduler 13 | from nlp.models.distill_model import BertThesuesEncoder 14 | 15 | 16 | class ConstantReplacementScheduler: 17 | def __init__(self, bert_encoder: BertThesuesEncoder, replacing_rate, replacing_steps=None): 18 | self.bert_encoder = bert_encoder 19 | self.replacing_rate = replacing_rate 20 | self.replacing_steps = replacing_steps 21 | self.step_counter = 0 22 | self.bert_encoder.set_replacing_rate(replacing_rate) 23 | 24 | def step(self): 25 | self.step_counter += 1 26 | if self.replacing_steps is None or self.replacing_rate == 1.0: 27 | return self.replacing_rate 28 | else: 29 | if self.step_counter >= self.replacing_steps: 30 | self.bert_encoder.set_replacing_rate(1.0) 31 | self.replacing_rate = 1.0 32 | return self.replacing_rate 33 | 34 | 35 | class LinearReplacementScheduler: 36 | def __init__(self, bert_encoder: BertThesuesEncoder, base_replacing_rate, k): 37 | self.bert_encoder = bert_encoder 38 | self.base_replacing_rate = base_replacing_rate 39 | self.step_counter = 0 40 | self.k = k 41 | 
self.bert_encoder.set_replacing_rate(base_replacing_rate) 42 | 43 | def step(self): 44 | self.step_counter += 1 45 | current_replacing_rate = min(self.k * self.step_counter + self.base_replacing_rate, 1.0) 46 | self.bert_encoder.set_replacing_rate(current_replacing_rate) 47 | return current_replacing_rate 48 | -------------------------------------------------------------------------------- /nlp/utils/enums.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 _*- 3 | from enum import Enum 4 | 5 | 6 | class CaseNotSensitiveEnum(Enum): 7 | """大小写不敏感""" 8 | 9 | @classmethod 10 | def _missing_(cls, name): 11 | for member in cls: 12 | if member.name.lower() == name.lower(): 13 | return member 14 | 15 | @classmethod 16 | def choices(cls): 17 | return [k.value for k in list(cls)] 18 | 19 | 20 | class RunMode(Enum): 21 | """ 22 | 因为crf的计算loss和infer的逻辑是分开的,为了保证有些操作能够以更优的方式进行: 23 | 1. 训练时,只关注loss,train模式即可 24 | 2. validation阶段需要打出val_loss和预测的metrics,所以会俩部分都需要。采用eval模式 25 | 3. 模型训练好使用时,其实只需要预测,不需要loss计算,采用infer 26 | """ 27 | TRAIN = "train" 28 | INFER = "infer" 29 | EVAL = "eval" 30 | 31 | 32 | class DataType(Enum): 33 | TRAIN = "train" 34 | EVAL = "dev" 35 | TEST = "test" 36 | 37 | 38 | class OptimizerEnum(CaseNotSensitiveEnum): 39 | AdamW = "AdamW" 40 | LAMB = "LAMB" 41 | Adafactor = "Adafactor" 42 | Adam = "Adam" 43 | 44 | 45 | class FP16OptLevel(CaseNotSensitiveEnum): 46 | O1 = "O1" 47 | O2 = "O2" 48 | O3 = "O3" 49 | O4 = "O4" 50 | 51 | 52 | class MatcherType(CaseNotSensitiveEnum): 53 | AVG = "avg" 54 | MIN = "min" 55 | -------------------------------------------------------------------------------- /nlp/utils/errors.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | """ 3 | ====================================== 4 | Project Name: NLP 5 | File Name: errors 6 | Author: czh 7 | Create Date: 2022/2/9 8 | -------------------------------------- 9 | Change Activity: 10 | ====================================== 11 | """ 12 | 13 | 14 | class ParseSpanError(Exception): 15 | pass 16 | 17 | 18 | class ParseEntityOffsetMappingError(ParseSpanError): 19 | pass 20 | 21 | 22 | class EntityNumNotMatchError(ParseSpanError): 23 | pass 24 | -------------------------------------------------------------------------------- /nlp/utils/factory.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 _*- 3 | 4 | from dataclasses import asdict 5 | from functools import partial 6 | from typing import Callable, List, Optional 7 | 8 | from pydantic import dataclasses 9 | 10 | 11 | @dataclasses.dataclass 12 | class BaseClass: 13 | 14 | def as_dict(self): 15 | return asdict(self) 16 | 17 | 18 | @dataclasses.dataclass 19 | class GoldEntity(BaseClass): 20 | start_index: int 21 | end_index: int 22 | 23 | 24 | @dataclasses.dataclass 25 | class PredEntity(GoldEntity): 26 | start_prob: float 27 | end_prob: float 28 | 29 | 30 | @dataclasses.dataclass 31 | class PredRelation(BaseClass): 32 | rel: int 33 | rel_prob: float 34 | 35 | 36 | @dataclasses.dataclass 37 | class PredTuple(BaseClass): 38 | rel: int 39 | rel_prob: float 40 | ents: List[Optional[PredEntity]] 41 | 42 | 43 | @dataclasses.dataclass 44 | class GoldTuple(BaseClass): 45 | rel: int 46 | ents: List[GoldEntity] 47 | 48 | 49 | class PartialWrapper: 50 | """partial的类别封装,主要是将kwarg作为class的属性来处理""" 51 | 52 | def __init__(self, func: Callable, *args, **kwargs): 
53 | assert isinstance(func, Callable) 54 | self.func = partial(func, *args, **kwargs) 55 | for k, v in kwargs.items(): 56 | self.__setattr__(k, v) 57 | 58 | def __call__(self, *args, **kwargs): 59 | return self.func(*args, **kwargs) 60 | 61 | 62 | class PydanticConfig: 63 | arbitrary_types_allowed = True 64 | -------------------------------------------------------------------------------- /nlp/utils/generate_util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: zhihao.chen@kuwo.cn 6 | @date: 2022/7/29 15:44 7 | """ 8 | import torch 9 | from torch.nn import functional as nnf 10 | 11 | 12 | def top_filtering(logits, top_k=0, top_p=0.0, threshold=-float('Inf'), filter_value=-float('Inf')): 13 | """ Filter a distribution of logits using top-k, top-p (nucleus) and/or threshold filtering 14 | Args: 15 | :param logits: logits distribution shape (vocabulary size) 16 | :param top_k: <=0: no filtering, >0: keep only top k tokens with highest probability. 17 | :param top_p: <=0.0: no filtering, >0.0: keep only a subset S of candidates, where S is the smallest subset 18 | whose total probability mass is greater than or equal to the threshold top_p. 19 | In practice, we select the highest probability tokens whose cumulative probability mass exceeds 20 | the threshold top_p. 21 | :param threshold: a minimal threshold to keep logits 22 | :param filter_value: 23 | """ 24 | assert logits.dim() == 1 # Only work for batch size 1 for now - could update but it would obfuscate a bit the code 25 | top_k = min(top_k, logits.size(-1)) 26 | if top_k > 0: 27 | # Remove all tokens with a probability less than the last token in the top-k tokens 28 | indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] 29 | logits[indices_to_remove] = filter_value 30 | 31 | if top_p > 0.0: 32 | # Compute cumulative probabilities of sorted tokens 33 | sorted_logits, sorted_indices = torch.sort(logits, descending=True) 34 | cumulative_probabilities = torch.cumsum(nnf.softmax(sorted_logits, dim=-1), dim=-1) 35 | 36 | # Remove tokens with cumulative probability above the threshold 37 | sorted_indices_to_remove = cumulative_probabilities > top_p 38 | # Shift the indices to the right to keep also the first token above the threshold 39 | sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() 40 | sorted_indices_to_remove[..., 0] = 0 41 | 42 | # Back to unsorted indices and set them to -infinity 43 | indices_to_remove = sorted_indices[sorted_indices_to_remove] 44 | logits[indices_to_remove] = filter_value 45 | 46 | indices_to_remove = logits < threshold 47 | logits[indices_to_remove] = filter_value 48 | 49 | return logits 50 | -------------------------------------------------------------------------------- /nlp/utils/selection_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: zhihao.chen@kuwo.cn 6 | @date: 2022/9/21 18:03 7 | """ 8 | # 用于存储和加载training dynamic 9 | # 参考自:https://github.com/allenai/cartography/blob/main/cartography/selection/selection_utils.py 10 | import json 11 | import logging 12 | import os 13 | import pandas as pd 14 | import tqdm 15 | 16 | from typing import List 17 | 18 | logging.basicConfig( 19 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", level=logging.INFO 20 | ) 21 | logger = 
logging.getLogger(__name__) 22 | 23 | 24 | def log_training_dynamics(output_dir: os.path, 25 | epoch: int, 26 | train_ids: List[int], 27 | train_logits: List[List[float]], 28 | train_golds: List[int]): 29 | """ 30 | Save training dynamics (logits) from given epoch as records of a `.jsonl` file. 31 | """ 32 | td_df = pd.DataFrame({"guid": train_ids, 33 | f"logits_epoch_{epoch}": train_logits, 34 | "gold": train_golds}) 35 | 36 | logging_dir = os.path.join(output_dir, f"training_dynamics") 37 | # Create directory for logging training dynamics, if it doesn't already exist. 38 | if not os.path.exists(logging_dir): 39 | os.makedirs(logging_dir) 40 | epoch_file_name = os.path.join(logging_dir, f"dynamics_epoch_{epoch}.json") 41 | td_df.to_json(epoch_file_name, lines=True, orient="records") 42 | logger.info(f"Training Dynamics logged to {epoch_file_name}") 43 | 44 | 45 | def read_training_dynamics(model_dir: os.path, 46 | strip_last: bool = False, 47 | id_field: str = "guid", 48 | burn_out: int = None): 49 | """ 50 | Given path to logged training dynamics, merge stats across epochs. 51 | Returns: 52 | - Dict between ID of a train instances and its gold label, and the list of logits across epochs. 53 | """ 54 | train_dynamics = {} 55 | 56 | td_dir = os.path.join(model_dir, "training_dynamics") 57 | num_epochs = len([f for f in os.listdir(td_dir) if os.path.isfile(os.path.join(td_dir, f))]) 58 | if burn_out: 59 | num_epochs = burn_out 60 | 61 | logger.info(f"Reading {num_epochs} files from {td_dir} ...") 62 | for epoch_num in tqdm.tqdm(range(num_epochs)): 63 | epoch_file = os.path.join(td_dir, f"dynamics_epoch_{epoch_num}.json") 64 | assert os.path.exists(epoch_file) 65 | 66 | with open(epoch_file, "r", encoding='utf8') as infile: 67 | for line in infile: 68 | line = line.strip() 69 | if not line: 70 | continue 71 | record = json.loads(line.strip()) 72 | guid = record[id_field] if not strip_last else record[id_field][:-1] 73 | if guid not in train_dynamics: 74 | assert epoch_num == 0 75 | train_dynamics[guid] = {"gold": record["gold"], "logits": []} 76 | train_dynamics[guid]["logits"].append(record[f"logits_epoch_{epoch_num}"]) 77 | 78 | logger.info(f"Read training dynamics for {len(train_dynamics)} train instances.") 79 | return train_dynamics 80 | -------------------------------------------------------------------------------- /nlp/utils/util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: 6 | @date: 2022/11/3 17:39 7 | """ 8 | import torch.nn as nn 9 | 10 | 11 | # Code widely inspired from: 12 | # https://github.com/allenai/allennlp/blob/master/allennlp/nn/util.py. 13 | def masked_softmax(tensor, mask): 14 | """ 15 | Apply a masked softmax on the last dimension of a tensor. 16 | The input tensor and mask should be of size (batch, *, sequence_length). 17 | Args: 18 | tensor: The tensor on which the softmax function must be applied along 19 | the last dimension. 20 | mask: A mask of the same size as the tensor with 0s in the positions of 21 | the values that must be masked and 1s everywhere else. 22 | Returns: 23 | A tensor of the same size as the inputs containing the result of the 24 | softmax. 25 | """ 26 | tensor_shape = tensor.size() 27 | reshaped_tensor = tensor.view(-1, tensor_shape[-1]) 28 | 29 | # Reshape the mask so it matches the size of the input tensor. 
30 | while mask.dim() < tensor.dim(): 31 | mask = mask.unsqueeze(1) 32 | mask = mask.expand_as(tensor).contiguous().float() 33 | reshaped_mask = mask.view(-1, mask.size()[-1]) 34 | 35 | result = nn.functional.softmax(reshaped_tensor * reshaped_mask, dim=-1) 36 | result = result * reshaped_mask 37 | # 1e-13 is added to avoid divisions by zero. 38 | result = result / (result.sum(dim=-1, keepdim=True) + 1e-13) 39 | 40 | return result.view(*tensor_shape) 41 | 42 | 43 | # Code widely inspired from: 44 | # https://github.com/allenai/allennlp/blob/master/allennlp/nn/util.py. 45 | def weighted_sum(tensor, weights, mask): 46 | """ 47 | Apply a weighted sum on the vectors along the last dimension of 'tensor', 48 | and mask the vectors in the result with 'mask'. 49 | Args: 50 | tensor: A tensor of vectors on which a weighted sum must be applied. 51 | weights: The weights to use in the weighted sum. 52 | mask: A mask to apply on the result of the weighted sum. 53 | Returns: 54 | A new tensor containing the result of the weighted sum after the mask 55 | has been applied on it. 56 | """ 57 | weight_sum = weights.bmm(tensor) 58 | 59 | while mask.dim() < weight_sum.dim(): 60 | mask = mask.unsqueeze(1) 61 | mask = mask.transpose(-1, -2) 62 | mask = mask.expand_as(weight_sum).contiguous().float() 63 | 64 | return weight_sum * mask 65 | -------------------------------------------------------------------------------- /nlp/utils/vat_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: 6 | @date: 2022/10/19 16:04 7 | """ 8 | # https://github.com/amazon-research/sentence-representations/blob/main/VaSCL/learners/vat_utils.py 9 | import contextlib 10 | import torch 11 | import torch.nn as nn 12 | 13 | 14 | @contextlib.contextmanager 15 | def _disable_tracking_bn_stats(model): 16 | def switch_attr(m): 17 | if hasattr(m, 'track_running_stats'): 18 | m.track_running_stats ^= True 19 | 20 | model.apply(switch_attr) 21 | yield 22 | model.apply(switch_attr) 23 | 24 | 25 | def _l2_normalize(d, attention_mask=None): 26 | if attention_mask is not None: 27 | attention_mask = attention_mask.unsqueeze(-1) 28 | d *= attention_mask 29 | d_reshaped = d.view(d.shape[0], -1, *(1 for _ in range(d.dim() - 2))) 30 | d /= torch.norm(d_reshaped, dim=1, keepdim=True) + 1e-8 31 | # print("_l2_normalize, BEFORE:{} \t AFTER:{}".format(d.size(), d_reshaped.size())) 32 | return d 33 | 34 | 35 | def _emb_norm(emb): 36 | e_reshaped = emb.view(emb.shape[0], -1, *(1 for _ in range(emb.dim() - 2))) 37 | enorm = torch.norm(e_reshaped, dim=1, keepdim=False) + 1e-8 38 | # print("BEFORE:{} \t AFTER:{}".format(emb.size(), e_reshaped.size())) 39 | # print("enorm:{}, {}".format(enorm.size(), enorm[:10])) 40 | return enorm 41 | 42 | 43 | class VaSCLPturb(nn.Module): 44 | def __init__(self, xi=0.1, eps=1, ip=1, uni_criterion=None, bi_criterion=None): 45 | """VaSCL_Pturb on Transformer embeddings 46 | :param xi: hyperparameter of VaSCL_Pturb (default: 10.0) 47 | :param eps: hyperparameter of VaSCL_Pturb (default: 1.0) 48 | :param ip: iteration times of computing adv noise (default: 1) 49 | """ 50 | super(VaSCLPturb, self).__init__() 51 | self.xi = xi 52 | self.eps = eps 53 | self.ip = ip 54 | self.delta = 1e-08 55 | 56 | self.uni_criterion = uni_criterion 57 | self.bi_criterion = bi_criterion 58 | print("\n VaSCL_Pturb on embeddings, xi:{}, eps:{} \n".format(xi, eps)) 59 | 60 | def forward(self, model, inputs, hard_indices): 61 
| # print(inputs.size(), "\n", _emb_norm(inputs)[:5]) 62 | with torch.no_grad(): 63 | cnst = model.contrast_logits(inputs) 64 | 65 | # prepare random unit tensor 66 | d = torch.rand(inputs.shape).sub(0.5).to(inputs.device) 67 | d = _l2_normalize(d) 68 | 69 | with _disable_tracking_bn_stats(model): 70 | # calc adversarial direction 71 | for _ in range(self.ip): 72 | d.requires_grad_() 73 | cnst_hat = model.contrast_logits(inputs + self.xi * d) 74 | 75 | adv_cnst = self.uni_criterion(cnst, cnst_hat, hard_indices) 76 | adv_distance = adv_cnst['lds_loss'] 77 | 78 | adv_distance.backward(retain_graph=True) 79 | d = _l2_normalize(d.grad) 80 | model.zero_grad() 81 | 82 | cnst_hat = model.contrast_logits(inputs + self.eps * d) 83 | adv_cnst = self.bi_criterion(cnst, cnst_hat, hard_indices) 84 | return adv_cnst 85 | -------------------------------------------------------------------------------- /nlp/utils/whitening_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: zhihao.chen@kuwo.cn 6 | @date: 2022/6/30 10:56 7 | """ 8 | import pickle 9 | import numpy as np 10 | 11 | 12 | def get_embedding(output, pooling_strategy='cls'): 13 | hidden_states = output.hidden_states 14 | if pooling_strategy == 'cls': 15 | output_hidden_state = output.last_hidden_state[:, 0, :] 16 | elif pooling_strategy == 'last_avg': 17 | output_hidden_state = output.last_hidden_state.mean(dim=1) 18 | elif pooling_strategy == 'first_last_avg': 19 | output_hidden_state = hidden_states[-1] + hidden_states[1] 20 | output_hidden_state = output_hidden_state.mean(dim=1) 21 | elif pooling_strategy == 'last2avg': 22 | output_hidden_state = hidden_states[-1] + hidden_states[-2] 23 | output_hidden_state = output_hidden_state.mean(dim=1) 24 | else: 25 | raise ValueError("'pooling_strategy' must one of [fist-last-avg, last-avg, last2avg, cls]") 26 | vec = output_hidden_state.cpu().numpy()[0] 27 | return vec 28 | 29 | 30 | def compute_kernel_bias(vecs): 31 | """计算kernel和bias 32 | 最后的变换:y = (x + bias).dot(kernel) 33 | """ 34 | vecs = np.concatenate(vecs, axis=0) 35 | mu = vecs.mean(axis=0, keepdims=True) 36 | cov = np.cov(vecs.T) 37 | u, s, vh = np.linalg.svd(cov) 38 | w = np.dot(u, np.diag(s ** 0.5)) 39 | w = np.linalg.inv(w.T) 40 | return w, -mu 41 | 42 | 43 | def normalize(vecs): 44 | """标准化 45 | """ 46 | return vecs / (vecs ** 2).sum(axis=1, keepdims=True) ** 0.5 47 | 48 | 49 | def transform_and_normalize(vecs, kernel, bias, dim): 50 | """应用变换,然后标准化 51 | """ 52 | if not (kernel is None or bias is None): 53 | vecs = (vecs + bias).dot(kernel[:, :dim]) 54 | return normalize(vecs) 55 | 56 | 57 | def save_whiten(weight_save_path, kernel, bias): 58 | whiten = { 59 | 'kernel': kernel, 60 | 'bias': bias 61 | } 62 | with open(weight_save_path, 'wb') as f: 63 | pickle.dump(whiten, f) 64 | 65 | 66 | def load_whiten(weight_save_path): 67 | with open(weight_save_path, 'rb') as f: 68 | whiten = pickle.load(f) 69 | kernel = whiten['kernel'] 70 | bias = whiten['bias'] 71 | return kernel, bias 72 | -------------------------------------------------------------------------------- /nlp/utils/wobert_tokenization.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: czh 5 | @email: 6 | @date: 2022/11/11 15:07 7 | """ 8 | import jieba 9 | from transformers import BasicTokenizer, BertTokenizer 10 | 11 | 12 | class 
CustomBasicTokenizer(BasicTokenizer): 13 | def __init__(self, 14 | vocab, 15 | do_lower_case=True, 16 | never_split=None, 17 | tokenize_chinese_chars=True, 18 | strip_accents=None): 19 | super().__init__(do_lower_case=do_lower_case, 20 | never_split=never_split, 21 | tokenize_chinese_chars=tokenize_chinese_chars, 22 | strip_accents=strip_accents) 23 | 24 | self.vocab = vocab 25 | 26 | def _tokenize_chinese_chars(self, text): 27 | output = [] 28 | ''' 29 | 1、输入一个句子s,用pre_tokenize先分一次词,得到[w1,w2,…,wl]; 30 | 2、遍历各个wi,如果wi在词表中则保留,否则将wi用BERT自带的tokenize函数再分一次; 31 | 3、将每个wi的tokenize结果有序拼接起来,作为最后的tokenize结果。 32 | ''' 33 | for wholeword in jieba.cut(text, HMM=False): 34 | if wholeword in self.vocab: 35 | output.append(" ") 36 | output.append(wholeword) 37 | output.append(" ") 38 | else: 39 | for char in wholeword: 40 | cp = ord(char) 41 | if self._is_chinese_char(cp): 42 | output.append(" ") 43 | output.append(char) 44 | output.append(" ") 45 | else: 46 | output.append(char) 47 | return "".join(output) 48 | 49 | 50 | class WoBertTokenizer(BertTokenizer): 51 | def __init__(self, 52 | vocab_file, 53 | do_lower_case=True, 54 | do_basic_tokenize=True, 55 | never_split=None, 56 | unk_token="[UNK]", 57 | sep_token="[SEP]", 58 | pad_token="[PAD]", 59 | cls_token="[CLS]", 60 | mask_token="[MASK]", 61 | tokenize_chinese_chars=True, 62 | strip_accents=None, 63 | **kwargs): 64 | super().__init__(vocab_file, 65 | do_lower_case=do_lower_case, 66 | do_basic_tokenize=do_basic_tokenize, 67 | never_split=never_split, 68 | unk_token=unk_token, 69 | sep_token=sep_token, 70 | pad_token=pad_token, 71 | cls_token=cls_token, 72 | mask_token=mask_token, 73 | tokenize_chinese_chars=tokenize_chinese_chars, 74 | strip_accents=strip_accents, 75 | **kwargs) 76 | if self.do_basic_tokenize: 77 | self.basic_tokenizer = CustomBasicTokenizer( 78 | vocab=self.vocab, 79 | do_lower_case=do_lower_case, 80 | never_split=never_split, 81 | tokenize_chinese_chars=tokenize_chinese_chars, 82 | strip_accents=strip_accents, 83 | ) 84 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | numpy 3 | pydantic 4 | seqeval 5 | torch 6 | transformers 7 | networkx 8 | stanza 9 | bert4keras 10 | Keras~=2.3.1 11 | regex 12 | scikit-learn 13 | requests 14 | tqdm 15 | six 16 | scipy 17 | nltk 18 | PyYAML 19 | sentence_transformers --------------------------------------------------------------------------------
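A minimal usage sketch for the whitening helpers in nlp/utils/whitening_utils.py above; the 1,000 random 768-dimensional vectors, the output dimension of 256, and the file name whiten_weights.pkl are illustrative assumptions, not values taken from this repository:

import numpy as np
from nlp.utils.whitening_utils import compute_kernel_bias, transform_and_normalize, save_whiten

# Stand-in for sentence vectors (e.g. collected via get_embedding); one (n, hidden) array per corpus.
corpus_vecs = [np.random.randn(1000, 768)]

# Estimate the whitening transform y = (x + bias).dot(kernel) from the corpus vectors.
kernel, bias = compute_kernel_bias(corpus_vecs)

# Keep the first 256 whitening directions and L2-normalize the result.
whitened = transform_and_normalize(np.concatenate(corpus_vecs, axis=0), kernel, bias, dim=256)

# Persist the transform for reuse at inference time (illustrative file name).
save_whiten("whiten_weights.pkl", kernel, bias)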