├── .gitignore ├── LICENSE ├── README.md ├── adversarial_training ├── adversarial.py ├── bert_at.py └── lstm_at.py ├── data ├── LCQMC │ ├── lcqmc_dev.csv │ └── lcqmc_train.csv ├── rank │ ├── qa_data.csv │ └── sort_data.csv ├── sentiment │ ├── sentiment.test.data │ ├── sentiment.train.data │ └── sentiment.valid.data └── tnews_public │ ├── dev.csv │ └── train.csv ├── data_augmentation ├── bert_mixup.py ├── data_augmentation.py ├── deepl.py ├── feature.py └── feature_augmentation.py ├── distillation ├── distillation_student.py ├── train_student.py └── train_teacher.py ├── elmoformanylangs ├── __init__.py ├── __main__.py ├── biLM.py ├── configs │ ├── cnn_0_100_512_4096_sample.json │ └── cnn_50_100_512_4096_sample.json ├── dataloader.py ├── elmo.py ├── frontend.py ├── main.py ├── modules │ ├── __init__.py │ ├── classify_layer.py │ ├── elmo.py │ ├── embedding_layer.py │ ├── encoder_base.py │ ├── highway.py │ ├── lstm.py │ ├── lstm_cell_with_projection.py │ ├── token_embedder.py │ └── util.py └── utils.py ├── gpt ├── chat.py ├── chitchat │ ├── __init__.py │ ├── config │ │ └── model_config_dialogue_small.json │ ├── data │ │ └── .gitkeep │ ├── dataset.py │ ├── generate_dialogue_subset.py │ ├── interact.py │ ├── interact_mmi.py │ ├── train.py │ └── vocabulary │ │ └── vocab_small.txt └── gpt_lyric.py ├── pseudo ├── first_stage.py └── second_stage.py ├── ptm ├── .gitkeep ├── post train_bert.py ├── post train_gpt.py └── train_bert.py ├── rank ├── main.py ├── model.py ├── rank.py └── train_ndcg.py ├── text_classification ├── bert.py └── text_classification.py ├── text_representation ├── sentence_embedding.py ├── synonym.py ├── word2vec │ └── .gitkeep └── word2vec_gensim.py ├── text_similarity ├── dssm.py ├── esim.py └── train.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | .idea 132 | text_representation/word2vec/wiki.model 133 | text_representation/word2vec/wiki.model.trainables.syn1neg.npy 134 | text_representation/word2vec/wiki.model.wv.vectors.npy 135 | text_representation/word2vec/wiki.txt 136 | 137 | 138 | ptm/elmo 139 | 140 | gpt/chitchat/dialogue_model 141 | distillation/model.bin 142 | distillation/student.bin 143 | distillation/teacher.bin 144 | pseudo/pseudo.csv 145 | rank/best_model.bin 146 | rank/test.txt -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # nlp_tutorial 2 | 3 | ### 文本表示 4 | 5 | 1、词向量训练数据 6 | 链接:https://pan.baidu.com/s/1cWt2qqH5ym0vLcjihehfGg 提取码:z6dt 7 | 8 | 2、中文版本的ELMo模型 9 | https://pan.baidu.com/s/1RNKnj6hgL-2orQ7f38CauA 10 | 11 | 3、哈工大开源的多语言ELMo源码 12 | https://github.com/HIT-SCIR/ELMoForManyLangs 13 | 14 | 15 | ### 文本匹配模型 16 | 17 | 我自己整理的各种匹配模型 18 | https://blog.csdn.net/u012526436/article/details/90179466 19 | 20 | ### huggingface - transformers 21 | 22 | 模型仓库 23 | https://huggingface.co/models 24 | 25 | transformers 26 | https://github.com/huggingface/transformers 27 | 28 | ### gpt 29 | 30 | 闲聊 31 | https://github.com/yangjianxin1/GPT2-chitchat 32 | -------------------------------------------------------------------------------- /adversarial_training/adversarial.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class AT: 5 | def __init__(self, model): 6 | self.model = model 7 | self.backup = {} 8 | 9 | def attack(self, emb_name='emb.'): 10 | """ 11 | 备份embedding matrix 并添加我们的扰动项 12 | :param emb_name: embedding层的名字 13 | """ 14 | raise NotImplemented 15 | 16 | def restore(self, emb_name='emb.'): 17 | """ 18 | 把embedding matrix的参数恢复 19 | :param emb_name: embedding层的名字 20 | """ 21 | for name, param in self.model.named_parameters(): 22 | if param.requires_grad and emb_name in name: 23 | assert name in self.backup 24 | param.data = self.backup[name] 25 | self.backup = {} 26 | 27 | 28 | class FGM(AT): 29 | def attack(self, epsilon=1., emb_name='emb.'): 30 | for name, param in self.model.named_parameters(): 31 | if param.requires_grad and emb_name in name: 32 | self.backup[name] = param.data.clone() 33 | norm = torch.norm(param.grad) 34 | if norm != 0 and not torch.isnan(norm): 35 | r_at = epsilon * param.grad / norm 36 | param.data.add_(r_at) 37 | 38 | 39 | class FGSM(AT): 40 | def attack(self, epsilon=1., emb_name='emb.'): 41 | for name, param in self.model.named_parameters(): 42 | if param.requires_grad and emb_name in name: 43 | self.backup[name] = param.data.clone() 44 | norm = torch.norm(param.grad) 45 | if norm != 0 and not torch.isnan(norm): 46 | r_at = epsilon * torch.sign(param.grad) 47 | param.data.add_(r_at) 48 | 49 | 50 | class FreeAT(AT): 51 | 52 | def __init__(self, model): 53 | super().__init__(model) 54 | self.grad_backup = {} 55 | 56 | def attack(self, epsilon=0.3, alpha=0.01, emb_name='emb.', first_attack=False): 57 | for name, param in self.model.named_parameters(): 58 | if param.requires_grad and emb_name in name: 59 | if first_attack: 60 | self.backup[name] = param.data.clone() 61 | norm = torch.norm(param.grad) 62 | if norm != 0 and not torch.isnan(norm): 63 | # 得到新的扰动 64 | r_at = alpha * param.grad / norm 65 | r_at = torch.clamp(r_at, - epsilon, epsilon) 66 | # 加到输入上 67 | param.data.add_(r_at) 68 | 69 | def backup_grad(self): 70 | for name, param in self.model.named_parameters(): 71 | if param.requires_grad: 72 | self.grad_backup[name] = param.grad.clone() 73 | 74 | def restore_grad(self): 75 | for name, param in self.model.named_parameters(): 76 | if param.requires_grad: 77 | param.grad = self.grad_backup[name] 78 | 79 | 80 | class FreeLB(AT): 81 | 82 | def __init__(self, model): 83 | super().__init__(model) 84 | self.grad_backup = {} 85 | 86 | def attack(self, epsilon=0.01, alpha=5e-3, emb_name='emb.', first_attack=False): 87 | for name, param in 
self.model.named_parameters(): 88 | if param.requires_grad and emb_name in name: 89 | if first_attack: 90 | r_at = torch.Tensor(1).uniform_(-epsilon, epsilon) 91 | else: 92 | norm = torch.norm(param.grad) 93 | if norm != 0 and not torch.isnan(norm): 94 | r_at = alpha * param.grad / norm 95 | r_at = torch.clamp(r_at, - epsilon, epsilon) 96 | param.data.add_(r_at) 97 | 98 | def backup_grad(self): 99 | for name, param in self.model.named_parameters(): 100 | if param.requires_grad: 101 | self.grad_backup[name] = param.grad.clone() 102 | 103 | def restore_grad(self): 104 | for name, param in self.model.named_parameters(): 105 | if param.requires_grad: 106 | param.grad = self.grad_backup[name] 107 | -------------------------------------------------------------------------------- /adversarial_training/bert_at.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import accuracy_score 2 | from transformers import BertForSequenceClassification, BertTokenizer 3 | from torch.utils.data import DataLoader, Dataset 4 | from tqdm import tqdm 5 | from adversarial_training.adversarial import * 6 | from utils import fix_seed 7 | 8 | tokenizer = BertTokenizer.from_pretrained('E:\\ptm\\roberta') 9 | 10 | 11 | class BaseDataset(Dataset): 12 | def __init__(self, encodings, labels=None): 13 | self.encodings = encodings 14 | self.labels = labels 15 | 16 | def __getitem__(self, idx): 17 | item = {key: val[idx].clone().detach() for key, val in self.encodings.items()} 18 | if self.labels is not None: 19 | item['labels'] = torch.tensor(self.labels[idx]) 20 | return item 21 | 22 | def __len__(self): 23 | return len(self.encodings['input_ids']) 24 | 25 | 26 | def load_data(batch_size=32): 27 | train_text = [] 28 | train_label = [] 29 | with open('../data/sentiment/sentiment.train.data', encoding='utf-8')as file: 30 | for line in file.readlines(): 31 | t, l = line.strip().split('\t') 32 | train_text.append(t) 33 | train_label.append(int(l)) 34 | 35 | train_text = tokenizer(text=train_text, 36 | return_tensors='pt', 37 | truncation=True, 38 | padding=True, 39 | max_length=10) 40 | 41 | train_loader = DataLoader(BaseDataset(train_text, train_label), 42 | batch_size, 43 | pin_memory=True if torch.cuda.is_available() else False, 44 | shuffle=False) 45 | 46 | dev_text = [] 47 | dev_label = [] 48 | with open('../data/sentiment/sentiment.valid.data', encoding='utf-8')as file: 49 | for line in file.readlines(): 50 | t, l = line.strip().split('\t') 51 | dev_text.append(t) 52 | dev_label.append(int(l)) 53 | 54 | dev_text = tokenizer(text=dev_text, 55 | return_tensors='pt', 56 | truncation=True, 57 | padding=True, 58 | max_length=10) 59 | 60 | dev_loader = DataLoader(BaseDataset(dev_text, dev_label), 61 | batch_size, 62 | pin_memory=True if torch.cuda.is_available() else False, 63 | shuffle=False) 64 | 65 | return train_loader, dev_loader 66 | 67 | 68 | # 训练模型 69 | def train(): 70 | fix_seed() 71 | 72 | train_data_loader, dev_data_loader = load_data(32) 73 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 74 | model = BertForSequenceClassification.from_pretrained('E:\\ptm\\roberta', num_labels=2) 75 | model = model.to(device) 76 | optimizer = torch.optim.Adam(model.parameters(), lr=5e-5) 77 | 78 | attack = True 79 | 80 | if attack: 81 | # at = FGM(model) 82 | at = FreeAT(model) 83 | 84 | def adversarial(data): 85 | optimizer.zero_grad() 86 | # 添加扰动 87 | at.attack(emb_name='embeddings.word_embeddings.weight') 88 | # 重新计算梯度 89 | adv_loss = 
model(input_ids=data['input_ids'].to(device), 90 | attention_mask=data['attention_mask'].to(device), 91 | labels=data['labels'].to(device)).loss 92 | # bp得到新的梯度 93 | adv_loss.backward() 94 | at.restore(emb_name='embeddings.word_embeddings.weight') 95 | 96 | def adversarial_free(data, m=3): 97 | # 备份梯度 98 | at.backup_grad() 99 | for i in range(m): 100 | at.attack(emb_name='embeddings.word_embeddings.weight', first_attack=i == 0) 101 | if i == 0: 102 | optimizer.zero_grad() 103 | else: 104 | at.restore_grad() 105 | # fp 106 | adv_loss = model(input_ids=data['input_ids'].to(device), 107 | attention_mask=data['attention_mask'].to(device), 108 | labels=data['labels'].to(device)).loss 109 | # bp得到新的梯度 110 | adv_loss.backward() 111 | at.restore(emb_name='embeddings.word_embeddings.weight') 112 | 113 | for epoch in range(5): 114 | print('epoch:', epoch + 1) 115 | pred = [] 116 | label = [] 117 | pbar = tqdm(train_data_loader) 118 | for data in pbar: 119 | optimizer.zero_grad() 120 | 121 | input_ids = data['input_ids'].to(device) 122 | attention_mask = data['attention_mask'].to(device) 123 | labels = data['labels'].to(device).long() 124 | 125 | outputs = model(input_ids, attention_mask=attention_mask, labels=labels) 126 | output = outputs.logits.argmax(1).cpu().numpy() 127 | pred.extend(output) 128 | label.extend(labels.cpu().numpy()) 129 | loss = outputs.loss 130 | loss.backward() 131 | 132 | if attack: 133 | # adversarial(data) 134 | adversarial_free(data) 135 | 136 | optimizer.step() 137 | 138 | pbar.update() 139 | pbar.set_description(f'loss:{loss.item():.4f}') 140 | 141 | acc = accuracy_score(pred, label) 142 | print('train acc:', acc) 143 | 144 | pred = [] 145 | label = [] 146 | for data in tqdm(dev_data_loader): 147 | input_ids = data['input_ids'].to(device) 148 | attention_mask = data['attention_mask'].to(device) 149 | labels = data['labels'].to(device).long() 150 | with torch.no_grad(): 151 | outputs = model(input_ids, attention_mask=attention_mask, labels=labels) 152 | output = outputs.logits.argmax(1).cpu().numpy() 153 | pred.extend(output) 154 | label.extend(labels.cpu().numpy()) 155 | acc = accuracy_score(pred, label) 156 | print('dev acc:', acc) 157 | print() 158 | 159 | 160 | if __name__ == '__main__': 161 | train() 162 | -------------------------------------------------------------------------------- /adversarial_training/lstm_at.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import jieba 4 | import numpy as np 5 | from collections import defaultdict 6 | from torch.utils.data import DataLoader, TensorDataset 7 | from sklearn.metrics import accuracy_score 8 | 9 | 10 | class TextCLS(torch.nn.Module): 11 | # 准备我们需要用到的参数和layer 12 | def __init__(self, 13 | vocab_size, 14 | embedding_size): 15 | super().__init__() 16 | self.embedding = nn.Embedding(vocab_size, embedding_size) 17 | # [batch_size, seq_len, hidden_size] 18 | self.lstm = nn.LSTM(input_size=embedding_size, 19 | hidden_size=256, 20 | num_layers=2, 21 | batch_first=True) 22 | self.dense1 = nn.Linear(256, 100) 23 | self.dense2 = nn.Linear(100, 2) 24 | 25 | # 前向传播,那我们准备好的layer拼接在一起 26 | def forward(self, x): 27 | embedding = self.embedding(x) 28 | # [batch_size, seq_len, hidden_size] 29 | out, _ = self.lstm(embedding) 30 | out = self.dense1(out[:, -1, :]) 31 | out = self.dense2(out) 32 | return out 33 | 34 | 35 | def tokenize(string): 36 | res = list(jieba.cut(string, cut_all=False)) 37 | return res 38 | 39 | 40 | # 把数据转换成index 41 | def 
seq2index(seq, vocab): 42 | seg = tokenize(seq) 43 | seg_index = [] 44 | for s in seg: 45 | seg_index.append(vocab.get(s, 1)) 46 | return seg_index 47 | 48 | 49 | # 统一长度 50 | def padding_seq(X, max_len=10): 51 | return np.array([ 52 | np.concatenate([x, [0] * (max_len - len(x))]) if len(x) < max_len else x[:max_len] for x in X 53 | ]) 54 | 55 | 56 | def load_data(batch_size=32): 57 | train_text = [] 58 | train_label = [] 59 | with open('../data/sentiment/sentiment.train.data', encoding='utf-8')as file: 60 | for line in file.readlines(): 61 | t, l = line.strip().split('\t') 62 | train_text.append(t) 63 | train_label.append(int(l)) 64 | 65 | dev_text = [] 66 | dev_label = [] 67 | with open('../data/sentiment/sentiment.valid.data', encoding='utf-8')as file: 68 | for line in file.readlines(): 69 | t, l = line.strip().split('\t') 70 | dev_text.append(t) 71 | dev_label.append(int(l)) 72 | 73 | # 生成词典 74 | segment = [tokenize(t) for t in train_text] 75 | 76 | word_frequency = defaultdict(int) 77 | for row in segment: 78 | for i in row: 79 | word_frequency[i] += 1 80 | 81 | word_sort = sorted(word_frequency.items(), key=lambda x: x[1], reverse=True) # 根据词频降序排序 82 | 83 | vocab = {'[PAD]': 0, '[UNK]': 1} 84 | for d in word_sort: 85 | vocab[d[0]] = len(vocab) 86 | 87 | train_x = padding_seq([seq2index(t, vocab) for t in train_text]) 88 | train_y = np.array(train_label) 89 | train_data_set = TensorDataset(torch.from_numpy(train_x), 90 | torch.from_numpy(train_y)) 91 | train_data_loader = DataLoader(dataset=train_data_set, batch_size=batch_size) 92 | 93 | dev_x = padding_seq([seq2index(t, vocab) for t in dev_text]) 94 | dev_y = np.array(dev_label) 95 | dev_data_set = TensorDataset(torch.from_numpy(dev_x), 96 | torch.from_numpy(dev_y)) 97 | dev_data_loader = DataLoader(dataset=dev_data_set, batch_size=batch_size) 98 | 99 | return train_data_loader, dev_data_loader, vocab 100 | 101 | 102 | # 训练模型 103 | def train(): 104 | train_data_loader, dev_data_loader, vocab = load_data(128) 105 | model = TextCLS(vocab_size=len(vocab), 106 | embedding_size=100) 107 | 108 | optimizer = torch.optim.Adam(model.parameters(), lr=0.01) 109 | loss_func = nn.CrossEntropyLoss() 110 | 111 | if torch.cuda.is_available(): 112 | model = model.cuda() 113 | 114 | backup = {} 115 | epsilon = 1 116 | attack = False 117 | 118 | for epoch in range(5): 119 | print('epoch:', epoch + 1) 120 | pred = [] 121 | label = [] 122 | for step, (b_x, b_y) in enumerate(train_data_loader): 123 | optimizer.zero_grad() 124 | if torch.cuda.is_available(): 125 | b_x = b_x.cuda().long() 126 | b_y = b_y.cuda().long() 127 | output = model(b_x) 128 | pred.extend(torch.argmax(output, dim=1).cpu().numpy()) 129 | label.extend(b_y.cpu().numpy()) 130 | loss = loss_func(output, b_y) 131 | loss.backward() 132 | 133 | if attack: 134 | # 备份参数,添加扰动 135 | for name, param in model.embedding.named_parameters(): 136 | backup[name] = param.data.clone() 137 | norm = torch.norm(param.grad) 138 | if norm != 0 and not torch.isnan(norm): 139 | r_at = epsilon * param.grad / norm 140 | param.data.add_(r_at) 141 | 142 | # 第二次fp与bp 143 | optimizer.zero_grad() 144 | output = model(b_x) 145 | loss = loss_func(output, b_y) 146 | loss.backward() 147 | 148 | # 恢复参数 149 | for name, param in model.embedding.named_parameters(): 150 | param.data = backup[name] 151 | backup = {} 152 | 153 | # 更新权重 154 | optimizer.step() 155 | acc = accuracy_score(pred, label) 156 | print('train acc:', acc) 157 | 158 | pred = [] 159 | label = [] 160 | for step, (b_x, b_y) in enumerate(dev_data_loader): 161 | 
if torch.cuda.is_available(): 162 | b_x = b_x.cuda().long() 163 | b_y = b_y.cuda().long() 164 | with torch.no_grad(): 165 | output = model(b_x) 166 | pred.extend(torch.argmax(output, dim=1).cpu().numpy()) 167 | label.extend(b_y.cpu().numpy()) 168 | acc = accuracy_score(pred, label) 169 | print('dev acc:', acc) 170 | print() 171 | 172 | 173 | if __name__ == '__main__': 174 | train() 175 | -------------------------------------------------------------------------------- /data_augmentation/bert_mixup.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import accuracy_score 2 | from transformers import BertForSequenceClassification, BertTokenizer 3 | from torch.utils.data import DataLoader, Dataset 4 | from tqdm import tqdm 5 | from utils import fix_seed 6 | import torch 7 | import pandas as pd 8 | from feature_augmentation import MixUp 9 | 10 | path = 'E:\\ptm\\roberta' 11 | tokenizer = BertTokenizer.from_pretrained(path) 12 | 13 | 14 | class BaseDataset(Dataset): 15 | def __init__(self, encodings, labels=None): 16 | self.encodings = encodings 17 | self.labels = labels 18 | 19 | def __getitem__(self, idx): 20 | item = {key: val[idx].clone().detach() for key, val in self.encodings.items()} 21 | if self.labels is not None: 22 | item['labels'] = torch.tensor(self.labels[idx]) 23 | return item 24 | 25 | def __len__(self): 26 | return len(self.encodings['input_ids']) 27 | 28 | 29 | def load_data(batch_size=32): 30 | train_df = pd.read_csv('../data/tnews_public/train.csv') 31 | train_text = train_df['text'].tolist() 32 | train_label = train_df['label'].tolist() 33 | train_text = tokenizer(text=train_text, 34 | return_tensors='pt', 35 | truncation=True, 36 | padding=True, 37 | max_length=32) 38 | train_loader = DataLoader(BaseDataset(train_text, train_label), 39 | batch_size, 40 | pin_memory=True if torch.cuda.is_available() else False, 41 | shuffle=False) 42 | 43 | dev_df = pd.read_csv('../data/tnews_public/dev.csv') 44 | dev_text = dev_df['text'].tolist() 45 | dev_label = dev_df['label'].tolist() 46 | dev_text = tokenizer(text=dev_text, 47 | return_tensors='pt', 48 | truncation=True, 49 | padding=True, 50 | max_length=32) 51 | 52 | dev_loader = DataLoader(BaseDataset(dev_text, dev_label), 53 | batch_size, 54 | pin_memory=True if torch.cuda.is_available() else False, 55 | shuffle=False) 56 | 57 | return train_loader, dev_loader 58 | 59 | 60 | # 训练模型 61 | def train(): 62 | fix_seed() 63 | 64 | train_data_loader, dev_data_loader = load_data(128) 65 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 66 | 67 | model = BertForSequenceClassification.from_pretrained(path, num_labels=4) 68 | model = model.to(device) 69 | optimizer = torch.optim.Adam(model.parameters(), lr=5e-5) 70 | 71 | mixup = MixUp(model, tokenizer, 4) 72 | 73 | best_acc = 0 74 | for epoch in range(5): 75 | print('epoch:', epoch + 1) 76 | pred = [] 77 | label = [] 78 | pbar = tqdm(train_data_loader) 79 | for data in pbar: 80 | # zero_grad,backward之后梯度都会进行累加 81 | optimizer.zero_grad() 82 | 83 | input_ids = data['input_ids'].to(device) 84 | attention_mask = data['attention_mask'].to(device) 85 | labels = data['labels'].to(device).long() 86 | 87 | outputs = model(input_ids, attention_mask=attention_mask, labels=labels) 88 | output = outputs.logits.argmax(1).cpu().numpy() 89 | pred.extend(output) 90 | label.extend(labels.cpu().numpy()) 91 | loss = outputs.loss / 2 92 | loss.backward() 93 | 94 | mix_loss = mixup.augmentation(data) / 2 95 | 
mix_loss.backward() 96 | 97 | optimizer.step() 98 | 99 | pbar.update() 100 | pbar.set_description(f'loss:{loss.item():.4f}') 101 | 102 | acc = accuracy_score(pred, label) 103 | print('train acc:', acc) 104 | 105 | pred = [] 106 | label = [] 107 | for data in tqdm(dev_data_loader): 108 | input_ids = data['input_ids'].to(device) 109 | attention_mask = data['attention_mask'].to(device) 110 | labels = data['labels'].to(device).long() 111 | with torch.no_grad(): 112 | outputs = model(input_ids, attention_mask=attention_mask, labels=labels) 113 | output = outputs.logits.argmax(1).cpu().numpy() 114 | pred.extend(output) 115 | label.extend(labels.cpu().numpy()) 116 | acc = accuracy_score(pred, label) 117 | print('dev acc:', acc) 118 | print() 119 | if acc > best_acc: 120 | torch.save(model.state_dict(), 'teacher.bin') 121 | best_acc = acc 122 | 123 | 124 | if __name__ == '__main__': 125 | train() 126 | -------------------------------------------------------------------------------- /data_augmentation/data_augmentation.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | from transformers import BertTokenizer, BertForMaskedLM 4 | from utils import get_device, punctuation 5 | import jieba 6 | 7 | 8 | class EDA: 9 | """ 10 | 替换同意词、插入同意词、交换词的顺序、删除词 11 | """ 12 | 13 | def __init__(self): 14 | import synonyms 15 | self.synonyms = synonyms 16 | self.stop_words = synonyms.synonyms._stopwords 17 | self.word_dic = {} 18 | 19 | def augmentation(self, text, rate=0.3): 20 | 21 | replace_text = self.replace(text, rate=rate) 22 | insert_text = self.insert(text, rate=rate) 23 | swap_text = self.swap(text) 24 | delete_text = self.delete(text, rate=rate) 25 | all_text = list({text, replace_text, insert_text, swap_text, delete_text}) 26 | return all_text 27 | 28 | def replace(self, text, rate=0.2): 29 | segment = list(jieba.cut(text)) 30 | words_index = [] 31 | for i, s in enumerate(segment): 32 | if s not in self.stop_words: 33 | words_index.append(i) 34 | if not words_index: 35 | return text 36 | num = max(1, round(len(words_index) * rate)) 37 | index = [random.choice(words_index) for _ in range(num)] 38 | 39 | for i in index: 40 | try: 41 | if segment[i] in self.word_dic.keys(): 42 | segment[i] = self.word_dic[segment[i]] 43 | else: 44 | new_word = self.synonyms.nearby(segment[i])[0][1] 45 | segment[i] = new_word 46 | except: 47 | pass 48 | 49 | return ''.join(segment) 50 | 51 | def insert(self, text, rate=0.2): 52 | segment = list(jieba.cut(text)) 53 | words_index = [] 54 | for i, s in enumerate(segment): 55 | if s not in self.stop_words: 56 | words_index.append(i) 57 | if not words_index: 58 | return text 59 | num = max(1, round(len(words_index) * rate)) 60 | index = [random.choice(words_index) for _ in range(num)] 61 | 62 | for i in index: 63 | try: 64 | if segment[i] in self.word_dic.keys(): 65 | segment[i] = self.word_dic[segment[i]] 66 | else: 67 | new_word = self.synonyms.nearby(segment[i])[0][1] 68 | segment[i] = new_word 69 | except: 70 | pass 71 | 72 | return ''.join(segment) 73 | 74 | def swap(self, text): 75 | segment = list(jieba.cut(text)) 76 | if len(segment) <= 2: 77 | return text 78 | 79 | choice_word = [1, 1] 80 | while choice_word[0] == choice_word[1]: 81 | choice_word = random.choices(segment, k=2) 82 | 83 | segment[segment.index(choice_word[0])] = choice_word[1] 84 | segment[segment.index(choice_word[1])] = choice_word[0] 85 | 86 | return ''.join(segment) 87 | 88 | def delete(self, text, rate=0.2): 89 | segment = 
list(jieba.cut(text)) 90 | for i in range(len(segment)): 91 | if random.random() < rate: 92 | segment[i] = '' 93 | return ''.join(segment) 94 | 95 | 96 | class AEDA: 97 | """ 98 | 随机添加标点 99 | https://arxiv.org/pdf/2108.13230.pdf 100 | """ 101 | 102 | def __init__(self): 103 | self.punctuation = punctuation() 104 | 105 | def augmentation(self, text): 106 | length = int(len(text) * 0.3) 107 | if length < 2: 108 | return text 109 | punc_len = random.randint(1, length) 110 | puncs = random.choices(self.punctuation, k=punc_len) 111 | text = list(text) 112 | for p in puncs: 113 | text.insert(random.randint(0, len(text) - 1), p) 114 | return ''.join(text) 115 | 116 | 117 | class BackTranslation: 118 | """ 119 | 回译 120 | """ 121 | 122 | def __init__(self): 123 | from deepl import DeepL 124 | self.deep = DeepL() 125 | 126 | def augmentation(self, text): 127 | english = self.deep.translate('zh', 'en', text) 128 | translate = self.deep.translate('en', 'zh', english) 129 | return translate 130 | 131 | 132 | class WoTokenizer(BertTokenizer): 133 | def __init__(self, pre_tokenizer=lambda x: jieba.cut(x, HMM=False), *args, **kwargs): 134 | super().__init__(*args, **kwargs) 135 | self.pre_tokenizer = pre_tokenizer 136 | 137 | def _tokenize(self, text, *arg, **kwargs): 138 | split_tokens = [] 139 | for word in self.pre_tokenizer(text): 140 | if word in self.vocab: 141 | split_tokens.append(word) 142 | else: 143 | split_tokens.extend(super()._tokenize(word)) 144 | return split_tokens 145 | 146 | 147 | class LMAug: 148 | """ 149 | 基于mlm的数据增强,这里使用了wobert,对词mask 150 | """ 151 | 152 | def __init__(self): 153 | model_path = 'E:\\ptm\\wobert' 154 | self.tokenizer = WoTokenizer.from_pretrained(model_path) 155 | self.model = BertForMaskedLM.from_pretrained(model_path).eval().to(get_device()) 156 | 157 | def augmentation(self, text, topk=3): 158 | input_ids = self.tokenizer(text, return_tensors='pt')['input_ids'][0] 159 | random_index = random.randint(1, len(input_ids) - 2) 160 | input_ids[random_index] = 103 161 | # mask_text = ''.join(segment) 162 | # 163 | # input_ids = self.tokenizer(mask_text, return_tensors='pt', max_length=512)['input_ids'].to(get_device()) 164 | mask_index = [i for i, d in enumerate(input_ids) if d == 103] 165 | mask = input_ids == 103 166 | 167 | res = self.model(input_ids[None, :].to(get_device())).logits[0][mask] 168 | sort_res = torch.argsort(res, dim=1, descending=True) 169 | index = sort_res[:, 0:topk] 170 | 171 | out_text = [] 172 | for idx in index.T: 173 | new_input_ids = input_ids 174 | for i, m_idx in zip(idx, mask_index): 175 | new_input_ids[m_idx] = i 176 | text = self.tokenizer.convert_ids_to_tokens(new_input_ids) 177 | text = ''.join(text[1:-1]).replace('#', '') 178 | out_text.append(text) 179 | return out_text 180 | 181 | 182 | class Augmentation: 183 | """ 184 | 数据增强 185 | 前4条是EDA,第5条是AEDA,第6条是回译,第7-9条是MLM,最后一条是GPT 186 | """ 187 | 188 | def __init__(self, use_br=False, aug_list=None): 189 | if aug_list is None: 190 | aug_list = [ 191 | EDA(), 192 | AEDA(), 193 | LMAug() 194 | ] 195 | if use_br: 196 | aug_list.append(BackTranslation()) 197 | self.aug_list = aug_list 198 | 199 | def augmentation(self, text): 200 | aug_text = [] 201 | for aug in self.aug_list: 202 | text_res = aug.augmentation(text) 203 | if isinstance(text_res, str): 204 | aug_text.append(text_res) 205 | elif isinstance(text_res, list): 206 | aug_text.extend(text_res) 207 | 208 | return aug_text 209 | 210 | 211 | if __name__ == '__main__': 212 | # print(EDA().augmentation('今天天气真好啊')) 213 | # 
print(AEDA().augmentation('今天天气真好啊')) 214 | # print(BackTranslation().augmentation('今天天气真好啊')) 215 | print(LMAug().augmentation('今天天气真好啊', topk=5)) 216 | -------------------------------------------------------------------------------- /data_augmentation/deepl.py: -------------------------------------------------------------------------------- 1 | from webdriver_manager.chrome import ChromeDriverManager 2 | from selenium import webdriver 3 | from selenium.webdriver.chrome.options import Options 4 | import time 5 | from bs4 import BeautifulSoup 6 | import urllib.parse 7 | 8 | 9 | class DeepL: 10 | def __init__(self): 11 | options = Options() 12 | options.add_argument('--headless') 13 | 14 | self.driver = webdriver.Chrome(ChromeDriverManager().install(), options=options) 15 | 16 | def translate(self, from_lang: str, to_lang: str, from_text: str) -> str: 17 | sleep_time = 1 18 | from_text = urllib.parse.quote(from_text) 19 | url = 'https://www.deepl.com/translator#' \ 20 | + from_lang + '/' + to_lang + '/' + from_text 21 | self.driver.get(url) 22 | self.driver.implicitly_wait(10) 23 | to_text = None 24 | for i in range(30): 25 | time.sleep(sleep_time) 26 | html = self.driver.page_source 27 | to_text = self.get_text_from_page_source(html) 28 | 29 | if to_text: 30 | break 31 | return to_text 32 | 33 | def get_text_from_page_source(self, html: str) -> str: 34 | soup = BeautifulSoup(html, features='html.parser') 35 | target_elem = soup.find(class_="lmt__translations_as_text__text_btn") 36 | text = None 37 | if target_elem is not None: 38 | text = target_elem.text 39 | return text 40 | 41 | 42 | if __name__ == '__main__': 43 | content = """ 44 | We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models, BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be fine-tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task-specific architecture modifications. 45 | BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE score to 80.5% (7.7% point absolute improvement), MultiNLI accuracy to 86.7% (4.6% absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement). 
46 | """ 47 | res = DeepL().translate('en', 'zh', content) 48 | print(res) 49 | -------------------------------------------------------------------------------- /data_augmentation/feature.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | import numpy as np 4 | from utils import get_device, one_hot 5 | 6 | 7 | class MixUp: 8 | """ 9 | 1. normal training pass, then backprop 10 | 2. mixup pass, then backprop 11 | 3. accumulate the gradients and update the weights 12 | """ 13 | 14 | def __init__(self, model, tokenizer, num_labels, layer='embedding'): 15 | self.tokenizer = tokenizer 16 | self.model = model 17 | self.device = get_device() 18 | self.num_labels = num_labels 19 | self.layer = layer 20 | 21 | def cross_entropy(self, logits, label): 22 | exp_logits = torch.exp(logits) 23 | log_prob = logits - torch.log(torch.sum(exp_logits, dim=1, keepdim=True)) 24 | return -torch.mean(torch.sum(log_prob * label, dim=1)) 25 | 26 | def augmentation(self, data): 27 | # data is one batch from the DataLoader 28 | input_ids = data['input_ids'].to(self.device) 29 | attention_mask = data['attention_mask'].to(self.device) 30 | label = data['labels'].to(self.device).long() 31 | 32 | # shuffle the samples within the batch (e.g. batch_size = 4) 33 | batch_size = len(input_ids) 34 | # e.g. [1, 3, 2, 0], a random permutation of the batch indices 35 | index = torch.randperm(batch_size).to(self.device) 36 | lam = np.random.beta(0.5, 0.5) 37 | 38 | label_mix = one_hot(label, self.num_labels) * lam + one_hot(label[index], self.num_labels) * (1 - lam) 39 | 40 | def my_hook(module, inputs, outputs): 41 | x_mix = outputs * lam + outputs[index] * (1 - lam) 42 | return x_mix 43 | 44 | # PyTorch forward hook that mixes the embedding outputs 45 | hook = self.model.bert.embeddings.register_forward_hook(my_hook) 46 | 47 | outputs = self.model(input_ids, attention_mask) 48 | logits = outputs.logits 49 | hook.remove() 50 | 51 | loss = self.cross_entropy(logits, label_mix) 52 | return loss 53 | -------------------------------------------------------------------------------- /data_augmentation/feature_augmentation.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | import numpy as np 4 | from utils import get_device, one_hot 5 | 6 | 7 | class MixUp: 8 | 9 | def __init__(self, model, tokenizer, num_labels, layer='embedding'): 10 | self.tokenizer = tokenizer 11 | self.model = model 12 | self.device = get_device() 13 | self.num_labels = num_labels 14 | self.layer = layer 15 | 16 | def cross_entropy(self, logits, labels): 17 | exp_logits = torch.exp(logits) 18 | log_prob = logits - torch.log(torch.sum(exp_logits, dim=1, keepdim=True)) 19 | return -torch.mean(torch.sum(log_prob * labels, dim=1)) 20 | 21 | def augmentation(self, data): 22 | input_ids = data['input_ids'].to(self.device) 23 | attention_mask = data['attention_mask'].to(self.device) 24 | label = data['labels'].to(self.device).long() 25 | 26 | batch_size = len(input_ids) 27 | # shuffled indices used to build the mixed embedding 28 | index = torch.randperm(batch_size).to(self.device) 29 | lam = np.random.beta(0.5, 0.5) 30 | 31 | label_mix = one_hot(label, self.num_labels) * lam + one_hot(label[index], self.num_labels) * (1 - lam) 32 | hook = None 33 | 34 | def single_forward_hook(module, inputs, outputs): 35 | mix_input = outputs * lam + outputs[index] * (1 - lam) 36 | return mix_input 37 | 38 | def multi_forward_hook(module, inputs, outputs): 39 | mix_input = outputs[0] * lam + outputs[0][index] * (1 - lam) 40 | return tuple([mix_input]) 41 | 42 | if self.layer == 'embedding': 43 | hook = self.model.bert.embeddings.register_forward_hook(single_forward_hook) 44 | elif self.layer == 'pooler': 45 | 
hook = self.model.bert.pooler.register_forward_hook(single_forward_hook) 46 | elif self.layer == 'inner': 47 | # 随机选一层 48 | layer_num = random.randint(1, self.model.config.num_hidden_layers) - 1 49 | hook = self.model.bert.encoder.layer[layer_num].register_forward_hook(multi_forward_hook) 50 | 51 | outputs = self.model(input_ids=input_ids, 52 | attention_mask=attention_mask, 53 | labels=label.to(self.device)) 54 | logits = outputs.logits 55 | hook.remove() 56 | 57 | # 计算loss 58 | loss = self.cross_entropy(logits, label_mix) 59 | return loss 60 | -------------------------------------------------------------------------------- /distillation/distillation_student.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from sklearn.metrics import accuracy_score 3 | import pandas as pd 4 | from torch.utils.data import DataLoader, Dataset 5 | from tqdm import tqdm 6 | from utils import fix_seed 7 | import torch 8 | from transformers import BertForSequenceClassification, BertTokenizer 9 | from torch import softmax 10 | 11 | path = 'E:\\ptm\\roberta' 12 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 13 | tokenizer = BertTokenizer.from_pretrained(path) 14 | teacher = BertForSequenceClassification.from_pretrained(path, num_labels=4) 15 | # 在线蒸馏,不加载老师的权重 16 | teacher.load_state_dict(torch.load('teacher.bin', map_location=device)) 17 | teacher = teacher.to(device) 18 | teacher.eval() 19 | 20 | 21 | class TextCLS(torch.nn.Module): 22 | # 准备我们需要用到的参数和layer 23 | def __init__(self, 24 | embedding_size, 25 | vocab_size=21128): 26 | super().__init__() 27 | self.embedding = nn.Embedding(vocab_size, embedding_size) 28 | # [batch_size, seq_len, hidden_size] 29 | self.lstm = nn.LSTM(input_size=embedding_size, 30 | hidden_size=256, 31 | num_layers=2, 32 | batch_first=True) 33 | self.dense1 = nn.Linear(256, 100) 34 | self.dense2 = nn.Linear(100, 4) 35 | 36 | # 前向传播,那我们准备好的layer拼接在一起 37 | def forward(self, x): 38 | embedding = self.embedding(x) 39 | # [batch_size, seq_len, hidden_size] 40 | out, _ = self.lstm(embedding) 41 | # 计算mask的和 index = sum(mask)-1 42 | # out[:, index, :] 43 | out = self.dense1(out[:, -1, :]) 44 | out = self.dense2(out) 45 | return out 46 | 47 | 48 | class BaseDataset(Dataset): 49 | def __init__(self, encodings, labels=None): 50 | self.encodings = encodings 51 | self.labels = labels 52 | 53 | def __getitem__(self, idx): 54 | item = {key: val[idx].clone().detach() for key, val in self.encodings.items()} 55 | if self.labels is not None: 56 | item['labels'] = torch.tensor(self.labels[idx]) 57 | return item 58 | 59 | def __len__(self): 60 | return len(self.encodings['input_ids']) 61 | 62 | 63 | def load_data(batch_size=32): 64 | train_df = pd.read_csv('../data/tnews_public/train.csv') 65 | train_text = train_df['text'].tolist() 66 | train_label = train_df['label'].tolist() 67 | train_text = tokenizer(text=train_text, 68 | return_tensors='pt', 69 | truncation=True, 70 | padding=True, 71 | max_length=20) 72 | train_loader = DataLoader(BaseDataset(train_text, train_label), 73 | batch_size, 74 | pin_memory=True if torch.cuda.is_available() else False, 75 | shuffle=False) 76 | 77 | dev_df = pd.read_csv('../data/tnews_public/dev.csv') 78 | dev_text = dev_df['text'].tolist() 79 | dev_label = dev_df['label'].tolist() 80 | dev_text = tokenizer(text=dev_text, 81 | return_tensors='pt', 82 | truncation=True, 83 | padding=True, 84 | max_length=20) 85 | dev_loader = DataLoader(BaseDataset(dev_text, dev_label), 86 
| batch_size, 87 | pin_memory=True if torch.cuda.is_available() else False, 88 | shuffle=False) 89 | 90 | return train_loader, dev_loader 91 | 92 | 93 | def CE(pred, label, t=1): 94 | pred = softmax(pred / t, dim=-1) 95 | label = softmax(label / t, dim=-1) 96 | loss = -torch.sum(torch.log(pred) * label) 97 | return loss 98 | 99 | 100 | # 训练模型 101 | def train(): 102 | fix_seed() 103 | 104 | train_data_loader, dev_data_loader = load_data(64) 105 | student = TextCLS(embedding_size=100) 106 | student = student.to(device) 107 | # 优化器要保留老师和学生模型的参数 108 | optimizer = torch.optim.Adam(student.parameters(), lr=0.01) 109 | loss_func = nn.CrossEntropyLoss() 110 | 111 | best_acc = 0 112 | for epoch in range(10): 113 | print('epoch:', epoch + 1) 114 | pbar = tqdm(train_data_loader) 115 | for data in pbar: 116 | optimizer.zero_grad() 117 | 118 | input_ids = data['input_ids'].to(device) 119 | attention_mask = data['attention_mask'].to(device) 120 | labels = data['labels'].to(device).long() 121 | 122 | # 离线蒸馏 123 | # hard target 124 | # 学生模型学习真实的y标 125 | output = student(input_ids) 126 | loss1 = loss_func(output, labels) 127 | 128 | # soft target 129 | # 学生模型学习老师模型的输出结果,提升学生模型的泛化能力 130 | with torch.no_grad(): 131 | outputs = teacher(input_ids, attention_mask=attention_mask, labels=labels) 132 | # outputs = teacher(input_ids, attention_mask=attention_mask, labels=labels) 133 | teacher_out = outputs.logits 134 | loss2 = CE(output, teacher_out, t=2) 135 | 136 | # loss3 = loss2(teacher_out,labels) 137 | 138 | loss = loss1 + 0.25 * loss2 139 | loss.backward() 140 | 141 | optimizer.step() 142 | 143 | pbar.update() 144 | pbar.set_description(f'loss:{loss.item():.4f}') 145 | 146 | pred = [] 147 | label = [] 148 | for data in tqdm(dev_data_loader): 149 | input_ids = data['input_ids'].to(device) 150 | labels = data['labels'].to(device).long() 151 | with torch.no_grad(): 152 | output = student(input_ids) 153 | pred.extend(torch.argmax(output, dim=1).cpu().numpy()) 154 | label.extend(labels.cpu().numpy()) 155 | acc = accuracy_score(pred, label) 156 | print('dev acc:', acc) 157 | print() 158 | if acc > best_acc: 159 | torch.save(student.state_dict(), 'student.bin') 160 | best_acc = acc 161 | 162 | 163 | if __name__ == '__main__': 164 | train() 165 | -------------------------------------------------------------------------------- /distillation/train_student.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from sklearn.metrics import accuracy_score 3 | import pandas as pd 4 | from torch.utils.data import DataLoader, Dataset 5 | from tqdm import tqdm 6 | from utils import fix_seed 7 | import torch 8 | from transformers import BertTokenizer 9 | from torch import softmax 10 | 11 | path = 'E:\\ptm\\roberta' 12 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 13 | tokenizer = BertTokenizer.from_pretrained(path) 14 | 15 | 16 | class TextCLS(torch.nn.Module): 17 | # 准备我们需要用到的参数和layer 18 | def __init__(self, 19 | embedding_size, 20 | vocab_size=21128): 21 | super().__init__() 22 | self.embedding = nn.Embedding(vocab_size, embedding_size) 23 | # [batch_size, seq_len, hidden_size] 24 | self.lstm = nn.LSTM(input_size=embedding_size, 25 | hidden_size=256, 26 | num_layers=2, 27 | batch_first=True) 28 | self.dense1 = nn.Linear(256, 100) 29 | self.dense2 = nn.Linear(100, 4) 30 | 31 | # 前向传播,那我们准备好的layer拼接在一起 32 | def forward(self, x): 33 | embedding = self.embedding(x) 34 | # [batch_size, seq_len, hidden_size] 35 | out, _ = 
self.lstm(embedding) 36 | out = self.dense1(out[:, -1, :]) 37 | out = self.dense2(out) 38 | return out 39 | 40 | 41 | class BaseDataset(Dataset): 42 | def __init__(self, encodings, labels=None): 43 | self.encodings = encodings 44 | self.labels = labels 45 | 46 | def __getitem__(self, idx): 47 | item = {key: val[idx].clone().detach() for key, val in self.encodings.items()} 48 | if self.labels is not None: 49 | item['labels'] = torch.tensor(self.labels[idx]) 50 | return item 51 | 52 | def __len__(self): 53 | return len(self.encodings['input_ids']) 54 | 55 | 56 | def load_data(batch_size=32): 57 | train_df = pd.read_csv('../data/tnews_public/train.csv') 58 | train_text = train_df['text'].tolist() 59 | train_label = train_df['label'].tolist() 60 | train_text = tokenizer(text=train_text, 61 | return_tensors='pt', 62 | truncation=True, 63 | padding=True, 64 | max_length=32) 65 | train_loader = DataLoader(BaseDataset(train_text, train_label), 66 | batch_size, 67 | pin_memory=True if torch.cuda.is_available() else False, 68 | shuffle=False) 69 | 70 | dev_df = pd.read_csv('../data/tnews_public/dev.csv') 71 | dev_text = dev_df['text'].tolist() 72 | dev_label = dev_df['label'].tolist() 73 | dev_text = tokenizer(text=dev_text, 74 | return_tensors='pt', 75 | truncation=True, 76 | padding=True, 77 | max_length=32) 78 | dev_loader = DataLoader(BaseDataset(dev_text, dev_label), 79 | batch_size, 80 | pin_memory=True if torch.cuda.is_available() else False, 81 | shuffle=False) 82 | 83 | return train_loader, dev_loader 84 | 85 | 86 | # 训练模型 87 | def train(): 88 | fix_seed() 89 | 90 | train_data_loader, dev_data_loader = load_data(32) 91 | student = TextCLS(embedding_size=100) 92 | student = student.to(device) 93 | optimizer = torch.optim.Adam(student.parameters(), lr=0.01) 94 | loss_func = nn.CrossEntropyLoss() 95 | 96 | best_acc = 0 97 | for epoch in range(20): 98 | print('epoch:', epoch + 1) 99 | pred = [] 100 | label = [] 101 | pbar = tqdm(train_data_loader) 102 | for data in pbar: 103 | optimizer.zero_grad() 104 | 105 | input_ids = data['input_ids'].to(device) 106 | labels = data['labels'].to(device) 107 | 108 | output = student(input_ids) 109 | loss = loss_func(output, labels) 110 | 111 | pred.extend(torch.argmax(output, dim=1).cpu().numpy()) 112 | label.extend(labels) 113 | loss.backward() 114 | 115 | optimizer.step() 116 | 117 | pbar.update() 118 | pbar.set_description(f'loss:{loss.item():.4f}') 119 | 120 | pred = [] 121 | label = [] 122 | for data in tqdm(dev_data_loader): 123 | input_ids = data['input_ids'].to(device) 124 | labels = data['labels'].to(device).long() 125 | with torch.no_grad(): 126 | output = student(input_ids) 127 | pred.extend(torch.argmax(output, dim=1).cpu().numpy()) 128 | label.extend(labels.cpu().numpy()) 129 | acc = accuracy_score(pred, label) 130 | print('dev acc:', acc) 131 | print() 132 | if acc > best_acc: 133 | torch.save(student.state_dict(), 'model.bin') 134 | best_acc = acc 135 | 136 | 137 | if __name__ == '__main__': 138 | train() 139 | -------------------------------------------------------------------------------- /distillation/train_teacher.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import accuracy_score 2 | from transformers import BertForSequenceClassification, BertTokenizer 3 | from torch.utils.data import DataLoader, Dataset 4 | from tqdm import tqdm 5 | from utils import fix_seed 6 | import torch 7 | import pandas as pd 8 | 9 | path = 'E:\\ptm\\roberta' 10 | tokenizer = 
BertTokenizer.from_pretrained(path) 11 | 12 | 13 | class BaseDataset(Dataset): 14 | def __init__(self, encodings, labels=None): 15 | self.encodings = encodings 16 | self.labels = labels 17 | 18 | def __getitem__(self, idx): 19 | item = {key: val[idx].clone().detach() for key, val in self.encodings.items()} 20 | if self.labels is not None: 21 | item['labels'] = torch.tensor(self.labels[idx]) 22 | return item 23 | 24 | def __len__(self): 25 | return len(self.encodings['input_ids']) 26 | 27 | 28 | def load_data(batch_size=32): 29 | train_df = pd.read_csv('../data/tnews_public/train.csv') 30 | train_text = train_df['text'].tolist() 31 | train_label = train_df['label'].tolist() 32 | train_text = tokenizer(text=train_text, 33 | return_tensors='pt', 34 | truncation=True, 35 | padding=True, 36 | max_length=32) 37 | train_loader = DataLoader(BaseDataset(train_text, train_label), 38 | batch_size, 39 | pin_memory=True if torch.cuda.is_available() else False, 40 | shuffle=False) 41 | 42 | dev_df = pd.read_csv('../data/tnews_public/dev.csv') 43 | dev_text = dev_df['text'].tolist() 44 | dev_label = dev_df['label'].tolist() 45 | dev_text = tokenizer(text=dev_text, 46 | return_tensors='pt', 47 | truncation=True, 48 | padding=True, 49 | max_length=32) 50 | 51 | dev_loader = DataLoader(BaseDataset(dev_text, dev_label), 52 | batch_size, 53 | pin_memory=True if torch.cuda.is_available() else False, 54 | shuffle=False) 55 | 56 | return train_loader, dev_loader 57 | 58 | 59 | # 训练模型 60 | def train(): 61 | fix_seed() 62 | 63 | train_data_loader, dev_data_loader = load_data(32) 64 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 65 | 66 | model = BertForSequenceClassification.from_pretrained(path, num_labels=4) 67 | model = model.to(device) 68 | optimizer = torch.optim.Adam(model.parameters(), lr=5e-5) 69 | 70 | best_acc = 0 71 | for epoch in range(5): 72 | print('epoch:', epoch + 1) 73 | pred = [] 74 | label = [] 75 | pbar = tqdm(train_data_loader) 76 | for data in pbar: 77 | optimizer.zero_grad() 78 | 79 | input_ids = data['input_ids'].to(device) 80 | attention_mask = data['attention_mask'].to(device) 81 | labels = data['labels'].to(device).long() 82 | 83 | outputs = model(input_ids, attention_mask=attention_mask, labels=labels) 84 | output = outputs.logits.argmax(1).cpu().numpy() 85 | pred.extend(output) 86 | label.extend(labels.cpu().numpy()) 87 | loss = outputs.loss 88 | loss.backward() 89 | 90 | optimizer.step() 91 | 92 | pbar.update() 93 | pbar.set_description(f'loss:{loss.item():.4f}') 94 | 95 | acc = accuracy_score(pred, label) 96 | print('train acc:', acc) 97 | 98 | pred = [] 99 | label = [] 100 | for data in tqdm(dev_data_loader): 101 | input_ids = data['input_ids'].to(device) 102 | attention_mask = data['attention_mask'].to(device) 103 | labels = data['labels'].to(device).long() 104 | with torch.no_grad(): 105 | outputs = model(input_ids, attention_mask=attention_mask, labels=labels) 106 | output = outputs.logits.argmax(1).cpu().numpy() 107 | pred.extend(output) 108 | label.extend(labels.cpu().numpy()) 109 | acc = accuracy_score(pred, label) 110 | print('dev acc:', acc) 111 | print() 112 | if acc > best_acc: 113 | torch.save(model.state_dict(), 'teacher.bin') 114 | best_acc = acc 115 | 116 | 117 | if __name__ == '__main__': 118 | train() 119 | -------------------------------------------------------------------------------- /elmoformanylangs/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 
from .elmo import Embedder 3 | 4 | 5 | import logging 6 | logger = logging.getLogger('elmoformanylangs') 7 | 8 | # if the client application hasn't set the log level, we set it 9 | # ourselves to INFO 10 | if logger.level == 0: 11 | logger.setLevel(logging.INFO) 12 | 13 | log_handler = logging.StreamHandler() 14 | log_formatter = logging.Formatter(fmt="%(asctime)-15s %(levelname)s: %(message)s") 15 | log_handler.setFormatter(log_formatter) 16 | 17 | # also, if the client hasn't added any handlers for this logger 18 | # (or a default handler), we add a handler of our own 19 | # 20 | # client can later do 21 | # logger.removeHandler(stanza.log_handler) 22 | if not logger.hasHandlers(): 23 | logger.addHandler(log_handler) 24 | -------------------------------------------------------------------------------- /elmoformanylangs/__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | import os 5 | import sys 6 | import codecs 7 | import argparse 8 | import logging 9 | import json 10 | import torch 11 | from .modules.embedding_layer import EmbeddingLayer 12 | from .utils import dict2namedtuple 13 | from .frontend import Model 14 | from .frontend import create_batches 15 | import numpy as np 16 | import h5py 17 | 18 | logger = logging.getLogger('elmoformanylangs') 19 | 20 | 21 | def read_corpus(path, max_chars=None): 22 | """ 23 | read raw text file. The format of the input is like, one sentence per line 24 | words are separated by '\t' 25 | 26 | :param path: 27 | :param max_chars: int, the number of maximum characters in a word, this 28 | parameter is used when the ptm is configured with CNN word encoder. 29 | :return: 30 | """ 31 | dataset = [] 32 | textset = [] 33 | with codecs.open(path, 'r', encoding='utf-8') as fin: 34 | for line in fin.read().strip().split('\n'): 35 | data = [''] 36 | text = [] 37 | for token in line.split('\t'): 38 | text.append(token) 39 | if max_chars is not None and len(token) + 2 > max_chars: 40 | token = token[:max_chars - 2] 41 | data.append(token) 42 | data.append('') 43 | dataset.append(data) 44 | textset.append(text) 45 | return dataset, textset 46 | 47 | 48 | def read_conll_corpus(path, max_chars=None): 49 | """ 50 | read text in CoNLL-U format. 51 | 52 | :param path: 53 | :param max_chars: 54 | :return: 55 | """ 56 | dataset = [] 57 | textset = [] 58 | with codecs.open(path, 'r', encoding='utf-8') as fin: 59 | for payload in fin.read().strip().split('\n\n'): 60 | data = [''] 61 | text = [] 62 | lines = payload.splitlines() 63 | body = [line for line in lines if not line.startswith('#')] 64 | for line in body: 65 | fields = line.split('\t') 66 | num, token = fields[0], fields[1] 67 | if '-' in num or '.' 
in num: 68 | continue 69 | text.append(token) 70 | if max_chars is not None and len(token) + 2 > max_chars: 71 | token = token[:max_chars - 2] 72 | data.append(token) 73 | data.append('') 74 | dataset.append(data) 75 | textset.append(text) 76 | return dataset, textset 77 | 78 | 79 | def read_conll_char_corpus(path, max_chars=None): 80 | """ 81 | 82 | :param path: 83 | :param max_chars: 84 | :return: 85 | """ 86 | dataset = [] 87 | textset = [] 88 | with codecs.open(path, 'r', encoding='utf-8') as fin: 89 | for payload in fin.read().strip().split('\n\n'): 90 | data = [''] 91 | text = [] 92 | lines = payload.splitlines() 93 | body = [line for line in lines if not line.startswith('#')] 94 | for line in body: 95 | fields = line.split('\t') 96 | num, token = fields[0], fields[1] 97 | if '-' in num or '.' in num: 98 | continue 99 | for ch in token: 100 | text.append(ch) 101 | if max_chars is not None and len(ch) + 2 > max_chars: 102 | ch = ch[:max_chars - 2] 103 | data.append(ch) 104 | data.append('') 105 | dataset.append(data) 106 | textset.append(text) 107 | return dataset, textset 108 | 109 | 110 | def read_conll_char_vi_corpus(path, max_chars=None): 111 | """ 112 | 113 | :param path: 114 | :param max_chars: 115 | :return: 116 | """ 117 | dataset = [] 118 | textset = [] 119 | with codecs.open(path, 'r', encoding='utf-8') as fin: 120 | for payload in fin.read().strip().split('\n\n'): 121 | data = [''] 122 | text = [] 123 | lines = payload.splitlines() 124 | body = [line for line in lines if not line.startswith('#')] 125 | for line in body: 126 | fields = line.split('\t') 127 | num, token = fields[0], fields[1] 128 | if '-' in num or '.' in num: 129 | continue 130 | for ch in token.split(): 131 | text.append(ch) 132 | if max_chars is not None and len(ch) + 2 > max_chars: 133 | ch = ch[:max_chars - 2] 134 | data.append(ch) 135 | data.append('') 136 | dataset.append(data) 137 | textset.append(text) 138 | return dataset, textset 139 | 140 | 141 | def test_main(): 142 | # Configurations 143 | cmd = argparse.ArgumentParser('The testing components of') 144 | cmd.add_argument('--gpu', default=-1, type=int, help='use id of gpu, -1 if cpu.') 145 | cmd.add_argument('--input_format', default='plain', choices=('plain', 'conll', 'conll_char', 'conll_char_vi'), 146 | help='the input format.') 147 | cmd.add_argument("--input", help="the path to the raw text file.") 148 | cmd.add_argument("--output_format", default='hdf5', help='the output format. Supported format includes (hdf5, txt).' 149 | ' Use comma to separate the format identifiers,' 150 | ' like \'--output_format=hdf5,plain\'') 151 | cmd.add_argument("--output_prefix", help='the prefix of the output file. The output file is in the format of ' 152 | '..') 153 | cmd.add_argument("--output_layer", help='the target layer to output. 
0 for the word encoder, 1 for the first LSTM ' 154 | 'hidden layer, 2 for the second LSTM hidden layer, -1 for an average' 155 | 'of 3 layers.') 156 | cmd.add_argument("--ptm", required=True, help="the path to the ptm.") 157 | cmd.add_argument("--batch_size", "--batch", type=int, default=1, help='the batch size.') 158 | args = cmd.parse_args(sys.argv[2:]) 159 | 160 | if args.gpu >= 0: 161 | torch.cuda.set_device(args.gpu) 162 | use_cuda = args.gpu >= 0 and torch.cuda.is_available() 163 | # load the ptm configurations 164 | args2 = dict2namedtuple(json.load(codecs.open(os.path.join(args.model, 'config.json'), 'r', encoding='utf-8'))) 165 | 166 | with open(os.path.join(args.model, args2.config_path), 'r') as fin: 167 | config = json.load(fin) 168 | 169 | # For the ptm trained with character-based word encoder. 170 | if config['token_embedder']['char_dim'] > 0: 171 | char_lexicon = {} 172 | with codecs.open(os.path.join(args.model, 'char.dic'), 'r', encoding='utf-8') as fpi: 173 | for line in fpi: 174 | tokens = line.strip().split('\t') 175 | if len(tokens) == 1: 176 | tokens.insert(0, '\u3000') 177 | token, i = tokens 178 | char_lexicon[token] = int(i) 179 | char_emb_layer = EmbeddingLayer(config['token_embedder']['char_dim'], char_lexicon, fix_emb=False, embs=None) 180 | logger.info('char embedding size: ' + str(len(char_emb_layer.word2id))) 181 | else: 182 | char_lexicon = None 183 | char_emb_layer = None 184 | 185 | # For the ptm trained with word form word encoder. 186 | if config['token_embedder']['word_dim'] > 0: 187 | word_lexicon = {} 188 | with codecs.open(os.path.join(args.model, 'word.dic'), 'r', encoding='utf-8') as fpi: 189 | for line in fpi: 190 | tokens = line.strip().split('\t') 191 | if len(tokens) == 1: 192 | tokens.insert(0, '\u3000') 193 | token, i = tokens 194 | word_lexicon[token] = int(i) 195 | word_emb_layer = EmbeddingLayer(config['token_embedder']['word_dim'], word_lexicon, fix_emb=False, embs=None) 196 | logger.info('word embedding size: ' + str(len(word_emb_layer.word2id))) 197 | else: 198 | word_lexicon = None 199 | word_emb_layer = None 200 | 201 | # instantiate the ptm 202 | model = Model(config, word_emb_layer, char_emb_layer, use_cuda) 203 | 204 | if use_cuda: 205 | model.cuda() 206 | 207 | logger.info(str(model)) 208 | model.load_model(args.model) 209 | 210 | # read test data according to input format 211 | read_function = read_corpus if args.input_format == 'plain' else ( 212 | read_conll_corpus if args.input_format == 'conll' else ( 213 | read_conll_char_corpus if args.input_format == 'conll_char' else read_conll_char_vi_corpus)) 214 | 215 | if config['token_embedder']['name'].lower() == 'cnn': 216 | test, text = read_function(args.input, config['token_embedder']['max_characters_per_token']) 217 | else: 218 | test, text = read_function(args.input) 219 | 220 | # create test batches from the input data. 221 | test_w, test_c, test_lens, test_masks, test_text = create_batches( 222 | test, args.batch_size, word_lexicon, char_lexicon, config, text=text) 223 | 224 | # configure the ptm to evaluation mode. 
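    # Note: eval() only switches dropout and similar layers to inference behaviour;
    # it does not disable autograd. The embedding loop below never calls backward(),
    # so results are unaffected, but wrapping `model.forward(w, c, masks)` in a
    # `with torch.no_grad():` block would additionally skip building the computation
    # graph and reduce memory use during feature extraction.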
225 | model.eval() 226 | 227 | sent_set = set() 228 | cnt = 0 229 | 230 | output_formats = args.output_format.split(',') 231 | output_layers = map(int, args.output_layer.split(',')) 232 | 233 | handlers = {} 234 | for output_format in output_formats: 235 | if output_format not in ('hdf5', 'txt'): 236 | print('Unknown output_format: {0}'.format(output_format)) 237 | continue 238 | for output_layer in output_layers: 239 | filename = '{0}.ly{1}.{2}'.format(args.output_prefix, output_layer, output_format) 240 | handlers[output_format, output_layer] = \ 241 | h5py.File(filename, 'w') if output_format == 'hdf5' else open(filename, 'w') 242 | 243 | for w, c, lens, masks, texts in zip(test_w, test_c, test_lens, test_masks, test_text): 244 | output = model.forward(w, c, masks) 245 | for i, text in enumerate(texts): 246 | sent = '\t'.join(text) 247 | sent = sent.replace('.', '$period$') 248 | sent = sent.replace('/', '$backslash$') 249 | if sent in sent_set: 250 | continue 251 | sent_set.add(sent) 252 | if config['encoder']['name'].lower() == 'lstm': 253 | data = output[i, 1:lens[i]-1, :].data 254 | if use_cuda: 255 | data = data.cpu() 256 | data = data.numpy() 257 | elif config['encoder']['name'].lower() == 'elmo': 258 | data = output[:, i, 1:lens[i]-1, :].data 259 | if use_cuda: 260 | data = data.cpu() 261 | data = data.numpy() 262 | 263 | for (output_format, output_layer) in handlers: 264 | fout = handlers[output_format, output_layer] 265 | if output_layer == -1: 266 | payload = np.average(data, axis=0) 267 | else: 268 | payload = data[output_layer] 269 | if output_format == 'hdf5': 270 | fout.create_dataset(sent, payload.shape, dtype='float32', data=payload) 271 | else: 272 | for word, row in zip(text, payload): 273 | print('{0}\t{1}'.format(word, '\t'.join(['{0:.8f}'.format(elem) for elem in row])), file=fout) 274 | print('', file=fout) 275 | 276 | cnt += 1 277 | if cnt % 1000 == 0: 278 | logger.info('Finished {0} sentences.'.format(cnt)) 279 | for _, handler in handlers.items(): 280 | handler.close() 281 | 282 | 283 | if __name__ == "__main__": 284 | if len(sys.argv) > 1 and sys.argv[1] == 'test': 285 | test_main() 286 | else: 287 | print('Usage: {0} [test] [options]'.format(sys.argv[0]), file=sys.stderr) 288 | -------------------------------------------------------------------------------- /elmoformanylangs/configs/cnn_0_100_512_4096_sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "encoder": { 3 | "name": "elmo", 4 | "projection_dim": 512, 5 | "cell_clip": 3, 6 | "proj_clip": 3, 7 | "dim": 4096, 8 | "n_layers": 2 9 | }, 10 | 11 | "token_embedder": { 12 | "name": "cnn", 13 | "activation": "relu", 14 | "filters": [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]], 15 | "n_highway": 2, 16 | "word_dim": 100, 17 | "char_dim": 50, 18 | "max_characters_per_token": 50 19 | }, 20 | 21 | "classifier": { 22 | "name": "sampled_softmax", 23 | "n_samples": 8192 24 | }, 25 | "dropout": 0.1 26 | } 27 | -------------------------------------------------------------------------------- /elmoformanylangs/configs/cnn_50_100_512_4096_sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "encoder": { 3 | "name": "elmo", 4 | "projection_dim": 512, 5 | "cell_clip": 3, 6 | "proj_clip": 3, 7 | "dim": 4096, 8 | "n_layers": 2 9 | }, 10 | 11 | "token_embedder": { 12 | "name": "cnn", 13 | "activation": "relu", 14 | "filters": [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]], 15 | 
"n_highway": 2, 16 | "word_dim": 100, 17 | "char_dim": 50, 18 | "max_characters_per_token": 50 19 | }, 20 | 21 | "classifier": { 22 | "name": "sampled_softmax", 23 | "n_samples": 8192 24 | }, 25 | "dropout": 0.1 26 | } 27 | -------------------------------------------------------------------------------- /elmoformanylangs/dataloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import unicode_literals 3 | import codecs 4 | import numpy as np 5 | 6 | 7 | def pad(sequences, pad_token='', pad_left=False): 8 | """ 9 | input sequences is a list of text sequence [[str]] 10 | pad each text sequence to the length of the longest 11 | 12 | :param sequences: 13 | :param pad_token: 14 | :param pad_left: 15 | :return: 16 | """ 17 | # max_len = max(5,max(len(seq) for seq in sequences)) 18 | max_len = max(len(seq) for seq in sequences) 19 | if pad_left: 20 | return [[pad_token]*(max_len-len(seq)) + seq for seq in sequences] 21 | return [seq + [pad_token]*(max_len-len(seq)) for seq in sequences] 22 | 23 | 24 | def load_embedding_npz(path): 25 | data = np.load(path) 26 | return [str(w) for w in data['words']], data['vals'] 27 | 28 | 29 | def load_embedding_txt(path): 30 | words = [] 31 | vals = [] 32 | with codecs.open(path, 'r', encoding='utf-8') as fin: 33 | fin.readline() 34 | for line in fin: 35 | line = line.strip() 36 | if line: 37 | parts = line.split() 38 | words.append(parts[0]) 39 | vals += [float(x) for x in parts[1:]] # equal to append 40 | return words, np.asarray(vals).reshape(len(words), -1) # reshape 41 | 42 | 43 | def load_embedding(path): 44 | if path.endswith(".npz"): 45 | return load_embedding_npz(path) 46 | else: 47 | return load_embedding_txt(path) 48 | -------------------------------------------------------------------------------- /elmoformanylangs/elmo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | import os 5 | import codecs 6 | import random 7 | import logging 8 | import json 9 | import torch 10 | from .modules.embedding_layer import EmbeddingLayer 11 | from .utils import dict2namedtuple 12 | from .frontend import create_one_batch 13 | from .frontend import Model 14 | import numpy as np 15 | 16 | logger = logging.getLogger('elmoformanylangs') 17 | 18 | 19 | def read_list(sents, max_chars=None): 20 | """ 21 | read raw text file. The format of the input is like, one sentence per line 22 | words are separated by '\t' 23 | 24 | :param path: 25 | :param max_chars: int, the number of maximum characters in a word, this 26 | parameter is used when the ptm is configured with CNN word encoder. 
27 | :return: 28 | """ 29 | dataset = [] 30 | textset = [] 31 | for sent in sents: 32 | data = [''] 33 | text = [] 34 | for token in sent: 35 | text.append(token) 36 | if max_chars is not None and len(token) + 2 > max_chars: 37 | token = token[:max_chars - 2] 38 | data.append(token) 39 | data.append('') 40 | dataset.append(data) 41 | textset.append(text) 42 | return dataset, textset 43 | 44 | 45 | def recover(li, ind): 46 | # li[piv], ind = torch.sort(li[piv], dim=0, descending=(not unsort)) 47 | dummy = list(range(len(ind))) 48 | dummy.sort(key=lambda l: ind[l]) 49 | li = [li[i] for i in dummy] 50 | return li 51 | 52 | 53 | # shuffle training examples and create mini-batches 54 | def create_batches(x, batch_size, word2id, char2id, config, perm=None, shuffle=False, sort=True, text=None): 55 | ind = list(range(len(x))) 56 | lst = perm or list(range(len(x))) 57 | if shuffle: 58 | random.shuffle(lst) 59 | 60 | if sort: 61 | lst.sort(key=lambda l: -len(x[l])) 62 | 63 | x = [x[i] for i in lst] 64 | ind = [ind[i] for i in lst] 65 | if text is not None: 66 | text = [text[i] for i in lst] 67 | 68 | sum_len = 0.0 69 | batches_w, batches_c, batches_lens, batches_masks, batches_text, batches_ind = [], [], [], [], [], [] 70 | size = batch_size 71 | nbatch = (len(x) - 1) // size + 1 72 | for i in range(nbatch): 73 | start_id, end_id = i * size, (i + 1) * size 74 | bw, bc, blens, bmasks = create_one_batch(x[start_id: end_id], word2id, char2id, config, sort=sort) 75 | sum_len += sum(blens) 76 | batches_w.append(bw) 77 | batches_c.append(bc) 78 | batches_lens.append(blens) 79 | batches_masks.append(bmasks) 80 | batches_ind.append(ind[start_id: end_id]) 81 | if text is not None: 82 | batches_text.append(text[start_id: end_id]) 83 | 84 | if sort: 85 | perm = list(range(nbatch)) 86 | random.shuffle(perm) 87 | batches_w = [batches_w[i] for i in perm] 88 | batches_c = [batches_c[i] for i in perm] 89 | batches_lens = [batches_lens[i] for i in perm] 90 | batches_masks = [batches_masks[i] for i in perm] 91 | batches_ind = [batches_ind[i] for i in perm] 92 | if text is not None: 93 | batches_text = [batches_text[i] for i in perm] 94 | 95 | logger.info("{} batches, avg len: {:.1f}".format( 96 | nbatch, sum_len / len(x))) 97 | recover_ind = [item for sublist in batches_ind for item in sublist] 98 | if text is not None: 99 | return batches_w, batches_c, batches_lens, batches_masks, batches_text, recover_ind 100 | return batches_w, batches_c, batches_lens, batches_masks, recover_ind 101 | 102 | 103 | class Embedder(object): 104 | def __init__(self, model_dir, batch_size=64): 105 | self.model_dir = model_dir 106 | self.model, self.config = self.get_model() 107 | self.batch_size = batch_size 108 | 109 | def get_model(self): 110 | # torch.cuda.set_device(1) 111 | self.use_cuda = torch.cuda.is_available() 112 | # load the ptm configurations 113 | args2 = dict2namedtuple(json.load(codecs.open( 114 | os.path.join(self.model_dir, 'config.json'), 'r', encoding='utf-8'))) 115 | 116 | config_path = os.path.join(self.model_dir, args2.config_path) 117 | # Some of the available models may have the config in the 118 | # ptm dir, but the path given in the config directory was an 119 | # absolute path. 120 | if not os.path.exists(config_path): 121 | config_path = os.path.join(self.model_dir, 122 | os.path.split(config_path)[1]) 123 | logger.warning("Could not find config. 
Trying " + config_path) 124 | # In many cases, such as the publicly available English ptm, 125 | # the config is one of the default provided configs in 126 | # elmoformanylangs/configs 127 | if not os.path.exists(config_path): 128 | config_path = os.path.join(os.path.split(__file__)[0], "configs", 129 | os.path.split(config_path)[1]) 130 | logger.warning("Could not find config. Trying " + config_path) 131 | 132 | if not os.path.exists(config_path): 133 | raise FileNotFoundError("Could not find the ptm config in either the ptm directory " 134 | "or the default configs. Path in config file: %s" % args2.config_path) 135 | 136 | with open(config_path, 'r') as fin: 137 | config = json.load(fin) 138 | 139 | # For the ptm trained with character-based word encoder. 140 | if config['token_embedder']['char_dim'] > 0: 141 | self.char_lexicon = {} 142 | with codecs.open(os.path.join(self.model_dir, 'char.dic'), 'r', encoding='utf-8') as fpi: 143 | for line in fpi: 144 | tokens = line.strip().split('\t') 145 | if len(tokens) == 1: 146 | tokens.insert(0, '\u3000') 147 | token, i = tokens 148 | self.char_lexicon[token] = int(i) 149 | char_emb_layer = EmbeddingLayer( 150 | config['token_embedder']['char_dim'], self.char_lexicon, fix_emb=False, embs=None) 151 | logger.info('char embedding size: ' + 152 | str(len(char_emb_layer.word2id))) 153 | else: 154 | self.char_lexicon = None 155 | char_emb_layer = None 156 | 157 | # For the ptm trained with word form word encoder. 158 | if config['token_embedder']['word_dim'] > 0: 159 | self.word_lexicon = {} 160 | with codecs.open(os.path.join(self.model_dir, 'word.dic'), 'r', encoding='utf-8') as fpi: 161 | for line in fpi: 162 | tokens = line.strip().split('\t') 163 | if len(tokens) == 1: 164 | tokens.insert(0, '\u3000') 165 | token, i = tokens 166 | self.word_lexicon[token] = int(i) 167 | word_emb_layer = EmbeddingLayer( 168 | config['token_embedder']['word_dim'], self.word_lexicon, fix_emb=False, embs=None) 169 | logger.info('word embedding size: ' + 170 | str(len(word_emb_layer.word2id))) 171 | else: 172 | self.word_lexicon = None 173 | word_emb_layer = None 174 | 175 | # instantiate the ptm 176 | model = Model(config, word_emb_layer, char_emb_layer, self.use_cuda) 177 | 178 | if self.use_cuda: 179 | model.cuda() 180 | 181 | logger.info(str(model)) 182 | model.load_model(self.model_dir) 183 | 184 | # read test data according to input format 185 | 186 | # configure the ptm to evaluation mode. 187 | model.eval() 188 | return model, config 189 | 190 | def sents2elmo(self, sents, output_layer=-1): 191 | read_function = read_list 192 | 193 | if self.config['token_embedder']['name'].lower() == 'cnn': 194 | test, text = read_function(sents, self.config['token_embedder']['max_characters_per_token']) 195 | else: 196 | test, text = read_function(sents) 197 | 198 | # create test batches from the input data. 
199 | test_w, test_c, test_lens, test_masks, test_text, recover_ind = create_batches( 200 | test, self.batch_size, self.word_lexicon, self.char_lexicon, self.config, text=text) 201 | 202 | cnt = 0 203 | 204 | after_elmo = [] 205 | for w, c, lens, masks, texts in zip(test_w, test_c, test_lens, test_masks, test_text): 206 | output = self.model.forward(w, c, masks) 207 | for i, text in enumerate(texts): 208 | 209 | if self.config['encoder']['name'].lower() == 'lstm': 210 | data = output[i, 1:lens[i]-1, :].data 211 | if self.use_cuda: 212 | data = data.cpu() 213 | data = data.numpy() 214 | elif self.config['encoder']['name'].lower() == 'elmo': 215 | data = output[:, i, 1:lens[i]-1, :].data 216 | if self.use_cuda: 217 | data = data.cpu() 218 | data = data.numpy() 219 | 220 | if output_layer == -1: 221 | payload = np.average(data, axis=0) 222 | elif output_layer == -2: 223 | payload = data 224 | else: 225 | payload = data[output_layer] 226 | after_elmo.append(payload) 227 | 228 | cnt += 1 229 | if cnt % 1000 == 0: 230 | logger.info('Finished {0} sentences.'.format(cnt)) 231 | 232 | after_elmo = recover(after_elmo, recover_ind) 233 | return after_elmo 234 | -------------------------------------------------------------------------------- /elmoformanylangs/frontend.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import random 4 | import torch 5 | import torch.nn as nn 6 | import logging 7 | from torch.autograd import Variable 8 | from .modules.elmo import ElmobiLm 9 | from .modules.lstm import LstmbiLm 10 | from .modules.token_embedder import ConvTokenEmbedder, LstmTokenEmbedder 11 | 12 | logger = logging.getLogger('elmoformanylangs') 13 | 14 | def create_one_batch(x, word2id, char2id, config, oov='', pad='', sort=True): 15 | """ 16 | Create one batch of input. 17 | 18 | :param x: List[List[str]] 19 | :param word2id: Dict | None 20 | :param char2id: Dict | None 21 | :param config: Dict 22 | :param oov: str, the form of OOV token. 23 | :param pad: str, the form of padding token. 24 | :param sort: bool, specify whether sorting the sentences by their lengths. 
25 | :return: 26 | """ 27 | batch_size = len(x) 28 | # lst represents the order of sentences 29 | lst = list(range(batch_size)) 30 | if sort: 31 | lst.sort(key=lambda l: -len(x[l])) 32 | 33 | # shuffle the sentences by 34 | x = [x[i] for i in lst] 35 | lens = [len(x[i]) for i in lst] 36 | max_len = max(lens) 37 | 38 | # get a batch of word id whose size is (batch x max_len) 39 | if word2id is not None: 40 | oov_id, pad_id = word2id.get(oov, None), word2id.get(pad, None) 41 | assert oov_id is not None and pad_id is not None 42 | batch_w = torch.LongTensor(batch_size, max_len).fill_(pad_id) 43 | for i, x_i in enumerate(x): 44 | for j, x_ij in enumerate(x_i): 45 | batch_w[i][j] = word2id.get(x_ij, oov_id) 46 | else: 47 | batch_w = None 48 | 49 | # get a batch of character id whose size is (batch x max_len x max_chars) 50 | if char2id is not None: 51 | bow_id, eow_id, oov_id, pad_id = [char2id.get(key, None) for key in ('', '', oov, pad)] 52 | 53 | assert bow_id is not None and eow_id is not None and oov_id is not None and pad_id is not None 54 | 55 | if config['token_embedder']['name'].lower() == 'cnn': 56 | max_chars = config['token_embedder']['max_characters_per_token'] 57 | assert max([len(w) for i in lst for w in x[i]]) + 2 <= max_chars 58 | elif config['token_embedder']['name'].lower() == 'lstm': 59 | # counting the and 60 | max_chars = max([len(w) for i in lst for w in x[i]]) + 2 61 | else: 62 | raise ValueError('Unknown token_embedder: {0}'.format(config['token_embedder']['name'])) 63 | 64 | batch_c = torch.LongTensor(batch_size, max_len, max_chars).fill_(pad_id) 65 | 66 | for i, x_i in enumerate(x): 67 | for j, x_ij in enumerate(x_i): 68 | batch_c[i][j][0] = bow_id 69 | if x_ij == '' or x_ij == '': 70 | batch_c[i][j][1] = char2id.get(x_ij) 71 | batch_c[i][j][2] = eow_id 72 | else: 73 | for k, c in enumerate(x_ij): 74 | batch_c[i][j][k + 1] = char2id.get(c, oov_id) 75 | batch_c[i][j][len(x_ij) + 1] = eow_id 76 | else: 77 | batch_c = None 78 | 79 | # mask[0] is the matrix (batch x max_len) indicating whether 80 | # there is an id is valid (not a padding) in this batch. 
81 | # mask[1] stores the flattened ids indicating whether there is a valid 82 | # previous token 83 | # mask[2] stores the flattened ids indicating whether there is a valid 84 | # next token 85 | masks = [torch.LongTensor(batch_size, max_len).fill_(0), [], []] 86 | 87 | for i, x_i in enumerate(x): 88 | for j in range(len(x_i)): 89 | masks[0][i][j] = 1 90 | if j + 1 < len(x_i): 91 | masks[1].append(i * max_len + j) 92 | if j > 0: 93 | masks[2].append(i * max_len + j) 94 | 95 | assert len(masks[1]) <= batch_size * max_len 96 | assert len(masks[2]) <= batch_size * max_len 97 | 98 | masks[1] = torch.LongTensor(masks[1]) 99 | masks[2] = torch.LongTensor(masks[2]) 100 | 101 | return batch_w, batch_c, lens, masks 102 | 103 | 104 | # shuffle training examples and create mini-batches 105 | def create_batches(x, batch_size, word2id, char2id, config, perm=None, shuffle=True, sort=True, text=None): 106 | """ 107 | 108 | :param x: List[List[str]] 109 | :param batch_size: 110 | :param word2id: 111 | :param char2id: 112 | :param config: 113 | :param perm: 114 | :param shuffle: 115 | :param sort: 116 | :param text: 117 | :return: 118 | """ 119 | lst = perm or list(range(len(x))) 120 | if shuffle: 121 | random.shuffle(lst) 122 | 123 | if sort: 124 | lst.sort(key=lambda l: -len(x[l])) 125 | 126 | x = [x[i] for i in lst] 127 | if text is not None: 128 | text = [text[i] for i in lst] 129 | 130 | sum_len = 0.0 131 | batches_w, batches_c, batches_lens, batches_masks, batches_text = [], [], [], [], [] 132 | size = batch_size 133 | nbatch = (len(x) - 1) // size + 1 134 | for i in range(nbatch): 135 | start_id, end_id = i * size, (i + 1) * size 136 | bw, bc, blens, bmasks = create_one_batch(x[start_id: end_id], word2id, char2id, config, sort=sort) 137 | sum_len += sum(blens) 138 | batches_w.append(bw) 139 | batches_c.append(bc) 140 | batches_lens.append(blens) 141 | batches_masks.append(bmasks) 142 | if text is not None: 143 | batches_text.append(text[start_id: end_id]) 144 | 145 | if sort: 146 | perm = list(range(nbatch)) 147 | random.shuffle(perm) 148 | batches_w = [batches_w[i] for i in perm] 149 | batches_c = [batches_c[i] for i in perm] 150 | batches_lens = [batches_lens[i] for i in perm] 151 | batches_masks = [batches_masks[i] for i in perm] 152 | if text is not None: 153 | batches_text = [batches_text[i] for i in perm] 154 | 155 | logger.info("{} batches, avg len: {:.1f}".format(nbatch, sum_len / len(x))) 156 | if text is not None: 157 | return batches_w, batches_c, batches_lens, batches_masks, batches_text 158 | return batches_w, batches_c, batches_lens, batches_masks 159 | 160 | 161 | class Model(nn.Module): 162 | def __init__(self, config, word_emb_layer, char_emb_layer, use_cuda=False): 163 | super(Model, self).__init__() 164 | self.use_cuda = use_cuda 165 | self.config = config 166 | 167 | if config['token_embedder']['name'].lower() == 'cnn': 168 | self.token_embedder = ConvTokenEmbedder( 169 | config, word_emb_layer, char_emb_layer, use_cuda) 170 | elif config['token_embedder']['name'].lower() == 'lstm': 171 | self.token_embedder = LstmTokenEmbedder( 172 | config, word_emb_layer, char_emb_layer, use_cuda) 173 | 174 | if config['encoder']['name'].lower() == 'elmo': 175 | self.encoder = ElmobiLm(config, use_cuda) 176 | elif config['encoder']['name'].lower() == 'lstm': 177 | self.encoder = LstmbiLm(config, use_cuda) 178 | 179 | self.output_dim = config['encoder']['projection_dim'] 180 | 181 | def forward(self, word_inp, chars_package, mask_package): 182 | """ 183 | 184 | :param word_inp: 185 | :param 
chars_package: 186 | :param mask_package: 187 | :return: 188 | """ 189 | token_embedding = self.token_embedder(word_inp, chars_package, (mask_package[0].size(0), mask_package[0].size(1))) 190 | if self.config['encoder']['name'] == 'elmo': 191 | mask = Variable(mask_package[0]).cuda() if self.use_cuda else Variable(mask_package[0]) 192 | encoder_output = self.encoder(token_embedding, mask) 193 | sz = encoder_output.size() 194 | token_embedding = torch.cat( 195 | [token_embedding, token_embedding], dim=2).view(1, sz[1], sz[2], sz[3]) 196 | encoder_output = torch.cat( 197 | [token_embedding, encoder_output], dim=0) 198 | elif self.config['encoder']['name'] == 'lstm': 199 | encoder_output = self.encoder(token_embedding) 200 | else: 201 | raise ValueError('Unknown encoder: {0}'.format(self.config['encoder']['name'])) 202 | 203 | return encoder_output 204 | 205 | def load_model(self, path): 206 | self.token_embedder.load_state_dict(torch.load(os.path.join(path, 'token_embedder.pkl'), 207 | map_location=lambda storage, loc: storage)) 208 | self.encoder.load_state_dict(torch.load(os.path.join(path, 'encoder.pkl'), 209 | map_location=lambda storage, loc: storage)) 210 | -------------------------------------------------------------------------------- /elmoformanylangs/main.py: -------------------------------------------------------------------------------- 1 | from elmoformanylangs import Embedder 2 | import jieba 3 | 4 | sentence = '我爱自然语言处理' 5 | 6 | segment = list(jieba.cut(sentence)) 7 | print(segment) 8 | model = Embedder('../ptm/elmo') 9 | vec = model.sents2elmo([segment]) 10 | print(vec) 11 | -------------------------------------------------------------------------------- /elmoformanylangs/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/terrifyzhao/nlp_tutorial/fa5cfdf732972469bfce2c452c07bec2077ba407/elmoformanylangs/modules/__init__.py -------------------------------------------------------------------------------- /elmoformanylangs/modules/classify_layer.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | 7 | 8 | class SoftmaxLayer(nn.Module): 9 | """ Naive softmax-layer """ 10 | def __init__(self, output_dim, n_class): 11 | """ 12 | 13 | :param output_dim: int 14 | :param n_class: int 15 | """ 16 | super(SoftmaxLayer, self).__init__() 17 | self.hidden2tag = nn.Linear(output_dim, n_class) 18 | self.criterion = nn.CrossEntropyLoss(size_average=False) 19 | 20 | def forward(self, x, y): 21 | """ 22 | 23 | :param x: torch.Tensor 24 | :param y: torch.Tensor 25 | :return: 26 | """ 27 | tag_scores = self.hidden2tag(x) 28 | return self.criterion(tag_scores, y) 29 | 30 | 31 | class SampledSoftmaxLayer(nn.Module): 32 | """ 33 | 34 | """ 35 | def __init__(self, output_dim, n_class, n_samples, use_cuda): 36 | """ 37 | 38 | :param output_dim: 39 | :param n_class: 40 | :param n_samples: 41 | :param use_cuda: 42 | """ 43 | super(SampledSoftmaxLayer, self).__init__() 44 | self.n_samples = n_samples 45 | self.n_class = n_class 46 | self.use_cuda = use_cuda 47 | self.criterion = nn.CrossEntropyLoss(size_average=False) 48 | self.negative_samples = [] 49 | self.word_to_column = {0: 0} 50 | 51 | self.all_word = [] 52 | self.all_word_to_column = {0: 0} 53 | 54 | self.column_emb = nn.Embedding(n_class, output_dim) 55 | 
self.column_emb.weight.data.uniform_(-0.25, 0.25) 56 | 57 | self.column_bias = nn.Embedding(n_class, 1) 58 | self.column_bias.weight.data.uniform_(-0.25, 0.25) 59 | 60 | self.oov_column = nn.Parameter(torch.Tensor(output_dim, 1)) 61 | self.oov_column.data.uniform_(-0.25, 0.25) 62 | 63 | def forward(self, x, y): 64 | if self.training: 65 | for i in range(y.size(0)): 66 | y[i] = self.word_to_column.get(y[i].tolist()) 67 | samples = torch.LongTensor(len(self.word_to_column)).fill_(0) 68 | for word in self.negative_samples: 69 | samples[self.word_to_column[word]] = word 70 | else: 71 | for i in range(y.size(0)): 72 | y[i] = self.all_word_to_column.get(y[i].tolist(), 0) 73 | samples = torch.LongTensor(len(self.all_word_to_column)).fill_(0) 74 | for word in self.all_word: 75 | samples[self.all_word_to_column[word]] = word 76 | 77 | if self.use_cuda: 78 | samples = samples.cuda() 79 | 80 | tag_scores = (x.matmul(self.embedding_matrix)).view(y.size(0), -1) + \ 81 | (self.column_bias.forward(samples)).view(1, -1) 82 | return self.criterion(tag_scores, y) 83 | 84 | def update_embedding_matrix(self): 85 | word_inp, chars_inp = [], [] 86 | if self.training: 87 | columns = torch.LongTensor(len(self.negative_samples) + 1) 88 | samples = self.negative_samples 89 | for i, word in enumerate(samples): 90 | columns[self.word_to_column[word]] = word 91 | columns[0] = 0 92 | else: 93 | columns = torch.LongTensor(len(self.all_word) + 1) 94 | samples = self.all_word 95 | for i, word in enumerate(samples): 96 | columns[self.all_word_to_column[word]] = word 97 | columns[0] = 0 98 | 99 | if self.use_cuda: 100 | columns = columns.cuda() 101 | self.embedding_matrix = self.column_emb.forward(columns).transpose(0, 1) 102 | 103 | def update_negative_samples(self, word_inp, chars_inp, mask): 104 | batch_size, seq_len = word_inp.size(0), word_inp.size(1) 105 | in_batch = set() 106 | for i in range(batch_size): 107 | for j in range(seq_len): 108 | if mask[i][j] == 0: 109 | continue 110 | word = word_inp[i][j].tolist() 111 | in_batch.add(word) 112 | for i in range(batch_size): 113 | for j in range(seq_len): 114 | if mask[i][j] == 0: 115 | continue 116 | word = word_inp[i][j].tolist() 117 | if word not in self.all_word_to_column: 118 | self.all_word.append(word) 119 | self.all_word_to_column[word] = len(self.all_word_to_column) 120 | 121 | if word not in self.word_to_column: 122 | if len(self.negative_samples) < self.n_samples: 123 | self.negative_samples.append(word) 124 | self.word_to_column[word] = len(self.word_to_column) 125 | else: 126 | while self.negative_samples[0] in in_batch: 127 | self.negative_samples = self.negative_samples[1:] + [self.negative_samples[0]] 128 | self.word_to_column[word] = self.word_to_column.pop(self.negative_samples[0]) 129 | self.negative_samples = self.negative_samples[1:] + [word] 130 | 131 | 132 | class CNNSoftmaxLayer(nn.Module): 133 | def __init__(self, token_embedder, output_dim, n_class, n_samples, corr_dim, use_cuda): 134 | super(CNNSoftmaxLayer, self).__init__() 135 | self.token_embedder = token_embedder 136 | self.n_samples = n_samples 137 | self.use_cuda = use_cuda 138 | self.criterion = nn.CrossEntropyLoss(size_average=False) 139 | self.negative_samples = [] 140 | self.word_to_column = {0: 0} 141 | 142 | self.all_word = [] 143 | self.all_word_to_column = {0: 0} 144 | 145 | self.M = nn.Parameter(torch.Tensor(output_dim, corr_dim)) 146 | stdv = 1. 
/ math.sqrt(self.M.size(1)) 147 | self.M.data.uniform_(-stdv, stdv) 148 | 149 | self.corr = nn.Embedding(n_class, corr_dim) 150 | self.corr.weight.data.uniform_(-0.25, 0.25) 151 | 152 | self.oov_column = nn.Parameter(torch.Tensor(output_dim, 1)) 153 | self.oov_column.data.uniform_(-0.25, 0.25) 154 | 155 | def forward(self, x, y): 156 | if self.training: 157 | for i in range(y.size(0)): 158 | y[i] = self.word_to_column.get(y[i].tolist()) 159 | samples = torch.LongTensor(len(self.word_to_column)).fill_(0) 160 | for package in self.negative_samples: 161 | samples[self.word_to_column[package[0]]] = package[0] 162 | else: 163 | for i in range(y.size(0)): 164 | y[i] = self.all_word_to_column.get(y[i].tolist(), 0) 165 | samples = torch.LongTensor(len(self.all_word_to_column)).fill_(0) 166 | for package in self.all_word: 167 | samples[self.all_word_to_column[package[0]]] = package[0] 168 | 169 | if self.use_cuda: 170 | samples = samples.cuda() 171 | 172 | tag_scores = (x.matmul(self.embedding_matrix)).view(y.size(0), -1) + \ 173 | (x.matmul(self.M).matmul(self.corr.forward(samples).transpose(0, 1))).view(y.size(0), -1) 174 | return self.criterion(tag_scores, y) 175 | 176 | def update_embedding_matrix(self): 177 | batch_size = 2048 178 | word_inp, chars_inp = [], [] 179 | if self.training: 180 | sub_matrices = [self.oov_column] 181 | samples = self.negative_samples 182 | id2pack = {} 183 | for i, package in enumerate(samples): 184 | id2pack[self.word_to_column[package[0]]] = i 185 | else: 186 | sub_matrices = [self.oov_column] 187 | samples = self.all_word 188 | id2pack = {} 189 | for i, package in enumerate(samples): 190 | id2pack[self.all_word_to_column[package[0]]] = i 191 | 192 | for i in range(len(samples)): 193 | # [n_samples, 1], [n_samples, 1, x], [n_samples, 1] 194 | word_inp.append(samples[id2pack[i + 1]][0]) 195 | chars_inp.append(samples[id2pack[i + 1]][1]) 196 | if len(word_inp) == batch_size or i == len(samples) - 1: 197 | sub_matrices.append(self.token_embedder.forward(torch.LongTensor(word_inp).view(len(word_inp), 1), 198 | None if chars_inp[0] is None else torch.LongTensor(chars_inp).view(len(word_inp), 1, len(package[1])), 199 | (len(word_inp), 1)).squeeze(1).transpose(0, 1)) 200 | if not self.training: 201 | sub_matrices[-1] = sub_matrices[-1].detach() 202 | word_inp, chars_inp = [], [] 203 | 204 | sum = 0 205 | for mat in sub_matrices: 206 | sum += mat.size(1) 207 | #print(sum, len(self.word_to_column)) 208 | self.embedding_matrix = torch.cat(sub_matrices, dim=1) 209 | 210 | def update_negative_samples(self, word_inp, chars_inp, mask): 211 | batch_size, seq_len = word_inp.size(0), word_inp.size(1) 212 | in_batch = set() 213 | for i in range(batch_size): 214 | for j in range(seq_len): 215 | if mask[i][j] == 0: 216 | continue 217 | word = word_inp[i][j].tolist() 218 | in_batch.add(word) 219 | for i in range(batch_size): 220 | for j in range(seq_len): 221 | if mask[i][j] == 0: 222 | continue 223 | package = (word_inp[i][j].tolist(), None if chars_inp is None else chars_inp[i][j].tolist()) 224 | if package[0] not in self.all_word_to_column: 225 | self.all_word.append(package) 226 | self.all_word_to_column[package[0]] = len(self.all_word_to_column) 227 | 228 | if package[0] not in self.word_to_column: 229 | if len(self.negative_samples) < self.n_samples: 230 | self.negative_samples.append(package) 231 | self.word_to_column[package[0]] = len(self.word_to_column) 232 | else: 233 | while self.negative_samples[0][0] in in_batch: 234 | self.negative_samples = self.negative_samples[1:] + 
[self.negative_samples[0]] 235 | self.word_to_column[package[0]] = self.word_to_column.pop(self.negative_samples[0][0]) 236 | self.negative_samples = self.negative_samples[1:] + [package] 237 | -------------------------------------------------------------------------------- /elmoformanylangs/modules/elmo.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple, List, Callable, Union 2 | 3 | import h5py 4 | import numpy 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.nn.utils.rnn import PackedSequence, pad_packed_sequence, pack_padded_sequence 9 | from torch.autograd import Variable 10 | 11 | from .encoder_base import _EncoderBase 12 | from .lstm_cell_with_projection import LstmCellWithProjection 13 | 14 | RnnState = Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]] # pylint: disable=invalid-name 15 | RnnStateStorage = Tuple[torch.Tensor, ...] # pylint: disable=invalid-name 16 | 17 | 18 | class ElmobiLm(_EncoderBase): 19 | def __init__(self, config, use_cuda=False): 20 | super(ElmobiLm, self).__init__(stateful=True) 21 | self.config = config 22 | self.use_cuda = use_cuda 23 | input_size = config['encoder']['projection_dim'] 24 | hidden_size = config['encoder']['projection_dim'] 25 | cell_size = config['encoder']['dim'] 26 | num_layers = config['encoder']['n_layers'] 27 | memory_cell_clip_value = config['encoder']['cell_clip'] 28 | state_projection_clip_value = config['encoder']['proj_clip'] 29 | recurrent_dropout_probability = config['dropout'] 30 | 31 | self.input_size = input_size 32 | self.hidden_size = hidden_size 33 | self.num_layers = num_layers 34 | self.cell_size = cell_size 35 | 36 | forward_layers = [] 37 | backward_layers = [] 38 | 39 | lstm_input_size = input_size 40 | go_forward = True 41 | for layer_index in range(num_layers): 42 | forward_layer = LstmCellWithProjection(lstm_input_size, 43 | hidden_size, 44 | cell_size, 45 | go_forward, 46 | recurrent_dropout_probability, 47 | memory_cell_clip_value, 48 | state_projection_clip_value) 49 | backward_layer = LstmCellWithProjection(lstm_input_size, 50 | hidden_size, 51 | cell_size, 52 | not go_forward, 53 | recurrent_dropout_probability, 54 | memory_cell_clip_value, 55 | state_projection_clip_value) 56 | lstm_input_size = hidden_size 57 | 58 | self.add_module('forward_layer_{}'.format(layer_index), forward_layer) 59 | self.add_module('backward_layer_{}'.format(layer_index), backward_layer) 60 | forward_layers.append(forward_layer) 61 | backward_layers.append(backward_layer) 62 | self.forward_layers = forward_layers 63 | self.backward_layers = backward_layers 64 | 65 | def forward(self, inputs, mask): 66 | batch_size, total_sequence_length = mask.size() 67 | stacked_sequence_output, final_states, restoration_indices = \ 68 | self.sort_and_run_forward(self._lstm_forward, inputs, mask) 69 | 70 | num_layers, num_valid, returned_timesteps, encoder_dim = stacked_sequence_output.size() 71 | # Add back invalid rows which were removed in the call to sort_and_run_forward. 72 | if num_valid < batch_size: 73 | zeros = stacked_sequence_output.data.new(num_layers, 74 | batch_size - num_valid, 75 | returned_timesteps, 76 | encoder_dim).fill_(0) 77 | zeros = Variable(zeros) 78 | stacked_sequence_output = torch.cat([stacked_sequence_output, zeros], 1) 79 | 80 | # The states also need to have invalid rows added back. 
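        # ("Invalid" rows are sentences whose mask is entirely padding; sort_and_run_forward
        # drops them before running the stacked LSTM, so rows of zeros are appended here to
        # bring the per-layer states back to the full batch size before they are cached via
        # _update_states and the output is reordered with restoration_indices below.)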
81 | new_states = [] 82 | for state in final_states: 83 | state_dim = state.size(-1) 84 | zeros = state.data.new(num_layers, batch_size - num_valid, state_dim).fill_(0) 85 | zeros = Variable(zeros) 86 | new_states.append(torch.cat([state, zeros], 1)) 87 | final_states = new_states 88 | 89 | # It's possible to need to pass sequences which are padded to longer than the 90 | # max length of the sequence to a Seq2StackEncoder. However, packing and unpacking 91 | # the sequences mean that the returned tensor won't include these dimensions, because 92 | # the RNN did not need to process them. We add them back on in the form of zeros here. 93 | sequence_length_difference = total_sequence_length - returned_timesteps 94 | if sequence_length_difference > 0: 95 | zeros = stacked_sequence_output.data.new(num_layers, 96 | batch_size, 97 | sequence_length_difference, 98 | stacked_sequence_output[0].size(-1)).fill_(0) 99 | zeros = Variable(zeros) 100 | stacked_sequence_output = torch.cat([stacked_sequence_output, zeros], 2) 101 | 102 | self._update_states(final_states, restoration_indices) 103 | 104 | # Restore the original indices and return the sequence. 105 | # Has shape (num_layers, batch_size, sequence_length, hidden_size) 106 | return stacked_sequence_output.index_select(1, restoration_indices) 107 | 108 | 109 | def _lstm_forward(self, 110 | inputs: PackedSequence, 111 | initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None) -> \ 112 | Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: 113 | """ 114 | Parameters 115 | ---------- 116 | inputs : ``PackedSequence``, required. 117 | A batch first ``PackedSequence`` to run the stacked LSTM over. 118 | initial_state : ``Tuple[torch.Tensor, torch.Tensor]``, optional, (default = None) 119 | A tuple (state, memory) representing the initial hidden state and memory 120 | of the LSTM, with shape (num_layers, batch_size, 2 * hidden_size) and 121 | (num_layers, batch_size, 2 * cell_size) respectively. 122 | Returns 123 | ------- 124 | output_sequence : ``torch.FloatTensor`` 125 | The encoded sequence of shape (num_layers, batch_size, sequence_length, hidden_size) 126 | final_states: ``Tuple[torch.FloatTensor, torch.FloatTensor]`` 127 | The per-layer final (state, memory) states of the LSTM, with shape 128 | (num_layers, batch_size, 2 * hidden_size) and (num_layers, batch_size, 2 * cell_size) 129 | respectively. The last dimension is duplicated because it contains the state/memory 130 | for both the forward and backward layers. 
131 | """ 132 | 133 | if initial_state is None: 134 | hidden_states: List[Optional[Tuple[torch.Tensor, 135 | torch.Tensor]]] = [None] * len(self.forward_layers) 136 | elif initial_state[0].size()[0] != len(self.forward_layers): 137 | raise Exception("Initial states were passed to forward() but the number of " 138 | "initial states does not match the number of layers.") 139 | else: 140 | hidden_states = list(zip(initial_state[0].split(1, 0), initial_state[1].split(1, 0))) 141 | 142 | inputs, batch_lengths = pad_packed_sequence(inputs, batch_first=True) 143 | forward_output_sequence = inputs 144 | backward_output_sequence = inputs 145 | 146 | final_states = [] 147 | sequence_outputs = [] 148 | for layer_index, state in enumerate(hidden_states): 149 | forward_layer = getattr(self, 'forward_layer_{}'.format(layer_index)) 150 | backward_layer = getattr(self, 'backward_layer_{}'.format(layer_index)) 151 | 152 | forward_cache = forward_output_sequence 153 | backward_cache = backward_output_sequence 154 | 155 | if state is not None: 156 | forward_hidden_state, backward_hidden_state = state[0].split(self.hidden_size, 2) 157 | forward_memory_state, backward_memory_state = state[1].split(self.cell_size, 2) 158 | forward_state = (forward_hidden_state, forward_memory_state) 159 | backward_state = (backward_hidden_state, backward_memory_state) 160 | else: 161 | forward_state = None 162 | backward_state = None 163 | 164 | forward_output_sequence, forward_state = forward_layer(forward_output_sequence, 165 | batch_lengths, 166 | forward_state) 167 | backward_output_sequence, backward_state = backward_layer(backward_output_sequence, 168 | batch_lengths, 169 | backward_state) 170 | # Skip connections, just adding the input to the output. 171 | if layer_index != 0: 172 | forward_output_sequence += forward_cache 173 | backward_output_sequence += backward_cache 174 | 175 | sequence_outputs.append(torch.cat([forward_output_sequence, 176 | backward_output_sequence], -1)) 177 | # Append the state tuples in a list, so that we can return 178 | # the final states for all the layers. 179 | final_states.append((torch.cat([forward_state[0], backward_state[0]], -1), 180 | torch.cat([forward_state[1], backward_state[1]], -1))) 181 | 182 | stacked_sequence_outputs: torch.FloatTensor = torch.stack(sequence_outputs) 183 | # Stack the hidden state and memory for each layer into 2 tensors of shape 184 | # (num_layers, batch_size, hidden_size) and (num_layers, batch_size, cell_size) 185 | # respectively. 
186 | final_hidden_states, final_memory_states = zip(*final_states) 187 | final_state_tuple: Tuple[torch.FloatTensor, 188 | torch.FloatTensor] = (torch.cat(final_hidden_states, 0), 189 | torch.cat(final_memory_states, 0)) 190 | return stacked_sequence_outputs, final_state_tuple 191 | -------------------------------------------------------------------------------- /elmoformanylangs/modules/embedding_layer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import logging 6 | 7 | logger = logging.getLogger('elmoformanylangs') 8 | 9 | 10 | class EmbeddingLayer(nn.Module): 11 | def __init__(self, n_d, word2id, embs=None, fix_emb=True, oov='', pad='', normalize=True): 12 | super(EmbeddingLayer, self).__init__() 13 | if embs is not None: 14 | embwords, embvecs = embs 15 | # for word in embwords: 16 | # assert word not in word2id, "Duplicate words in pre-trained embeddings" 17 | # word2id[word] = len(word2id) 18 | 19 | logger.info("{} pre-trained word embeddings loaded.".format(len(word2id))) 20 | if n_d != len(embvecs[0]): 21 | logger.warning("[WARNING] n_d ({}) != word vector size ({}). Use {} for embeddings.".format( 22 | n_d, len(embvecs[0]), len(embvecs[0]))) 23 | n_d = len(embvecs[0]) 24 | 25 | self.word2id = word2id 26 | self.id2word = {i: word for word, i in word2id.items()} 27 | self.n_V, self.n_d = len(word2id), n_d 28 | self.oovid = word2id[oov] 29 | self.padid = word2id[pad] 30 | self.embedding = nn.Embedding(self.n_V, n_d, padding_idx=self.padid) 31 | self.embedding.weight.data.uniform_(-0.25, 0.25) 32 | 33 | if embs is not None: 34 | weight = self.embedding.weight 35 | weight.data[:len(embwords)].copy_(torch.from_numpy(embvecs)) 36 | logger.info("embedding shape: {}".format(weight.size())) 37 | 38 | if normalize: 39 | weight = self.embedding.weight 40 | norms = weight.data.norm(2, 1) 41 | if norms.dim() == 1: 42 | norms = norms.unsqueeze(1) 43 | weight.data.div_(norms.expand_as(weight.data)) 44 | 45 | if fix_emb: 46 | self.embedding.weight.requires_grad = False 47 | 48 | def forward(self, input_): 49 | return self.embedding(input_) 50 | -------------------------------------------------------------------------------- /elmoformanylangs/modules/highway.py: -------------------------------------------------------------------------------- 1 | """ 2 | A `Highway layer `_ that does a gated combination of a linear 3 | transformation and a non-linear transformation of its input. 4 | """ 5 | 6 | from typing import Callable 7 | 8 | import torch 9 | from overrides import overrides 10 | 11 | 12 | class Highway(torch.nn.Module): 13 | """ 14 | A `Highway layer `_ does a gated combination of a linear 15 | transformation and a non-linear transformation of its input. :math:`y = g * x + (1 - g) * 16 | f(A(x))`, where :math:`A` is a linear transformation, :math:`f` is an element-wise 17 | non-linearity, and :math:`g` is an element-wise gate, computed as :math:`sigmoid(B(x))`. 18 | This module will apply a fixed number of highway layers to its input, returning the final 19 | result. 20 | Parameters 21 | ---------- 22 | input_dim : ``int`` 23 | The dimensionality of :math:`x`. We assume the input has shape ``(batch_size, 24 | input_dim)``. 25 | num_layers : ``int``, optional (default=``1``) 26 | The number of highway layers to apply to the input. 
27 | activation : ``Callable[[torch.Tensor], torch.Tensor]``, optional (default=``torch.nn.functional.relu``) 28 | The non-linearity to use in the highway layers. 29 | """ 30 | def __init__(self, 31 | input_dim: int, 32 | num_layers: int = 1, 33 | activation: Callable[[torch.Tensor], torch.Tensor] = torch.nn.functional.relu) -> None: 34 | super(Highway, self).__init__() 35 | self._input_dim = input_dim 36 | self._layers = torch.nn.ModuleList([torch.nn.Linear(input_dim, input_dim * 2) 37 | for _ in range(num_layers)]) 38 | self._activation = activation 39 | for layer in self._layers: 40 | # We should bias the highway layer to just carry its input forward. We do that by 41 | # setting the bias on `B(x)` to be positive, because that means `g` will be biased to 42 | # be high, to we will carry the input forward. The bias on `B(x)` is the second half 43 | # of the bias vector in each Linear layer. 44 | layer.bias[input_dim:].data.fill_(1) 45 | 46 | @overrides 47 | def forward(self, inputs: torch.Tensor) -> torch.Tensor: # pylint: disable=arguments-differ 48 | current_input = inputs 49 | for layer in self._layers: 50 | projected_input = layer(current_input) 51 | linear_part = current_input 52 | # NOTE: if you modify this, think about whether you should modify the initialization 53 | # above, too. 54 | nonlinear_part = projected_input[:, (0 * self._input_dim):(1 * self._input_dim)] 55 | gate = projected_input[:, (1 * self._input_dim):(2 * self._input_dim)] 56 | nonlinear_part = self._activation(nonlinear_part) 57 | gate = torch.sigmoid(gate) 58 | current_input = gate * linear_part + (1 - gate) * nonlinear_part 59 | return current_input 60 | -------------------------------------------------------------------------------- /elmoformanylangs/modules/lstm.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import unicode_literals 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | import copy 8 | 9 | 10 | class LstmbiLm(nn.Module): 11 | def __init__(self, config, use_cuda=False): 12 | super(LstmbiLm, self).__init__() 13 | self.config = config 14 | self.use_cuda = use_cuda 15 | 16 | self.encoder = nn.LSTM(self.config['encoder']['projection_dim'], 17 | self.config['encoder']['dim'], 18 | num_layers=self.config['encoder']['n_layers'], 19 | bidirectional=True, 20 | batch_first=True, 21 | dropout=self.config['dropout']) 22 | self.projection = nn.Linear(self.config['encoder']['dim'], self.config['encoder']['projection_dim'], bias=True) 23 | 24 | def forward(self, inputs): 25 | forward, backward = self.encoder(inputs)[0].split(self.config['encoder']['dim'], 2) 26 | return torch.cat([self.projection(forward), self.projection(backward)], dim=2) 27 | -------------------------------------------------------------------------------- /elmoformanylangs/modules/token_embedder.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import unicode_literals 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | import copy 8 | from .highway import Highway 9 | 10 | 11 | class LstmTokenEmbedder(nn.Module): 12 | def __init__(self, config, word_emb_layer, char_emb_layer, use_cuda=False): 13 | super(LstmTokenEmbedder, self).__init__() 14 | self.config = config 15 | self.use_cuda = use_cuda 16 | 
self.word_emb_layer = word_emb_layer 17 | self.char_emb_layer = char_emb_layer 18 | self.output_dim = config['encoder']['projection_dim'] 19 | emb_dim = 0 20 | if word_emb_layer is not None: 21 | emb_dim += word_emb_layer.n_d 22 | 23 | if char_emb_layer is not None: 24 | emb_dim += char_emb_layer.n_d * 2 25 | self.char_lstm = nn.LSTM(char_emb_layer.n_d, char_emb_layer.n_d, num_layers=1, bidirectional=True, 26 | batch_first=True, dropout=config['dropout']) 27 | 28 | self.projection = nn.Linear(emb_dim, self.output_dim, bias=True) 29 | 30 | def forward(self, word_inp, chars_inp, shape): 31 | embs = [] 32 | batch_size, seq_len = shape 33 | if self.word_emb_layer is not None: 34 | word_emb = self.word_emb_layer(Variable(word_inp).cuda() if self.use_cuda else Variable(word_inp)) 35 | embs.append(word_emb) 36 | 37 | if self.char_emb_layer is not None: 38 | chars_inp = chars_inp.view(batch_size * seq_len, -1) 39 | chars_emb = self.char_emb_layer(Variable(chars_inp).cuda() if self.use_cuda else Variable(chars_inp)) 40 | _, (chars_outputs, __) = self.char_lstm(chars_emb) 41 | chars_outputs = chars_outputs.contiguous().view(-1, self.config['token_embedder']['char_dim'] * 2) 42 | embs.append(chars_outputs) 43 | 44 | token_embedding = torch.cat(embs, dim=2) 45 | 46 | return self.projection(token_embedding) 47 | 48 | 49 | class ConvTokenEmbedder(nn.Module): 50 | def __init__(self, config, word_emb_layer, char_emb_layer, use_cuda): 51 | super(ConvTokenEmbedder, self).__init__() 52 | self.config = config 53 | self.use_cuda = use_cuda 54 | 55 | self.word_emb_layer = word_emb_layer 56 | self.char_emb_layer = char_emb_layer 57 | 58 | self.output_dim = config['encoder']['projection_dim'] 59 | self.emb_dim = 0 60 | if word_emb_layer is not None: 61 | self.emb_dim += word_emb_layer.n_d 62 | 63 | if char_emb_layer is not None: 64 | self.convolutions = [] 65 | cnn_config = config['token_embedder'] 66 | filters = cnn_config['filters'] 67 | char_embed_dim = cnn_config['char_dim'] 68 | 69 | for i, (width, num) in enumerate(filters): 70 | conv = torch.nn.Conv1d( 71 | in_channels=char_embed_dim, 72 | out_channels=num, 73 | kernel_size=width, 74 | bias=True 75 | ) 76 | self.convolutions.append(conv) 77 | 78 | self.convolutions = nn.ModuleList(self.convolutions) 79 | 80 | self.n_filters = sum(f[1] for f in filters) 81 | self.n_highway = cnn_config['n_highway'] 82 | 83 | self.highways = Highway(self.n_filters, self.n_highway, activation=torch.nn.functional.relu) 84 | self.emb_dim += self.n_filters 85 | 86 | self.projection = nn.Linear(self.emb_dim, self.output_dim, bias=True) 87 | 88 | def forward(self, word_inp, chars_inp, shape): 89 | embs = [] 90 | batch_size, seq_len = shape 91 | if self.word_emb_layer is not None: 92 | batch_size, seq_len = word_inp.size(0), word_inp.size(1) 93 | word_emb = self.word_emb_layer(Variable(word_inp).cuda() if self.use_cuda else Variable(word_inp)) 94 | embs.append(word_emb) 95 | 96 | if self.char_emb_layer is not None: 97 | chars_inp = chars_inp.view(batch_size * seq_len, -1) 98 | 99 | character_embedding = self.char_emb_layer(Variable(chars_inp).cuda() if self.use_cuda else Variable(chars_inp)) 100 | 101 | character_embedding = torch.transpose(character_embedding, 1, 2) 102 | 103 | cnn_config = self.config['token_embedder'] 104 | if cnn_config['activation'] == 'tanh': 105 | activation = torch.nn.functional.tanh 106 | elif cnn_config['activation'] == 'relu': 107 | activation = torch.nn.functional.relu 108 | else: 109 | raise Exception("Unknown activation") 110 | 111 | convs = [] 112 | 
for i in range(len(self.convolutions)): 113 | convolved = self.convolutions[i](character_embedding) 114 | # (batch_size * sequence_length, n_filters for this width) 115 | convolved, _ = torch.max(convolved, dim=-1) 116 | convolved = activation(convolved) 117 | convs.append(convolved) 118 | char_emb = torch.cat(convs, dim=-1) 119 | char_emb = self.highways(char_emb) 120 | 121 | embs.append(char_emb.view(batch_size, -1, self.n_filters)) 122 | 123 | token_embedding = torch.cat(embs, dim=2) 124 | 125 | return self.projection(token_embedding) 126 | -------------------------------------------------------------------------------- /elmoformanylangs/modules/util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Assorted utilities for working with neural networks in AllenNLP. 3 | """ 4 | from collections import defaultdict 5 | from typing import Dict, List, Optional, Any, Tuple, Callable 6 | import itertools 7 | import math 8 | import torch 9 | from torch.autograd import Variable 10 | 11 | def get_lengths_from_binary_sequence_mask(mask: torch.Tensor): 12 | """ 13 | Compute sequence lengths for each batch element in a tensor using a 14 | binary mask. 15 | Parameters 16 | ---------- 17 | mask : torch.Tensor, required. 18 | A 2D binary mask of shape (batch_size, sequence_length) to 19 | calculate the per-batch sequence lengths from. 20 | Returns 21 | ------- 22 | A torch.LongTensor of shape (batch_size,) representing the lengths 23 | of the sequences in the batch. 24 | """ 25 | return mask.long().sum(-1) 26 | 27 | 28 | def sort_batch_by_length(tensor: torch.autograd.Variable, 29 | sequence_lengths: torch.autograd.Variable): 30 | """ 31 | Sort a batch first tensor by some specified lengths. 32 | Parameters 33 | ---------- 34 | tensor : Variable(torch.FloatTensor), required. 35 | A batch first Pytorch tensor. 36 | sequence_lengths : Variable(torch.LongTensor), required. 37 | A tensor representing the lengths of some dimension of the tensor which 38 | we want to sort by. 39 | Returns 40 | ------- 41 | sorted_tensor : Variable(torch.FloatTensor) 42 | The original tensor sorted along the batch dimension with respect to sequence_lengths. 43 | sorted_sequence_lengths : Variable(torch.LongTensor) 44 | The original sequence_lengths sorted by decreasing size. 45 | restoration_indices : Variable(torch.LongTensor) 46 | Indices into the sorted_tensor such that 47 | ``sorted_tensor.index_select(0, restoration_indices) == original_tensor`` 48 | permuation_index : Variable(torch.LongTensor) 49 | The indices used to sort the tensor. This is useful if you want to sort many 50 | tensors using the same ordering. 51 | """ 52 | 53 | if not isinstance(tensor, Variable) or not isinstance(sequence_lengths, Variable): 54 | raise Exception("Both the tensor and sequence lengths must be torch.autograd.Variables.") 55 | 56 | sorted_sequence_lengths, permutation_index = sequence_lengths.sort(0, descending=True) 57 | sorted_tensor = tensor.index_select(0, permutation_index) 58 | 59 | # This is ugly, but required - we are creating a new variable at runtime, so we 60 | # must ensure it has the correct CUDA vs non-CUDA type. We do this by cloning and 61 | # refilling one of the inputs to the function. 62 | index_range = sequence_lengths.data.clone().copy_(torch.arange(0, len(sequence_lengths))) 63 | # This is the equivalent of zipping with index, sorting by the original 64 | # sequence lengths and returning the now sorted indices. 
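# Worked example (illustrative values): for sequence_lengths = [3, 7, 5] the descending
# sort gives permutation_index = [1, 2, 0] and sorted_tensor = tensor[[1, 2, 0]]; sorting
# permutation_index back yields reverse_mapping = [2, 0, 1], so restoration_indices
# becomes [2, 0, 1] and sorted_tensor.index_select(0, restoration_indices) recovers the
# original batch order.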
65 | index_range = Variable(index_range.long()) 66 | _, reverse_mapping = permutation_index.sort(0, descending=False) 67 | restoration_indices = index_range.index_select(0, reverse_mapping) 68 | return sorted_tensor, sorted_sequence_lengths, restoration_indices, permutation_index 69 | 70 | 71 | def get_final_encoder_states(encoder_outputs: torch.Tensor, 72 | mask: torch.Tensor, 73 | bidirectional: bool = False) -> torch.Tensor: 74 | """ 75 | Given the output from a ``Seq2SeqEncoder``, with shape ``(batch_size, sequence_length, 76 | encoding_dim)``, this method returns the final hidden state for each element of the batch, 77 | giving a tensor of shape ``(batch_size, encoding_dim)``. This is not as simple as 78 | ``encoder_outputs[:, -1]``, because the sequences could have different lengths. We use the 79 | mask (which has shape ``(batch_size, sequence_length)``) to find the final state for each batch 80 | instance. 81 | Additionally, if ``bidirectional`` is ``True``, we will split the final dimension of the 82 | ``encoder_outputs`` into two and assume that the first half is for the forward direction of the 83 | encoder and the second half is for the backward direction. We will concatenate the last state 84 | for each encoder dimension, giving ``encoder_outputs[:, -1, :encoding_dim/2]`` concated with 85 | ``encoder_outputs[:, 0, encoding_dim/2:]``. 86 | """ 87 | # These are the indices of the last words in the sequences (i.e. length sans padding - 1). We 88 | # are assuming sequences are right padded. 89 | # Shape: (batch_size,) 90 | last_word_indices = mask.sum(1).long() - 1 91 | batch_size, _, encoder_output_dim = encoder_outputs.size() 92 | expanded_indices = last_word_indices.view(-1, 1, 1).expand(batch_size, 1, encoder_output_dim) 93 | # Shape: (batch_size, 1, encoder_output_dim) 94 | final_encoder_output = encoder_outputs.gather(1, expanded_indices) 95 | final_encoder_output = final_encoder_output.squeeze(1) # (batch_size, encoder_output_dim) 96 | if bidirectional: 97 | final_forward_output = final_encoder_output[:, :(encoder_output_dim // 2)] 98 | final_backward_output = encoder_outputs[:, 0, (encoder_output_dim // 2):] 99 | final_encoder_output = torch.cat([final_forward_output, final_backward_output], dim=-1) 100 | return final_encoder_output 101 | 102 | 103 | def get_dropout_mask(dropout_probability: float, tensor_for_masking: torch.autograd.Variable): 104 | """ 105 | Computes and returns an element-wise dropout mask for a given tensor, where 106 | each element in the mask is dropped out with probability dropout_probability. 107 | Note that the mask is NOT applied to the tensor - the tensor is passed to retain 108 | the correct CUDA tensor type for the mask. 109 | Parameters 110 | ---------- 111 | dropout_probability : float, required. 112 | Probability of dropping a dimension of the input. 113 | tensor_for_masking : torch.Variable, required. 114 | Returns 115 | ------- 116 | A torch.FloatTensor consisting of the binary mask scaled by 1/ (1 - dropout_probability). 117 | This scaling ensures expected values and variances of the output of applying this mask 118 | and the original tensor are the same. 119 | """ 120 | binary_mask = tensor_for_masking.clone() 121 | binary_mask.data.copy_(torch.rand(tensor_for_masking.size()) > dropout_probability) 122 | # Scale mask by 1/keep_prob to preserve output statistics. 
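# For example, with dropout_probability = 0.25 about 75% of the entries survive and each
# surviving entry is scaled by 1 / (1 - 0.25), so the expected value of
# dropout_mask * tensor equals tensor (the usual "inverted dropout" convention).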
123 | dropout_mask = binary_mask.float().div(1.0 - dropout_probability) 124 | return dropout_mask 125 | 126 | def block_orthogonal(tensor: torch.Tensor, 127 | split_sizes: List[int], 128 | gain: float = 1.0) -> None: 129 | """ 130 | An initializer which allows initializing model parameters in "blocks". This is helpful 131 | in the case of recurrent models which use multiple gates applied to linear projections, 132 | which can be computed efficiently if they are concatenated together. However, they are 133 | separate parameters which should be initialized independently. 134 | Parameters 135 | ---------- 136 | tensor : ``torch.Tensor``, required. 137 | A tensor to initialize. 138 | split_sizes : List[int], required. 139 | A list of length ``tensor.ndim()`` specifying the size of the 140 | blocks along that particular dimension. E.g. ``[10, 20]`` would 141 | result in the tensor being split into chunks of size 10 along the 142 | first dimension and 20 along the second. 143 | gain : float, optional (default = 1.0) 144 | The gain (scaling) applied to the orthogonal initialization. 145 | """ 146 | 147 | if isinstance(tensor, Variable): 148 | # in pytorch 4.0, Variable equals Tensor 149 | # block_orthogonal(tensor.data, split_sizes, gain) 150 | #else: 151 | sizes = list(tensor.size()) 152 | if any([a % b != 0 for a, b in zip(sizes, split_sizes)]): 153 | # ValueError here: the AllenNLP ConfigurationError class is not available in this module. 154 | raise ValueError("tensor dimensions must be divisible by their respective " 155 | "split_sizes. Found size: {} and split_sizes: {}".format(sizes, split_sizes)) 156 | indexes = [list(range(0, max_size, split)) 157 | for max_size, split in zip(sizes, split_sizes)] 158 | # Iterate over all possible blocks within the tensor. 159 | for block_start_indices in itertools.product(*indexes): 160 | # A list of tuples containing the index to start at for this block 161 | # and the appropriate step size (i.e. split_size[i] for dimension i). 162 | index_and_step_tuples = zip(block_start_indices, split_sizes) 163 | # This is a tuple of slices corresponding to: 164 | # tensor[index: index + step_size, ...]. This is 165 | # required because we could have an arbitrary number 166 | # of dimensions. The actual slices we need are the 167 | # start_index: start_index + step for each dimension in the tensor.
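# Worked example (illustrative shapes): for sizes = [10, 20] and split_sizes = [5, 10],
# indexes = [[0, 5], [0, 10]] and itertools.product yields the block origins (0, 0),
# (0, 10), (5, 0) and (5, 10); each origin paired with its step becomes the slice
# start:start + step, i.e. one of the four 5 x 10 sub-blocks, and every sub-block is
# orthogonally initialized independently.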
167 | block_slice = tuple([slice(start_index, start_index + step) 168 | for start_index, step in index_and_step_tuples]) 169 | tensor[block_slice] = torch.nn.init.orthogonal_(tensor[block_slice].contiguous(), gain=gain) 170 | -------------------------------------------------------------------------------- /elmoformanylangs/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import unicode_literals 3 | import collections 4 | import itertools 5 | 6 | 7 | def flatten(lst): 8 | return list(itertools.chain.from_iterable(lst)) 9 | 10 | 11 | def deep_iter(x): 12 | if isinstance(x, list) or isinstance(x, tuple): 13 | for u in x: 14 | for v in deep_iter(u): 15 | yield v 16 | else: 17 | yield x 18 | 19 | 20 | def dict2namedtuple(dic): 21 | return collections.namedtuple('Namespace', dic.keys())(**dic) 22 | -------------------------------------------------------------------------------- /gpt/chat.py: -------------------------------------------------------------------------------- 1 | from gpt.chitchat.interact import chitchat 2 | 3 | while 1: 4 | text = input('text:') 5 | r = chitchat(text) 6 | print(r) 7 | -------------------------------------------------------------------------------- /gpt/chitchat/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/terrifyzhao/nlp_tutorial/fa5cfdf732972469bfce2c452c07bec2077ba407/gpt/chitchat/__init__.py -------------------------------------------------------------------------------- /gpt/chitchat/config/model_config_dialogue_small.json: -------------------------------------------------------------------------------- 1 | { 2 | "initializer_range": 0.02, 3 | "layer_norm_epsilon": 1e-05, 4 | "n_ctx": 300, 5 | "n_embd": 768, 6 | "n_head": 12, 7 | "n_layer": 10, 8 | "n_positions": 300, 9 | "vocab_size": 13317 10 | } -------------------------------------------------------------------------------- /gpt/chitchat/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/terrifyzhao/nlp_tutorial/fa5cfdf732972469bfce2c452c07bec2077ba407/gpt/chitchat/data/.gitkeep -------------------------------------------------------------------------------- /gpt/chitchat/dataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | import torch 3 | 4 | 5 | class MyDataset(Dataset): 6 | """ 7 | Wraps pre-tokenized dialogue samples stored as whitespace-separated token-id strings. 8 | """ 9 | 10 | def __init__(self, data_list): 11 | self.data_list = data_list 12 | 13 | def __getitem__(self, index): 14 | input_ids = self.data_list[index].strip() 15 | input_ids = [int(token_id) for token_id in input_ids.split()] 16 | return input_ids 17 | 18 | def __len__(self): 19 | return len(self.data_list) 20 | -------------------------------------------------------------------------------- /gpt/chitchat/generate_dialogue_subset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from os.path import join 3 | import numpy as np 4 | from collections import Counter 5 | import matplotlib.pyplot as plt 6 | from matplotlib.pyplot import MultipleLocator 7 | 8 | 9 | def generate_subset(): 10 | """ 11 | Generate a subset of the training dialogues. 12 | :return: 13 | """ 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--raw_data_path', default='data/train.txt', type=str, required=False, help='raw training corpus') 16 | parser.add_argument('--subset_size', default=500000,
type=int, required=False, help='要获取的对话数据子集的规模') 17 | parser.add_argument('--subset_data_path', default='data', type=str, required=False, 18 | help='数据子集文件路径,指定文件的父目录') 19 | args = parser.parse_args() 20 | with open(args.raw_data_path, "r", encoding="utf8") as f: 21 | data = f.read() 22 | dialogues = data.split("\n\n") 23 | subset_size = min(len(dialogues), args.subset_size) 24 | 25 | with open(join(args.subset_data_path, "train_{}w.txt".format(int(subset_size / 10000))), "w", encoding="utf8") as f: 26 | print("generating subset,please wait a few seconds ") 27 | for dialogue_index, dialogue in enumerate(dialogues): 28 | if dialogue_index >= subset_size: 29 | break 30 | for utterance in dialogue.split("\n"): 31 | f.writelines(utterance + "\n") 32 | f.writelines("\n") 33 | 34 | 35 | def compute_dialogue_length(): 36 | """ 37 | 查看聊天语料中的dialogue的长度分布 38 | :return: 39 | """ 40 | parser = argparse.ArgumentParser() 41 | parser.add_argument('--raw_data_path', default='data/train.txt', type=str, required=False, help='原始训练语料') 42 | args = parser.parse_args() 43 | with open(args.raw_data_path, "r", encoding="utf8") as f: 44 | data = f.read() 45 | dialogues = data.split("\n\n") 46 | # 统计各个dialogue的长度 47 | dialogues_lengths = [len(dialogue.replace("\n", "")) for dialogue in dialogues] 48 | counter = Counter(dialogues_lengths) # {label:sum(label)} 49 | dialogue_length_arr = list(counter) 50 | num_arr = [counter[element] for element in list(counter)] 51 | print(counter[300]) 52 | 53 | x_major_locator = MultipleLocator(100) # MultipleLocator用于设置刻度间隔 54 | # y_major_locator = MultipleLocator(20000) 55 | ax = plt.gca() # ax为两条坐标轴的实例 56 | ax.xaxis.set_major_locator(x_major_locator) # 把x轴的主刻度设置为10的倍数 57 | # ax.yaxis.set_major_locator(y_major_locator) 58 | 59 | plt.xlabel('dialogue length') 60 | plt.ylabel('number of dialogue') 61 | # plt.plot(dialogue_length_arr, num_arr, c='green') 62 | plt.scatter(dialogue_length_arr, num_arr) 63 | plt.show() 64 | 65 | 66 | if __name__ == '__main__': 67 | compute_dialogue_length() 68 | -------------------------------------------------------------------------------- /gpt/chitchat/interact.py: -------------------------------------------------------------------------------- 1 | import transformers 2 | import torch 3 | import argparse 4 | import logging 5 | from transformers import GPT2Config, GPT2LMHeadModel 6 | from transformers import BertTokenizer 7 | import torch.nn.functional as F 8 | 9 | PAD = '[PAD]' 10 | pad_id = 0 11 | 12 | 13 | def set_interact_args(): 14 | """ 15 | Sets up the training arguments. 
16 | """ 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--device', default='0', type=str, required=False, help='生成设备') 19 | parser.add_argument('--temperature', default=1, type=float, required=False, help='生成的temperature') 20 | parser.add_argument('--topk', default=8, type=int, required=False, help='最高k选1') 21 | parser.add_argument('--topp', default=0, type=float, required=False, help='最高积累概率') 22 | parser.add_argument('--model_config', default='chitchat/config/model_config_dialogue_small.json', type=str, required=False, 23 | help='模型参数') 24 | parser.add_argument('--log_path', default='chitchat/data/interacting.log', type=str, required=False, help='interact日志存放位置') 25 | parser.add_argument('--voca_path', default='chitchat/vocabulary/vocab_small.txt', type=str, required=False, help='选择词库') 26 | parser.add_argument('--dialogue_model_path', default='chitchat/dialogue_model/', type=str, required=False, help='对话模型路径') 27 | parser.add_argument('--save_samples_path', default="chitchat/sample/", type=str, required=False, help="保存聊天记录的文件路径") 28 | parser.add_argument('--repetition_penalty', default=1.0, type=float, required=False, 29 | help="重复惩罚参数,若生成的对话重复性较高,可适当提高该参数") 30 | parser.add_argument('--seed', type=int, default=None, help='设置种子用于生成随机数,以使得训练的结果是确定的') 31 | parser.add_argument('--max_len', type=int, default=25, help='每个utterance的最大长度,超过指定长度则进行截断') 32 | parser.add_argument('--max_history_len', type=int, default=5, help="dialogue history的最大长度") 33 | parser.add_argument('--no_cuda', action='store_true', help='不使用GPU进行预测') 34 | return parser.parse_args() 35 | 36 | 37 | def create_logger(args): 38 | """ 39 | 将日志输出到日志文件和控制台 40 | """ 41 | logger = logging.getLogger(__name__) 42 | logger.setLevel(logging.INFO) 43 | 44 | formatter = logging.Formatter( 45 | '%(asctime)s - %(levelname)s - %(message)s') 46 | 47 | # 创建一个handler,用于写入日志文件 48 | file_handler = logging.FileHandler( 49 | filename=args.log_path) 50 | file_handler.setFormatter(formatter) 51 | file_handler.setLevel(logging.INFO) 52 | logger.addHandler(file_handler) 53 | 54 | # 创建一个handler,用于将日志输出到控制台 55 | console = logging.StreamHandler() 56 | console.setLevel(logging.DEBUG) 57 | console.setFormatter(formatter) 58 | logger.addHandler(console) 59 | 60 | return logger 61 | 62 | 63 | def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')): 64 | """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering 65 | Args: 66 | logits: logits distribution shape (vocabulary size) 67 | top_k > 0: keep only top k tokens with highest probability (top-k filtering). 68 | top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). 69 | Nucleus filtering is described in Holtzman et al. 
(http://arxiv.org/abs/1904.09751) 70 | From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 71 | """ 72 | assert logits.dim() == 1 # batch size 1 for now - could be updated for more but the code would be less clear 73 | top_k = min(top_k, logits.size(-1)) # Safety check 74 | if top_k > 0: 75 | # Remove all tokens with a probability less than the last token of the top-k 76 | # torch.topk()返回最后一维最大的top_k个元素,返回值为二维(values,indices) 77 | # ...表示其他维度由计算机自行推断 78 | indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] 79 | logits[indices_to_remove] = filter_value # 对于topk之外的其他元素的logits值设为负无穷 80 | 81 | if top_p > 0.0: 82 | sorted_logits, sorted_indices = torch.sort(logits, descending=True) # 对logits进行递减排序 83 | cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) 84 | 85 | # Remove tokens with cumulative probability above the threshold 86 | sorted_indices_to_remove = cumulative_probs > top_p 87 | # Shift the indices to the right to keep also the first token above the threshold 88 | sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() 89 | sorted_indices_to_remove[..., 0] = 0 90 | 91 | indices_to_remove = sorted_indices[sorted_indices_to_remove] 92 | logits[indices_to_remove] = filter_value 93 | return logits 94 | 95 | 96 | args = set_interact_args() 97 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 98 | tokenizer = BertTokenizer(vocab_file=args.voca_path) 99 | model = GPT2LMHeadModel.from_pretrained(args.dialogue_model_path) 100 | model.to(device) 101 | model.eval() 102 | 103 | history = [] 104 | 105 | 106 | def chitchat(text): 107 | history.append(tokenizer.encode(text)) 108 | input_ids = [] # 每个input以[CLS]为开头 109 | 110 | for history_id, history_utr in enumerate(history[-args.max_history_len:]): 111 | input_ids.extend(history_utr) 112 | # input_ids.append(tokenizer.sep_token_id) 113 | curr_input_tensor = torch.tensor(input_ids).long().to(device) 114 | generated = [] 115 | # 最多生成max_len个token 116 | for _ in range(args.max_len): 117 | outputs = model(input_ids=curr_input_tensor) 118 | next_token_logits = outputs[0][-1, :] 119 | # 对于已生成的结果generated中的每个token添加一个重复惩罚项,降低其生成概率 120 | for id in set(generated): 121 | next_token_logits[id] /= args.repetition_penalty 122 | next_token_logits = next_token_logits / args.temperature 123 | # 对于[UNK]的概率设为无穷小,也就是说模型的预测结果不可能是[UNK]这个token 124 | next_token_logits[tokenizer.convert_tokens_to_ids('[UNK]')] = -float('Inf') 125 | filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=args.topk, top_p=args.topp) 126 | # torch.multinomial表示从候选集合中无放回地进行抽取num_samples个元素,权重越高,抽到的几率越高,返回元素的下标 127 | next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1) 128 | if next_token == tokenizer.sep_token_id: # 遇到[SEP]则表明response生成结束 129 | break 130 | generated.append(next_token.item()) 131 | curr_input_tensor = torch.cat((curr_input_tensor, next_token), dim=0) 132 | # his_text = tokenizer.convert_ids_to_tokens(curr_input_tensor.tolist()) 133 | # print("his_text:{}".format(his_text)) 134 | history.append(generated) 135 | text = tokenizer.convert_ids_to_tokens(generated) 136 | return ''.join(text) 137 | 138 | 139 | if __name__ == '__main__': 140 | # main() 141 | while 1: 142 | text = input('user:') 143 | print(chitchat(text)) 144 | -------------------------------------------------------------------------------- /gpt/chitchat/interact_mmi.py: -------------------------------------------------------------------------------- 1 | import transformers 2 | import 
torch 3 | import os 4 | import json 5 | import random 6 | import numpy as np 7 | import argparse 8 | from torch.utils.tensorboard import SummaryWriter 9 | from datetime import datetime 10 | from tqdm import tqdm 11 | from torch.nn import DataParallel 12 | import logging 13 | from transformers.modeling_gpt2 import GPT2Config, GPT2LMHeadModel 14 | from transformers import BertTokenizer 15 | from os.path import join, exists 16 | from itertools import zip_longest, chain 17 | from dataset import MyDataset 18 | from torch.utils.data import Dataset, DataLoader 19 | from torch.nn import CrossEntropyLoss 20 | from sklearn.model_selection import train_test_split 21 | from train import create_model 22 | import torch.nn.functional as F 23 | import copy 24 | 25 | PAD = '[PAD]' 26 | pad_id = 0 27 | 28 | 29 | def set_interact_args(): 30 | """ 31 | Sets up the training arguments. 32 | """ 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument('--device', default='0', type=str, required=False, help='生成设备') 35 | parser.add_argument('--temperature', default=1, type=float, required=False, help='生成的temperature') 36 | parser.add_argument('--topk', default=8, type=int, required=False, help='最高k选1') 37 | parser.add_argument('--topp', default=0, type=float, required=False, help='最高积累概率') 38 | parser.add_argument('--model_config', default='config/model_config_dialogue_small.json', type=str, required=False, 39 | help='模型参数') 40 | parser.add_argument('--log_path', default='data/interacting_mmi.log', type=str, required=False, 41 | help='interact_mmi日志存放位置') 42 | parser.add_argument('--voca_path', default='vocabulary/vocab_small.txt', type=str, required=False, help='选择词库') 43 | parser.add_argument('--dialogue_model_path', default='dialogue_model/', type=str, required=False, 44 | help='dialogue_model路径') 45 | parser.add_argument('--mmi_model_path', default='mmi_model/', type=str, required=False, 46 | help='互信息mmi_model路径') 47 | parser.add_argument('--save_samples_path', default="sample/", type=str, required=False, help="保存聊天记录的文件路径") 48 | parser.add_argument('--repetition_penalty', default=1.0, type=float, required=False, 49 | help="重复惩罚参数,若生成的对话重复性较高,可适当提高该参数") 50 | parser.add_argument('--seed', type=int, default=None, help='设置种子用于生成随机数,以使得训练的结果是确定的') 51 | parser.add_argument('--max_len', type=int, default=25, help='每个utterance的最大长度,超过指定长度则进行截断') 52 | parser.add_argument('--max_history_len', type=int, default=5, help="dialogue history的最大长度") 53 | parser.add_argument('--no_cuda', action='store_true', help='不使用GPU进行预测') 54 | parser.add_argument('--batch_size', type=int, default=5, help='批量生成response,然后经过MMI模型进行筛选') 55 | parser.add_argument('--debug', action='store_true', help='指定该参数,可以查看生成的所有候选的reponse,及其loss') 56 | return parser.parse_args() 57 | 58 | 59 | def create_logger(args): 60 | """ 61 | 将日志输出到日志文件和控制台 62 | """ 63 | logger = logging.getLogger(__name__) 64 | logger.setLevel(logging.INFO) 65 | 66 | formatter = logging.Formatter( 67 | '%(asctime)s - %(levelname)s - %(message)s') 68 | 69 | # 创建一个handler,用于写入日志文件 70 | file_handler = logging.FileHandler( 71 | filename=args.log_path) 72 | file_handler.setFormatter(formatter) 73 | file_handler.setLevel(logging.INFO) 74 | logger.addHandler(file_handler) 75 | 76 | # 创建一个handler,用于将日志输出到控制台 77 | console = logging.StreamHandler() 78 | console.setLevel(logging.DEBUG) 79 | console.setFormatter(formatter) 80 | logger.addHandler(console) 81 | 82 | return logger 83 | 84 | 85 | def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')): 86 | """ 
Filter a distribution of logits using top-k and/or nucleus (top-p) filtering 87 | Args: 88 | logits: logits distribution shape (vocabulary size) 89 | top_k > 0: keep only top k tokens with highest probability (top-k filtering). 90 | top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). 91 | Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751) 92 | """ 93 | assert logits.dim() == 2 94 | top_k = min(top_k, logits[0].size(-1)) # Safety check 95 | if top_k > 0: 96 | # Remove all tokens with a probability less than the last token of the top-k 97 | # torch.topk()返回最后一维最大的top_k个元素,返回值为二维(values,indices) 98 | # ...表示其他维度由计算机自行推断 99 | for logit in logits: 100 | indices_to_remove = logit < torch.topk(logit, top_k)[0][..., -1, None] 101 | logit[indices_to_remove] = filter_value # 对于topk之外的其他元素的logits值设为负无穷 102 | 103 | if top_p > 0.0: 104 | sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1) # 对logits进行递减排序 105 | cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) 106 | 107 | # Remove tokens with cumulative probability above the threshold 108 | sorted_indices_to_remove = cumulative_probs > top_p 109 | # Shift the indices to the right to keep also the first token above the threshold 110 | sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() 111 | sorted_indices_to_remove[..., 0] = 0 112 | for index, logit in enumerate(logits): 113 | indices_to_remove = sorted_indices[index][sorted_indices_to_remove[index]] 114 | logit[indices_to_remove] = filter_value 115 | return logits 116 | 117 | 118 | def main(): 119 | args = set_interact_args() 120 | logger = create_logger(args) 121 | # 当用户使用GPU,并且GPU可用时 122 | args.cuda = torch.cuda.is_available() and not args.no_cuda 123 | device = 'cuda' if args.cuda else 'cpu' 124 | logger.info('using device:{}'.format(device)) 125 | os.environ["CUDA_VISIBLE_DEVICES"] = args.device 126 | tokenizer = BertTokenizer(vocab_file=args.voca_path) 127 | # 对话model 128 | dialogue_model = GPT2LMHeadModel.from_pretrained(args.dialogue_model_path) 129 | dialogue_model.to(device) 130 | dialogue_model.eval() 131 | # 互信息mmi model 132 | mmi_model = GPT2LMHeadModel.from_pretrained(args.mmi_model_path) 133 | mmi_model.to(device) 134 | mmi_model.eval() 135 | if args.save_samples_path: 136 | if not os.path.exists(args.save_samples_path): 137 | os.makedirs(args.save_samples_path) 138 | samples_file = open(args.save_samples_path + '/mmi_samples.txt', 'a', encoding='utf8') 139 | samples_file.write("聊天记录{}:\n".format(datetime.now())) 140 | # 存储聊天记录,每个utterance以token的id的形式进行存储 141 | history = [] 142 | print('开始和chatbot聊天,输入CTRL + Z以退出') 143 | 144 | while True: 145 | try: 146 | text = input("user:") 147 | if args.save_samples_path: 148 | samples_file.write("user:{}\n".format(text)) 149 | history.append(tokenizer.encode(text)) 150 | input_ids = [tokenizer.cls_token_id] # 每个input以[CLS]为开头 151 | for history_id, history_utr in enumerate(history[-args.max_history_len:]): 152 | input_ids.extend(history_utr) 153 | input_ids.append(tokenizer.sep_token_id) 154 | # 用于批量生成response,维度为(batch_size,token_len) 155 | input_ids = [copy.deepcopy(input_ids) for _ in range(args.batch_size)] 156 | 157 | curr_input_tensors = torch.tensor(input_ids).long().to(device) 158 | generated = [] # 二维数组,维度为(生成的response的最大长度,batch_size),generated[i,j]表示第j个response的第i个token的id 159 | finish_set = set() # 标记是否所有response均已生成结束,若第i个response生成结束,即生成了sep_token_id,则将i放入finish_set 160 | # 最多生成max_len个token 
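# Batched sampling loop (summary of the steps below): at each of the up-to-max_len steps
# the dialogue model scores the next token for all batch_size candidate responses at once;
# logits of tokens already generated are divided by repetition_penalty, all logits are
# divided by temperature, [UNK] is masked out, top-k / top-p filtering is applied, and one
# token per candidate is drawn with torch.multinomial. A candidate is marked finished once
# it samples [SEP], and the loop stops early when every candidate has finished.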
161 | for _ in range(args.max_len): 162 | outputs = dialogue_model(input_ids=curr_input_tensors) 163 | next_token_logits = outputs[0][:, -1, :] 164 | # 对于已生成的结果generated中的每个token添加一个重复惩罚项,降低其生成概率 165 | for index in range(args.batch_size): 166 | for token_id in set([token_ids[index] for token_ids in generated]): 167 | next_token_logits[index][token_id] /= args.repetition_penalty 168 | next_token_logits = next_token_logits / args.temperature 169 | # 对于[UNK]的概率设为无穷小,也就是说模型的预测结果不可能是[UNK]这个token 170 | for next_token_logit in next_token_logits: 171 | next_token_logit[tokenizer.convert_tokens_to_ids('[UNK]')] = -float('Inf') 172 | filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=args.topk, top_p=args.topp) 173 | # torch.multinomial表示从候选集合中无放回地进行抽取num_samples个元素,权重越高,抽到的几率越高,返回元素的下标 174 | next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1) 175 | # 判断是否有response生成了[SEP],将已生成了[SEP]的resposne进行标记 176 | for index, token_id in enumerate(next_token[:, 0]): 177 | if token_id == tokenizer.sep_token_id: 178 | finish_set.add(index) 179 | # 检验是否所有的response均已生成[SEP] 180 | finish_flag = True # 是否所有的response均已生成[SEP]的token 181 | for index in range(args.batch_size): 182 | if index not in finish_set: # response批量生成未完成 183 | finish_flag = False 184 | break 185 | if finish_flag: 186 | break 187 | generated.append([token.item() for token in next_token[:, 0]]) 188 | # 将新生成的token与原来的token进行拼接 189 | curr_input_tensors = torch.cat((curr_input_tensors, next_token), dim=-1) 190 | candidate_responses = [] # 生成的所有候选response 191 | for batch_index in range(args.batch_size): 192 | response = [] 193 | for token_index in range(len(generated)): 194 | if generated[token_index][batch_index] != tokenizer.sep_token_id: 195 | response.append(generated[token_index][batch_index]) 196 | else: 197 | break 198 | candidate_responses.append(response) 199 | 200 | # mmi模型的输入 201 | if args.debug: 202 | print("candidate response:") 203 | samples_file.write("candidate response:\n") 204 | min_loss = float('Inf') 205 | best_response = "" 206 | for response in candidate_responses: 207 | mmi_input_id = [tokenizer.cls_token_id] # 每个input以[CLS]为开头 208 | mmi_input_id.extend(response) 209 | mmi_input_id.append(tokenizer.sep_token_id) 210 | for history_utr in reversed(history[-args.max_history_len:]): 211 | mmi_input_id.extend(history_utr) 212 | mmi_input_id.append(tokenizer.sep_token_id) 213 | mmi_input_tensor = torch.tensor(mmi_input_id).long().to(device) 214 | out = mmi_model(input_ids=mmi_input_tensor, labels=mmi_input_tensor) 215 | loss = out[0].item() 216 | if args.debug: 217 | text = tokenizer.convert_ids_to_tokens(response) 218 | print("{} loss:{}".format("".join(text), loss)) 219 | samples_file.write("{} loss:{}\n".format("".join(text), loss)) 220 | if loss < min_loss: 221 | best_response = response 222 | min_loss = loss 223 | history.append(best_response) 224 | text = tokenizer.convert_ids_to_tokens(best_response) 225 | print("chatbot:" + "".join(text)) 226 | if args.save_samples_path: 227 | samples_file.write("chatbot:{}\n".format("".join(text))) 228 | except KeyboardInterrupt: 229 | if args.save_samples_path: 230 | samples_file.close() 231 | break 232 | 233 | 234 | if __name__ == '__main__': 235 | main() 236 | -------------------------------------------------------------------------------- /gpt/gpt_lyric.py: -------------------------------------------------------------------------------- 1 | from transformers import BertTokenizer, GPT2LMHeadModel, TextGenerationPipeline 2 | 3 | tokenizer = 
BertTokenizer.from_pretrained("uer/gpt2-chinese-lyric") 4 | model = GPT2LMHeadModel.from_pretrained("uer/gpt2-chinese-lyric") 5 | 6 | text_generator = TextGenerationPipeline(model, tokenizer) 7 | res = text_generator("最美的不是下雨天", max_length=100, do_sample=True) 8 | print(res) 9 | -------------------------------------------------------------------------------- /pseudo/first_stage.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import jieba 4 | import numpy as np 5 | from collections import defaultdict 6 | from torch.utils.data import DataLoader, TensorDataset 7 | from sklearn.metrics import accuracy_score 8 | from utils import fix_seed 9 | import pandas as pd 10 | 11 | 12 | class TextCLS(torch.nn.Module): 13 | # 准备我们需要用到的参数和layer 14 | def __init__(self, 15 | vocab_size, 16 | embedding_size): 17 | super().__init__() 18 | self.embedding = nn.Embedding(vocab_size, embedding_size) 19 | # [batch_size, seq_len, hidden_size] 20 | self.lstm = nn.LSTM(input_size=embedding_size, 21 | hidden_size=256, 22 | num_layers=2, 23 | batch_first=True) 24 | self.dense1 = nn.Linear(256, 100) 25 | self.dense2 = nn.Linear(100, 5) 26 | 27 | # 前向传播,那我们准备好的layer拼接在一起 28 | def forward(self, x): 29 | embedding = self.embedding(x) 30 | # [batch_size, seq_len, hidden_size] 31 | out, _ = self.lstm(embedding) 32 | out = self.dense1(out[:, -1, :]) 33 | out = self.dense2(out) 34 | return out 35 | 36 | 37 | def tokenize(string): 38 | res = list(jieba.cut(string, cut_all=False)) 39 | return res 40 | 41 | 42 | # 把数据转换成index 43 | def seq2index(seq, vocab): 44 | seg = tokenize(seq) 45 | seg_index = [] 46 | for s in seg: 47 | seg_index.append(vocab.get(s, 1)) 48 | return seg_index 49 | 50 | 51 | # 统一长度 52 | def padding_seq(X, max_len=10): 53 | return np.array([ 54 | np.concatenate([x, [0] * (max_len - len(x))]) if len(x) < max_len else x[:max_len] for x in X 55 | ]) 56 | 57 | 58 | def load_data(batch_size=32): 59 | df = pd.read_csv('../data/tnews_public/train.csv') 60 | train_text = df['text'].values 61 | train_label = df['label'].values 62 | 63 | df = pd.read_csv('../data/tnews_public/dev.csv') 64 | dev_text = df['text'].values 65 | dev_label = df['label'].values 66 | 67 | # 生成词典 68 | segment = [tokenize(t) for t in train_text] 69 | 70 | word_frequency = defaultdict(int) 71 | for row in segment: 72 | for i in row: 73 | word_frequency[i] += 1 74 | 75 | word_sort = sorted(word_frequency.items(), key=lambda x: x[1], reverse=True) # 根据词频降序排序 76 | 77 | vocab = {'[PAD]': 0, '[UNK]': 1} 78 | for d in word_sort: 79 | vocab[d[0]] = len(vocab) 80 | 81 | train_x = padding_seq([seq2index(t, vocab) for t in train_text]) 82 | train_y = np.array(train_label) 83 | train_data_set = TensorDataset(torch.from_numpy(train_x), 84 | torch.from_numpy(train_y)) 85 | train_data_loader = DataLoader(dataset=train_data_set, batch_size=batch_size) 86 | 87 | dev_x = padding_seq([seq2index(t, vocab) for t in dev_text]) 88 | dev_y = np.array(dev_label) 89 | dev_data_set = TensorDataset(torch.from_numpy(dev_x), 90 | torch.from_numpy(dev_y)) 91 | dev_data_loader = DataLoader(dataset=dev_data_set, batch_size=batch_size) 92 | 93 | return train_data_loader, dev_data_loader, vocab 94 | 95 | 96 | def pseudo_data(model, data): 97 | pseudo = [] 98 | pseudo_label = [] 99 | for step, (b_x, b_y) in enumerate(data): 100 | if torch.cuda.is_available(): 101 | b_x = b_x.cuda().long() 102 | with torch.no_grad(): 103 | # logits 104 | output = model(b_x) 105 | pred = torch.argmax(output, dim=1) 106 | 
# 拿到对应的置信度 107 | out = torch.softmax(output, dim=1) 108 | 109 | for i, (p, o) in enumerate(zip(pred, out)): 110 | if o[p] > 0.95: 111 | index = step * 128 + i 112 | pseudo.append(index) 113 | pseudo_label.append(p.item()) 114 | df = pd.read_csv('../data/tnews_public/dev.csv') 115 | dev_text = df['text'].values 116 | pseudo_text = dev_text[pseudo] 117 | df = pd.DataFrame({'text': pseudo_text, 'label': pseudo_label}) 118 | df.to_csv('pseudo.csv', index=False, encoding='utf_8_sig') 119 | 120 | 121 | # 训练模型 122 | def train(): 123 | fix_seed() 124 | 125 | train_data_loader, dev_data_loader, vocab = load_data(128) 126 | model = TextCLS(vocab_size=len(vocab), 127 | embedding_size=100) 128 | 129 | optimizer = torch.optim.Adam(model.parameters(), lr=0.01) 130 | loss_func = nn.CrossEntropyLoss() 131 | 132 | if torch.cuda.is_available(): 133 | model = model.cuda() 134 | 135 | for epoch in range(5): 136 | print('epoch:', epoch + 1) 137 | pred = [] 138 | label = [] 139 | for step, (b_x, b_y) in enumerate(train_data_loader): 140 | if torch.cuda.is_available(): 141 | b_x = b_x.cuda().long() 142 | b_y = b_y.cuda().long() 143 | output = model(b_x) 144 | pred.extend(torch.argmax(output, dim=1).cpu().numpy()) 145 | label.extend(b_y.cpu().numpy()) 146 | loss = loss_func(output, b_y) 147 | optimizer.zero_grad() 148 | # 求解梯度 149 | loss.backward() 150 | # 更新我们的权重 151 | optimizer.step() 152 | acc = accuracy_score(pred, label) 153 | print('train acc:', acc) 154 | 155 | pred = [] 156 | label = [] 157 | for step, (b_x, b_y) in enumerate(dev_data_loader): 158 | if torch.cuda.is_available(): 159 | b_x = b_x.cuda().long() 160 | b_y = b_y.cuda().long() 161 | with torch.no_grad(): 162 | output = model(b_x) 163 | pred.extend(torch.argmax(output, dim=1).cpu().numpy()) 164 | label.extend(b_y.cpu().numpy()) 165 | acc = accuracy_score(pred, label) 166 | print('dev acc:', acc) 167 | print() 168 | 169 | pseudo_data(model, dev_data_loader) 170 | 171 | 172 | if __name__ == '__main__': 173 | train() 174 | -------------------------------------------------------------------------------- /pseudo/second_stage.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import jieba 4 | import numpy as np 5 | from collections import defaultdict 6 | from torch.utils.data import DataLoader, TensorDataset 7 | from sklearn.metrics import accuracy_score 8 | from utils import fix_seed 9 | import pandas as pd 10 | 11 | 12 | class TextCLS(torch.nn.Module): 13 | # 准备我们需要用到的参数和layer 14 | def __init__(self, 15 | vocab_size, 16 | embedding_size): 17 | super().__init__() 18 | self.embedding = nn.Embedding(vocab_size, embedding_size) 19 | # [batch_size, seq_len, hidden_size] 20 | self.lstm = nn.LSTM(input_size=embedding_size, 21 | hidden_size=256, 22 | num_layers=2, 23 | batch_first=True) 24 | self.dense1 = nn.Linear(256, 100) 25 | self.dense2 = nn.Linear(100, 5) 26 | 27 | # 前向传播,那我们准备好的layer拼接在一起 28 | def forward(self, x): 29 | embedding = self.embedding(x) 30 | # [batch_size, seq_len, hidden_size] 31 | out, _ = self.lstm(embedding) 32 | out = self.dense1(out[:, -1, :]) 33 | out = self.dense2(out) 34 | return out 35 | 36 | 37 | def tokenize(string): 38 | res = list(jieba.cut(string, cut_all=False)) 39 | return res 40 | 41 | 42 | # 把数据转换成index 43 | def seq2index(seq, vocab): 44 | seg = tokenize(seq) 45 | seg_index = [] 46 | for s in seg: 47 | seg_index.append(vocab.get(s, 1)) 48 | return seg_index 49 | 50 | 51 | # 统一长度 52 | def padding_seq(X, max_len=10): 53 | return np.array([ 54 
| np.concatenate([x, [0] * (max_len - len(x))]) if len(x) < max_len else x[:max_len] for x in X 55 | ]) 56 | 57 | 58 | def load_data(batch_size=32): 59 | df = pd.read_csv('../data/tnews_public/train.csv') 60 | train_text = df['text'].values 61 | train_label = df['label'].values 62 | 63 | df = pd.read_csv('../data/tnews_public/dev.csv') 64 | df2 = pd.read_csv('pseudo.csv') 65 | df = df.append(df2) 66 | dev_text = df['text'].values 67 | dev_label = df['label'].values 68 | 69 | # 生成词典 70 | segment = [tokenize(t) for t in train_text] 71 | 72 | word_frequency = defaultdict(int) 73 | for row in segment: 74 | for i in row: 75 | word_frequency[i] += 1 76 | 77 | word_sort = sorted(word_frequency.items(), key=lambda x: x[1], reverse=True) # 根据词频降序排序 78 | 79 | vocab = {'[PAD]': 0, '[UNK]': 1} 80 | for d in word_sort: 81 | vocab[d[0]] = len(vocab) 82 | 83 | train_x = padding_seq([seq2index(t, vocab) for t in train_text]) 84 | train_y = np.array(train_label) 85 | train_data_set = TensorDataset(torch.from_numpy(train_x), 86 | torch.from_numpy(train_y)) 87 | train_data_loader = DataLoader(dataset=train_data_set, batch_size=batch_size) 88 | 89 | dev_x = padding_seq([seq2index(t, vocab) for t in dev_text]) 90 | dev_y = np.array(dev_label) 91 | dev_data_set = TensorDataset(torch.from_numpy(dev_x), 92 | torch.from_numpy(dev_y)) 93 | dev_data_loader = DataLoader(dataset=dev_data_set, batch_size=batch_size) 94 | 95 | return train_data_loader, dev_data_loader, vocab 96 | 97 | 98 | def pseudo_data(model, data): 99 | pseudo = [] 100 | pseudo_label = [] 101 | for step, (b_x, b_y) in enumerate(data): 102 | if torch.cuda.is_available(): 103 | b_x = b_x.cuda().long() 104 | with torch.no_grad(): 105 | output = model(b_x) 106 | pred = torch.argmax(output, dim=1) 107 | out = torch.softmax(output, dim=1) 108 | for i, (p, o) in enumerate(zip(pred, out)): 109 | if o[p] > 0.95: 110 | index = step * 128 + i 111 | pseudo.append(index) 112 | pseudo_label.append(b_y[i]) 113 | df = pd.read_csv('../data/tnews_public/dev.csv') 114 | dev_text = df['text'].values 115 | pseudo_text = dev_text[pseudo] 116 | df = pd.DataFrame({'text': pseudo_text, 'label': pseudo_label}) 117 | df.to_csv('pseudo.csv', index=False, encoding='utf_8_sig') 118 | 119 | 120 | # 训练模型 121 | def train(): 122 | fix_seed() 123 | 124 | train_data_loader, dev_data_loader, vocab = load_data(128) 125 | model = TextCLS(vocab_size=len(vocab), 126 | embedding_size=100) 127 | 128 | optimizer = torch.optim.Adam(model.parameters(), lr=0.01) 129 | loss_func = nn.CrossEntropyLoss() 130 | 131 | if torch.cuda.is_available(): 132 | model = model.cuda() 133 | 134 | for epoch in range(5): 135 | print('epoch:', epoch + 1) 136 | pred = [] 137 | label = [] 138 | for step, (b_x, b_y) in enumerate(train_data_loader): 139 | if torch.cuda.is_available(): 140 | b_x = b_x.cuda().long() 141 | b_y = b_y.cuda().long() 142 | output = model(b_x) 143 | pred.extend(torch.argmax(output, dim=1).cpu().numpy()) 144 | label.extend(b_y.cpu().numpy()) 145 | loss = loss_func(output, b_y) 146 | optimizer.zero_grad() 147 | # 求解梯度 148 | loss.backward() 149 | # 更新我们的权重 150 | optimizer.step() 151 | acc = accuracy_score(pred, label) 152 | print('train acc:', acc) 153 | 154 | pred = [] 155 | label = [] 156 | for step, (b_x, b_y) in enumerate(dev_data_loader): 157 | if torch.cuda.is_available(): 158 | b_x = b_x.cuda().long() 159 | b_y = b_y.cuda().long() 160 | with torch.no_grad(): 161 | output = model(b_x) 162 | pred.extend(torch.argmax(output, dim=1).cpu().numpy()) 163 | label.extend(b_y.cpu().numpy()) 164 | 
acc = accuracy_score(pred, label) 165 | print('dev acc:', acc) 166 | print() 167 | 168 | 169 | if __name__ == '__main__': 170 | train() 171 | -------------------------------------------------------------------------------- /ptm/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/terrifyzhao/nlp_tutorial/fa5cfdf732972469bfce2c452c07bec2077ba407/ptm/.gitkeep -------------------------------------------------------------------------------- /ptm/post train_bert.py: -------------------------------------------------------------------------------- 1 | from transformers import BertTokenizer, BertForMaskedLM 2 | from torch.utils.data import DataLoader, Dataset 3 | from tqdm import tqdm 4 | from utils import fix_seed 5 | from annlp import ptm_path 6 | import torch 7 | from utils import random_mask 8 | import pandas as pd 9 | 10 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 11 | path = ptm_path('roberta') 12 | tokenizer = BertTokenizer.from_pretrained(path) 13 | 14 | 15 | class BaseDataset(Dataset): 16 | def __init__(self, encodings, labels=None): 17 | self.encodings = encodings 18 | self.labels = labels 19 | 20 | def __getitem__(self, idx): 21 | item = {key: val[idx] for key, val in self.encodings.items()} 22 | return item 23 | 24 | def __len__(self): 25 | return len(self.encodings['source']) 26 | 27 | 28 | def load_data(file_name, batch_size): 29 | df = pd.read_csv(file_name) 30 | encoding = tokenizer(text=df['text'].tolist(), 31 | return_tensors='np', 32 | truncation=True, 33 | padding='max_length', 34 | max_length=10) 35 | sources = [] 36 | targets = [] 37 | for input_ids in encoding['input_ids']: 38 | source, target = random_mask(input_ids, tokenizer) 39 | sources.append(source) 40 | targets.append(target) 41 | 42 | data = {'source': torch.Tensor(sources), 43 | 'attention_mask': encoding['attention_mask'], 44 | 'target': torch.Tensor(targets)} 45 | data_loader = DataLoader(BaseDataset(data), 46 | batch_size, 47 | pin_memory=True if torch.cuda.is_available() else False, 48 | shuffle=False) 49 | return data_loader 50 | 51 | 52 | # 训练模型 53 | def train(): 54 | fix_seed() 55 | 56 | train_data_loader = load_data('../data/tnews_public/train.csv', batch_size=32) 57 | dev_data_loader = load_data('../data/tnews_public/dev.csv', batch_size=32) 58 | 59 | model = BertForMaskedLM.from_pretrained(path) 60 | model = model.to(device) 61 | optimizer = torch.optim.Adam(model.parameters(), lr=5e-6) 62 | 63 | min_loss = 1000 64 | for epoch in range(5): 65 | print('epoch:', epoch + 1) 66 | pbar = tqdm(train_data_loader) 67 | for data in pbar: 68 | optimizer.zero_grad() 69 | 70 | input_ids = data['source'].to(device) 71 | attention_mask = data['attention_mask'].to(device) 72 | labels = data['target'].to(device).long() 73 | outputs = model(input_ids.long(), attention_mask=attention_mask, labels=labels) 74 | loss = outputs.loss 75 | loss.backward() 76 | optimizer.step() 77 | 78 | pbar.update() 79 | pbar.set_description(f'loss:{loss.item():.4f}') 80 | 81 | dev_loss = 0 82 | for data in tqdm(dev_data_loader): 83 | input_ids = data['source'].to(device) 84 | attention_mask = data['attention_mask'].to(device) 85 | labels = data['target'].to(device).long() 86 | with torch.no_grad(): 87 | outputs = model(input_ids.long(), attention_mask=attention_mask, labels=labels) 88 | dev_loss += outputs.loss.item() 89 | print('dev loss:', dev_loss / len(dev_data_loader)) 90 | print() 91 | 92 | if min_loss > dev_loss: 93 | 
min_loss = dev_loss 94 | torch.save(model, 'model.bin') 95 | 96 | 97 | if __name__ == '__main__': 98 | train() 99 | -------------------------------------------------------------------------------- /ptm/post train_gpt.py: -------------------------------------------------------------------------------- 1 | from transformers import BertTokenizer, GPT2LMHeadModel 2 | from torch.utils.data import DataLoader, Dataset 3 | from tqdm import tqdm 4 | from utils import fix_seed 5 | import torch 6 | import pandas as pd 7 | 8 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 9 | path = 'E:\\ptm\\gpt' 10 | tokenizer = BertTokenizer.from_pretrained(path) 11 | 12 | 13 | class BaseDataset(Dataset): 14 | def __init__(self, encodings, labels=None): 15 | self.encodings = encodings 16 | self.labels = labels 17 | 18 | def __getitem__(self, idx): 19 | item = {key: val[idx] for key, val in self.encodings.items()} 20 | return item 21 | 22 | def __len__(self): 23 | return len(self.encodings['source']) 24 | 25 | 26 | def load_data(file_name, batch_size): 27 | df = pd.read_csv(file_name) 28 | encoding = tokenizer(text=df['text'].tolist(), 29 | return_tensors='np', 30 | truncation=True, 31 | padding='max_length', 32 | max_length=10) 33 | sources = [] 34 | targets = [] 35 | for input_ids in encoding['input_ids']: 36 | sources.append(input_ids[0:-1]) 37 | targets.append(input_ids[1:]) 38 | 39 | # [101, 1, 2, 3, 102] 40 | # source:[101,1,2,3] 41 | # target:[1,2,3,102] 42 | 43 | data = {'source': torch.Tensor(sources), 44 | 'attention_mask': torch.Tensor([mask[:-1] for mask in encoding['attention_mask']]), 45 | 'target': torch.Tensor(targets)} 46 | data_loader = DataLoader(BaseDataset(data), 47 | batch_size, 48 | pin_memory=True if torch.cuda.is_available() else False, 49 | shuffle=False) 50 | return data_loader 51 | 52 | 53 | # 训练模型 54 | def train(): 55 | fix_seed() 56 | 57 | train_data_loader = load_data('../data/tnews_public/train.csv', batch_size=32) 58 | dev_data_loader = load_data('../data/tnews_public/dev.csv', batch_size=32) 59 | 60 | model = GPT2LMHeadModel.from_pretrained(path) 61 | model = model.to(device) 62 | optimizer = torch.optim.Adam(model.parameters(), lr=5e-6) 63 | 64 | for epoch in range(5): 65 | print('epoch:', epoch + 1) 66 | pbar = tqdm(train_data_loader) 67 | for data in pbar: 68 | optimizer.zero_grad() 69 | 70 | input_ids = data['source'].to(device) 71 | attention_mask = data['attention_mask'].to(device) 72 | labels = data['target'].to(device).long() 73 | outputs = model(input_ids.long(), attention_mask=attention_mask, labels=labels) 74 | loss = outputs.loss 75 | loss.backward() 76 | optimizer.step() 77 | 78 | pbar.update() 79 | pbar.set_description(f'loss:{loss.item():.4f}') 80 | 81 | dev_loss = 0 82 | for data in tqdm(dev_data_loader): 83 | input_ids = data['source'].to(device) 84 | attention_mask = data['attention_mask'].to(device) 85 | labels = data['target'].to(device).long() 86 | with torch.no_grad(): 87 | outputs = model(input_ids.long(), attention_mask=attention_mask, labels=labels) 88 | dev_loss += outputs.loss.item() 89 | print('dev loss:', dev_loss / len(dev_data_loader)) 90 | print() 91 | 92 | 93 | if __name__ == '__main__': 94 | train() 95 | -------------------------------------------------------------------------------- /ptm/train_bert.py: -------------------------------------------------------------------------------- 1 | from transformers import BertForMaskedLM, BertTokenizer, \ 2 | BertPreTrainedModel, BertForSequenceClassification, BertModel 3 
| import torch 4 | import pandas as pd 5 | from utils import random_mask 6 | from torch.utils.data import DataLoader, Dataset 7 | from tqdm import tqdm 8 | 9 | path = 'E:\\ptm\\roberta' 10 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 11 | model = BertForMaskedLM.from_pretrained(path) 12 | model = model.to(device) 13 | tokenizer = BertTokenizer.from_pretrained(path) 14 | 15 | 16 | class BaseDataSet(Dataset): 17 | def __init__(self, encoding): 18 | self.encoding = encoding 19 | 20 | def __len__(self): 21 | return len(self.encoding['source']) 22 | 23 | def __getitem__(self, ids): 24 | item = {k: v[ids] for k, v in self.encoding.items()} 25 | return item 26 | 27 | 28 | def load_data(file_name, batch_size): 29 | df = pd.read_csv(file_name) 30 | encoding = tokenizer(df['text'].tolist(), 31 | return_tensors='np', 32 | truncation=True, 33 | padding='max_length', 34 | max_length=10) 35 | sources = [] 36 | targets = [] 37 | for input_ids in encoding['input_ids']: 38 | source, target = random_mask(input_ids, tokenizer) 39 | sources.append(source) 40 | targets.append(target) 41 | data = {'source': torch.Tensor(sources), 42 | 'attention_mask': encoding['attention_mask'], 43 | 'target': torch.Tensor(targets)} 44 | 45 | data_loader = DataLoader(BaseDataSet(data), batch_size=batch_size) 46 | return data_loader 47 | 48 | 49 | def train(): 50 | bs = 32 51 | train_data = load_data('../data/tnews_public/train.csv', batch_size=bs) 52 | dev_data = load_data('../data/tnews_public/dev.csv', batch_size=bs) 53 | 54 | optimizer = torch.optim.Adam(model.parameters(), lr=5e-6) 55 | 56 | for epoch in range(3): 57 | pbar = tqdm(train_data) 58 | for data in pbar: 59 | optimizer.zero_grad() 60 | 61 | input_ids = data['source'].to(device) 62 | attention_mask = data['attention_mask'].to(device) 63 | labels = data['target'].to(device) 64 | 65 | outputs = model(input_ids.long(), attention_mask, labels=labels.long()) 66 | 67 | loss = outputs.loss 68 | loss.backward() 69 | optimizer.step() 70 | 71 | pbar.update() 72 | pbar.set_description(f'loss:{loss.item():.4f}') 73 | 74 | dev_loss = 0 75 | for data in tqdm(dev_data): 76 | input_ids = data['source'].to(device) 77 | attention_mask = data['attention_mask'].to(device) 78 | labels = data['target'].to(device) 79 | with torch.no_grad(): 80 | outputs = model(input_ids.long(), attention_mask, labels=labels.long()) 81 | dev_loss += outputs.loss.item() 82 | print('dev loss:', dev_loss / len(dev_data)) 83 | 84 | torch.save(model, 'model.bin') 85 | 86 | 87 | if __name__ == '__main__': 88 | train() 89 | -------------------------------------------------------------------------------- /rank/main.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from utils import cos_sim 3 | import numpy as np 4 | import jieba 5 | from collections import Counter 6 | from rank import rank 7 | from text_representation.sentence_embedding import SentenceEmbedding 8 | 9 | model = SentenceEmbedding() 10 | data = pd.read_csv('../data/rank/qa_data.csv') 11 | question = data['question'].values 12 | embedding = model.encode(data['question'].tolist()) 13 | 14 | 15 | class BM25: 16 | def __init__(self, documents_list, k1=2, k2=1, b=0.75): 17 | self.documents_list = documents_list 18 | self.documents_number = len(documents_list) 19 | self.avg_documents_len = sum([len(document) for document in documents_list]) / self.documents_number 20 | self.f = [] 21 | self.idf = {} 22 | self.k1 = k1 23 | self.k2 = k2 24 | self.b = b 
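# Scoring used by get_score() below, for a query q and document d:
#   score(q, d) = sum over query terms t of
#       idf(t) * f(t, d) * (k1 + 1) / (f(t, d) + k1 * (1 - b + b * len(d) / avg_len))
#              * qf(t) * (k2 + 1) / (qf(t) + k2)
# where f(t, d) is the term frequency in the document, qf(t) the term frequency in the
# query, and idf(t) = log((N - df(t) + 0.5) / (df(t) + 0.5)) over the N indexed documents;
# k1 and b control term-frequency saturation and length normalization, k2 weights qf.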
25 | self.init() 26 | 27 | def init(self): 28 | df = {} 29 | for document in self.documents_list: 30 | temp = {} 31 | for word in document: 32 | temp[word] = temp.get(word, 0) + 1 33 | self.f.append(temp) 34 | for key in temp.keys(): 35 | df[key] = df.get(key, 0) + 1 36 | for key, value in df.items(): 37 | self.idf[key] = np.log((self.documents_number - value + 0.5) / (value + 0.5)) 38 | 39 | def get_score(self, index, query): 40 | score = 0.0 41 | document_len = len(self.f[index]) 42 | qf = Counter(query) 43 | for q in query: 44 | if q not in self.f[index]: 45 | continue 46 | score += self.idf[q] * (self.f[index][q] * (self.k1 + 1) / ( 47 | self.f[index][q] + self.k1 * (1 - self.b + self.b * document_len / self.avg_documents_len))) * ( 48 | qf[q] * (self.k2 + 1) / (qf[q] + self.k2)) 49 | 50 | return score 51 | 52 | def get_documents_score(self, query): 53 | query = list(jieba.cut(query)) 54 | score_list = [] 55 | for i in range(self.documents_number): 56 | score_list.append(self.get_score(i, query)) 57 | return score_list 58 | 59 | 60 | question_seg = [] 61 | for q in question: 62 | seg = list(jieba.cut(q)) 63 | question_seg.append(seg) 64 | bm = BM25(question_seg) 65 | 66 | 67 | def word_recall(text): 68 | score = bm.get_documents_score(text) 69 | index = np.argsort(-np.array(score))[:10] 70 | candidate = question[index] 71 | return candidate 72 | 73 | 74 | def embedding_recall(text): 75 | e = model.encode(text) 76 | sim = cos_sim(e, embedding)[0] 77 | index = np.argsort(-sim)[:10] 78 | candidate = question[index] 79 | return candidate 80 | 81 | 82 | if __name__ == '__main__': 83 | 84 | while 1: 85 | text = input('text:') 86 | res1 = list(embedding_recall(text)) 87 | print(res1) 88 | res2 = list(word_recall(text)) 89 | print(res2) 90 | res1.extend(res2) 91 | recall_data = list(set(res1)) 92 | rank(text, recall_data) 93 | -------------------------------------------------------------------------------- /rank/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.utils.checkpoint 4 | from typing import Optional 5 | from transformers.models.bert.modeling_bert import BertPreTrainedModel, BertModel 6 | from pytorchltr.loss import LambdaNDCGLoss1, PairwiseLogisticLoss 7 | 8 | 9 | class BertForNDCG(BertPreTrainedModel): 10 | def __init__(self, config): 11 | super().__init__(config) 12 | self.num_labels = config.num_labels 13 | self.config = config 14 | 15 | self.bert = BertModel(config) 16 | classifier_dropout = ( 17 | config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob 18 | ) 19 | self.dropout = nn.Dropout(classifier_dropout) 20 | self.dense = nn.Linear(config.hidden_size, 1) 21 | self.post_init() 22 | 23 | def forward( 24 | self, 25 | input_ids: Optional[torch.Tensor] = None, 26 | attention_mask: Optional[torch.Tensor] = None, 27 | token_type_ids: Optional[torch.Tensor] = None, 28 | position_ids: Optional[torch.Tensor] = None, 29 | head_mask: Optional[torch.Tensor] = None, 30 | inputs_embeds: Optional[torch.Tensor] = None, 31 | output_attentions: Optional[bool] = None, 32 | output_hidden_states: Optional[bool] = None, 33 | return_dict: Optional[bool] = None, 34 | num=None): 35 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 36 | 37 | outputs = self.bert( 38 | input_ids, 39 | attention_mask=attention_mask, 40 | token_type_ids=token_type_ids, 41 | position_ids=position_ids, 42 | head_mask=head_mask, 43 | 
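            # The pooled [CLS] vector below is reduced to a single relevance logit per
            # (query, document) pair by the dense head. When `num` is given, every
            # consecutive group of `num` logits is viewed as one ranked list and scored
            # with LambdaNDCGLoss1 against the fixed relevance labels 5..1; with
            # num=None the raw logits are returned, which is what rank.py relies on at
            # inference time.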
inputs_embeds=inputs_embeds, 44 | output_attentions=output_attentions, 45 | output_hidden_states=output_hidden_states, 46 | return_dict=return_dict, 47 | ) 48 | 49 | pooled_output = outputs[1] 50 | 51 | pooled_output = self.dropout(pooled_output) 52 | logits = self.dense(pooled_output) 53 | 54 | if num is None: 55 | return logits 56 | 57 | loss_fct = LambdaNDCGLoss1() 58 | 59 | score = logits.view(-1, num) 60 | batch = score.shape[0] 61 | 62 | label = torch.arange(5, 0, -1).squeeze(0).repeat(batch, 1).to(logits.device) 63 | n = torch.Tensor([num] * batch).to(logits.device) 64 | loss = loss_fct(score, label, n) 65 | 66 | loss = torch.mean(loss) 67 | return loss, score 68 | -------------------------------------------------------------------------------- /rank/rank.py: -------------------------------------------------------------------------------- 1 | from utils import get_device 2 | import torch 3 | from transformers import BertTokenizer 4 | from model import BertForNDCG 5 | import numpy as np 6 | 7 | device = get_device() 8 | model_path = 'E:\\ptm\\roberta' 9 | 10 | tokenizer = BertTokenizer.from_pretrained(model_path) 11 | model = BertForNDCG.from_pretrained(model_path) 12 | model.load_state_dict(torch.load('best_model.bin', map_location=device)) 13 | model.to(device) 14 | model = model.eval() 15 | 16 | 17 | def inference(text1, text2): 18 | encoding = tokenizer([text1] * len(text2), 19 | text2, 20 | max_length=128, 21 | truncation=True, 22 | padding=True, 23 | return_tensors='pt') 24 | 25 | with torch.no_grad(): 26 | res = model(**encoding.to(device)) 27 | logits = res.cpu().numpy().flatten() 28 | return logits 29 | 30 | 31 | def rank(query, docunment): 32 | res = inference(query, docunment) 33 | index = np.argsort(-res)[:5] 34 | print(np.array(docunment)[index]) 35 | return index 36 | 37 | -------------------------------------------------------------------------------- /rank/train_ndcg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from transformers import BertTokenizer 3 | from model import BertForNDCG 4 | import pandas as pd 5 | import torch 6 | from utils import get_device 7 | from tqdm import tqdm 8 | from torch.utils.data import DataLoader, Dataset 9 | 10 | 11 | def dcg(score): 12 | index = list(range(1, len(score[0]) + 1)) 13 | return score[:, 0] + np.sum(score[:, 1:] / np.log2(index[1:]), axis=1) 14 | 15 | 16 | def ndcg(score): 17 | if not isinstance(score, np.ndarray): 18 | score = np.array(score) 19 | if score.ndim == 1: 20 | score = score[None, :] 21 | dcg_score = dcg(score) 22 | idcg_score = dcg(np.sort(score[0][None, :])[:, ::-1]) 23 | ndcg_socre = dcg_score / idcg_score 24 | return ndcg_socre 25 | 26 | 27 | class BaseDataset(Dataset): 28 | def __init__(self, encodings, labels=None): 29 | self.encodings = encodings 30 | self.labels = labels 31 | 32 | def __getitem__(self, idx): 33 | item = {key: val[idx].clone().detach() for key, val in self.encodings.items()} 34 | if self.labels is not None: 35 | item['labels'] = torch.tensor(self.labels[idx]) 36 | return item 37 | 38 | def __len__(self): 39 | return len(self.encodings['input_ids']) 40 | 41 | 42 | batch_size = 20 43 | device = get_device() 44 | path = 'E:\\ptm\\roberta' 45 | # path = ptm_path('roberta') 46 | tokenizer = BertTokenizer.from_pretrained(path) 47 | model = BertForNDCG.from_pretrained(path).to(device) 48 | 49 | data = pd.read_csv('../data/rank/sort_data.csv') 50 | text = data['text'].tolist() 51 | all_text = [] 52 | query = [] 53 | document = [] 54 
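# sort_data.csv is assumed to come in blocks of five rows: the first row of a block is the
# query and the five rows are its candidate documents ordered from most to least relevant.
# The loop below therefore reuses every fifth text as the query for its block, which is what
# makes the fixed labels 5..1 inside BertForNDCG and the num=5 passed during training and
# evaluation meaningful.
#
# Worked example for the ndcg() helper above (this DCG variant scores rel_i / log2(i), with
# no discount at rank 1):
#   ndcg([[5, 4, 3, 2, 1]]) -> [1.0]     # ideal ordering
#   ndcg([[1, 2, 3, 4, 5]]) -> [~0.73]   # reversed ordering scores lower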
| 55 | q = '' 56 | for i, t in enumerate(text): 57 | if i % 5 == 0: 58 | q = t 59 | query.append(q) 60 | document.append(t) 61 | encoding = tokenizer(query[:-1000], document[:-1000], truncation=True, padding=True, max_length=128, 62 | return_tensors='pt') 63 | encoding_dev = tokenizer(query[-1000:], document[-1000:], truncation=True, padding=True, max_length=64, 64 | return_tensors='pt') 65 | 66 | train_loader = DataLoader(BaseDataset(encoding), batch_size=batch_size) 67 | dev_loader = DataLoader(BaseDataset(encoding_dev), batch_size=batch_size) 68 | 69 | 70 | def dev_func(): 71 | model.eval() 72 | all_ndcg = [] 73 | with torch.no_grad(): 74 | for data in tqdm(dev_loader): 75 | outputs = model(input_ids=data['input_ids'].to(device), 76 | attention_mask=data['attention_mask'].to(device), 77 | token_type_ids=data['token_type_ids'].to(device), 78 | num=5) 79 | logits = outputs[1] 80 | score = torch.argsort(logits) + 1 81 | a = ndcg(score.cpu().numpy()) 82 | all_ndcg.extend(a) 83 | ndcg_score = np.mean(all_ndcg) 84 | print('ndcg:', ndcg_score) 85 | return ndcg_score 86 | 87 | 88 | opt = torch.optim.Adam(lr=5e-5, params=model.parameters()) 89 | best_ndcg = 0 90 | for epoch in range(10): 91 | model.train() 92 | pbar = tqdm(train_loader) 93 | for data in pbar: 94 | opt.zero_grad() 95 | outputs = model(input_ids=data['input_ids'].to(device), 96 | attention_mask=data['attention_mask'].to(device), 97 | token_type_ids=data['token_type_ids'].to(device), 98 | num=5) 99 | loss, score = outputs[0], outputs[1] 100 | loss.backward() 101 | opt.step() 102 | 103 | pbar.update() 104 | pbar.set_description(f'loss:{loss.item():.4f}') 105 | 106 | cur_ndcg = dev_func() 107 | if cur_ndcg > best_ndcg: 108 | best_ndcg = cur_ndcg 109 | torch.save(model.state_dict(), 'best_model.bin') 110 | -------------------------------------------------------------------------------- /text_classification/bert.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import accuracy_score 2 | from transformers import BertForSequenceClassification, BertTokenizer 3 | from torch.utils.data import DataLoader, Dataset 4 | from tqdm import tqdm 5 | from utils import fix_seed 6 | import torch 7 | 8 | path = 'E:\\ptm\\roberta' 9 | tokenizer = BertTokenizer.from_pretrained(path) 10 | 11 | 12 | class BaseDataset(Dataset): 13 | def __init__(self, encodings, labels=None): 14 | self.encodings = encodings 15 | self.labels = labels 16 | 17 | def __getitem__(self, idx): 18 | item = {key: val[idx].clone().detach() for key, val in self.encodings.items()} 19 | if self.labels is not None: 20 | item['labels'] = torch.tensor(self.labels[idx]) 21 | return item 22 | 23 | def __len__(self): 24 | return len(self.encodings['input_ids']) 25 | 26 | 27 | def load_data(batch_size=32): 28 | train_text = [] 29 | train_label = [] 30 | with open('../data/sentiment/sentiment.train.data', encoding='utf-8')as file: 31 | for line in file.readlines(): 32 | t, l = line.strip().split('\t') 33 | train_text.append(t) 34 | train_label.append(int(l)) 35 | 36 | train_text = tokenizer(text=train_text, 37 | return_tensors='pt', 38 | truncation=True, 39 | padding=True, 40 | max_length=10) 41 | 42 | train_loader = DataLoader(BaseDataset(train_text, train_label), 43 | batch_size, 44 | pin_memory=True if torch.cuda.is_available() else False, 45 | shuffle=False) 46 | 47 | dev_text = [] 48 | dev_label = [] 49 | with open('../data/sentiment/sentiment.valid.data', encoding='utf-8')as file: 50 | for line in file.readlines(): 51 | t, l = 
line.strip().split('\t') 52 | dev_text.append(t) 53 | dev_label.append(int(l)) 54 | 55 | dev_text = tokenizer(text=dev_text, 56 | return_tensors='pt', 57 | truncation=True, 58 | padding=True, 59 | max_length=10) 60 | 61 | dev_loader = DataLoader(BaseDataset(dev_text, dev_label), 62 | batch_size, 63 | pin_memory=True if torch.cuda.is_available() else False, 64 | shuffle=False) 65 | 66 | return train_loader, dev_loader 67 | 68 | 69 | # 训练模型 70 | def train(): 71 | fix_seed() 72 | 73 | train_data_loader, dev_data_loader = load_data(128) 74 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 75 | model = BertForSequenceClassification.from_pretrained(path, num_labels=2) 76 | model = model.to(device) 77 | optimizer = torch.optim.Adam(model.parameters(), lr=5e-5) 78 | 79 | for epoch in range(5): 80 | print('epoch:', epoch + 1) 81 | pred = [] 82 | label = [] 83 | pbar = tqdm(train_data_loader) 84 | for data in pbar: 85 | optimizer.zero_grad() 86 | 87 | input_ids = data['input_ids'].to(device) 88 | attention_mask = data['attention_mask'].to(device) 89 | labels = data['labels'].to(device).long() 90 | 91 | outputs = model(input_ids, attention_mask=attention_mask, labels=labels) 92 | output = outputs.logits.argmax(1).cpu().numpy() 93 | pred.extend(output) 94 | label.extend(labels.cpu().numpy()) 95 | loss = outputs.loss 96 | loss.backward() 97 | 98 | optimizer.step() 99 | 100 | pbar.update() 101 | pbar.set_description(f'loss:{loss.item():.4f}') 102 | 103 | acc = accuracy_score(pred, label) 104 | print('train acc:', acc) 105 | 106 | pred = [] 107 | label = [] 108 | for data in tqdm(dev_data_loader): 109 | input_ids = data['input_ids'].to(device) 110 | attention_mask = data['attention_mask'].to(device) 111 | labels = data['labels'].to(device).long() 112 | with torch.no_grad(): 113 | outputs = model(input_ids, attention_mask=attention_mask, labels=labels) 114 | output = outputs.logits.argmax(1).cpu().numpy() 115 | pred.extend(output) 116 | label.extend(labels.cpu().numpy()) 117 | acc = accuracy_score(pred, label) 118 | print('dev acc:', acc) 119 | print() 120 | 121 | 122 | if __name__ == '__main__': 123 | train() 124 | -------------------------------------------------------------------------------- /text_classification/text_classification.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import jieba 4 | import numpy as np 5 | from collections import defaultdict 6 | from torch.utils.data import DataLoader, TensorDataset 7 | from sklearn.metrics import accuracy_score 8 | from utils import fix_seed 9 | 10 | 11 | class TextCLS(torch.nn.Module): 12 | # 准备我们需要用到的参数和layer 13 | def __init__(self, 14 | vocab_size, 15 | embedding_size): 16 | super().__init__() 17 | self.embedding = nn.Embedding(vocab_size, embedding_size) 18 | # [batch_size, seq_len, hidden_size] 19 | self.lstm = nn.LSTM(input_size=embedding_size, 20 | hidden_size=256, 21 | num_layers=2, 22 | batch_first=True) 23 | self.dense1 = nn.Linear(256, 100) 24 | self.dense2 = nn.Linear(100, 2) 25 | 26 | # 前向传播,那我们准备好的layer拼接在一起 27 | def forward(self, x): 28 | embedding = self.embedding(x) 29 | # [batch_size, seq_len, hidden_size] 30 | out, _ = self.lstm(embedding) 31 | out = self.dense1(out[:, -1, :]) 32 | out = self.dense2(out) 33 | return out 34 | 35 | 36 | def tokenize(string): 37 | res = list(jieba.cut(string, cut_all=False)) 38 | return res 39 | 40 | 41 | # 把数据转换成index 42 | def seq2index(seq, vocab): 43 | seg = tokenize(seq) 44 | seg_index 
= [] 45 | for s in seg: 46 | seg_index.append(vocab.get(s, 1)) 47 | return seg_index 48 | 49 | 50 | # 统一长度 51 | def padding_seq(X, max_len=10): 52 | return np.array([ 53 | np.concatenate([x, [0] * (max_len - len(x))]) if len(x) < max_len else x[:max_len] for x in X 54 | ]) 55 | 56 | 57 | def load_data(batch_size=32): 58 | train_text = [] 59 | train_label = [] 60 | with open('../data/sentiment/sentiment.train.data', encoding='utf-8')as file: 61 | for line in file.readlines(): 62 | t, l = line.strip().split('\t') 63 | train_text.append(t) 64 | train_label.append(int(l)) 65 | 66 | dev_text = [] 67 | dev_label = [] 68 | with open('../data/sentiment/sentiment.valid.data', encoding='utf-8')as file: 69 | for line in file.readlines(): 70 | t, l = line.strip().split('\t') 71 | dev_text.append(t) 72 | dev_label.append(int(l)) 73 | 74 | # 生成词典 75 | segment = [tokenize(t) for t in train_text] 76 | 77 | word_frequency = defaultdict(int) 78 | for row in segment: 79 | for i in row: 80 | word_frequency[i] += 1 81 | 82 | word_sort = sorted(word_frequency.items(), key=lambda x: x[1], reverse=True) # 根据词频降序排序 83 | 84 | vocab = {'[PAD]': 0, '[UNK]': 1} 85 | for d in word_sort: 86 | vocab[d[0]] = len(vocab) 87 | 88 | train_x = padding_seq([seq2index(t, vocab) for t in train_text]) 89 | train_y = np.array(train_label) 90 | train_data_set = TensorDataset(torch.from_numpy(train_x), 91 | torch.from_numpy(train_y)) 92 | train_data_loader = DataLoader(dataset=train_data_set, batch_size=batch_size) 93 | 94 | dev_x = padding_seq([seq2index(t, vocab) for t in dev_text]) 95 | dev_y = np.array(dev_label) 96 | dev_data_set = TensorDataset(torch.from_numpy(dev_x), 97 | torch.from_numpy(dev_y)) 98 | dev_data_loader = DataLoader(dataset=dev_data_set, batch_size=batch_size) 99 | 100 | return train_data_loader, dev_data_loader, vocab 101 | 102 | 103 | # 训练模型 104 | def train(): 105 | fix_seed() 106 | 107 | train_data_loader, dev_data_loader, vocab = load_data(128) 108 | model = TextCLS(vocab_size=len(vocab), 109 | embedding_size=100) 110 | 111 | optimizer = torch.optim.Adam(model.parameters(), lr=0.01) 112 | loss_func = nn.CrossEntropyLoss() 113 | 114 | if torch.cuda.is_available(): 115 | model = model.cuda() 116 | 117 | for epoch in range(5): 118 | print('epoch:', epoch + 1) 119 | pred = [] 120 | label = [] 121 | for step, (b_x, b_y) in enumerate(train_data_loader): 122 | if torch.cuda.is_available(): 123 | b_x = b_x.cuda().long() 124 | b_y = b_y.cuda().long() 125 | output = model(b_x) 126 | pred.extend(torch.argmax(output, dim=1).cpu().numpy()) 127 | label.extend(b_y.cpu().numpy()) 128 | loss = loss_func(output, b_y) 129 | optimizer.zero_grad() 130 | # 求解梯度 131 | loss.backward() 132 | # 更新我们的权重 133 | optimizer.step() 134 | acc = accuracy_score(pred, label) 135 | print('train acc:', acc) 136 | 137 | pred = [] 138 | label = [] 139 | for step, (b_x, b_y) in enumerate(dev_data_loader): 140 | if torch.cuda.is_available(): 141 | b_x = b_x.cuda().long() 142 | b_y = b_y.cuda().long() 143 | with torch.no_grad(): 144 | output = model(b_x) 145 | pred.extend(torch.argmax(output, dim=1).cpu().numpy()) 146 | label.extend(b_y.cpu().numpy()) 147 | acc = accuracy_score(pred, label) 148 | print('dev acc:', acc) 149 | print() 150 | 151 | 152 | if __name__ == '__main__': 153 | train() 154 | -------------------------------------------------------------------------------- /text_representation/sentence_embedding.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from tqdm 
import tqdm 4 | from transformers import BertModel, BertTokenizer 5 | 6 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 7 | 8 | 9 | class SentenceEmbedding: 10 | def __init__(self): 11 | model_path = 'E:\\ptm\\simbert' 12 | self.model = BertModel.from_pretrained(model_path) 13 | self.model.to(device) 14 | self.model.eval() 15 | self.tokenizer = BertTokenizer.from_pretrained(model_path) 16 | 17 | def encode(self, content, batch_size=256, max_length=None, padding='max_length'): 18 | outputs = None 19 | if isinstance(content, list) and len(content) > batch_size: 20 | for epoch in tqdm(range(len(content) // batch_size + 1)): 21 | batch_content = content[epoch * batch_size:(epoch + 1) * batch_size] 22 | if batch_content: 23 | output = self._embedding(batch_content, max_length, padding) 24 | if outputs is None: 25 | outputs = output 26 | else: 27 | outputs = np.concatenate([outputs, output], axis=0) 28 | return outputs 29 | else: 30 | return self._embedding(content, max_length, padding) 31 | 32 | def _embedding(self, content, max_length, padding): 33 | 34 | if max_length is None: 35 | if isinstance(content, str): 36 | max_length = len(content) + 2 37 | else: 38 | max_length = max([len(c) for c in content]) + 2 39 | max_length = min(max_length, 512) 40 | inputs = self.tokenizer(content, 41 | return_tensors="pt", 42 | truncation=True, 43 | padding=padding, 44 | max_length=max_length) 45 | with torch.no_grad(): 46 | outputs = self.model(**inputs.to(device)) 47 | output = outputs[1].cpu().numpy() 48 | 49 | return output 50 | -------------------------------------------------------------------------------- /text_representation/synonym.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gensim 3 | 4 | model = gensim.models.Word2Vec.load('word2vec/wiki.model') 5 | embedding = model.wv 6 | 7 | 8 | def cosine(a, b): 9 | return np.matmul(a, b.T) / np.linalg.norm(a) / np.linalg.norm(b, axis=-1) 10 | 11 | 12 | def search(word, topk=3): 13 | we = embedding[word] 14 | similarity = cosine(we, embedding.vectors) 15 | index = np.argsort(-similarity) 16 | w = np.array(embedding.index2word)[index[0:topk]] 17 | print(w) 18 | 19 | 20 | if __name__ == '__main__': 21 | while 1: 22 | text = input('word:') 23 | search(text) 24 | -------------------------------------------------------------------------------- /text_representation/word2vec/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/terrifyzhao/nlp_tutorial/fa5cfdf732972469bfce2c452c07bec2077ba407/text_representation/word2vec/.gitkeep -------------------------------------------------------------------------------- /text_representation/word2vec_gensim.py: -------------------------------------------------------------------------------- 1 | from gensim.models import Word2Vec 2 | from gensim.models.word2vec import LineSentence 3 | import multiprocessing 4 | 5 | input_file = 'word2vec/wiki.txt' 6 | out_file = 'word2vec/wiki.model' 7 | 8 | model = Word2Vec(LineSentence(input_file), 9 | size=100, 10 | window=5, 11 | min_count=5, 12 | workers=multiprocessing.cpu_count(), 13 | sg=1, 14 | hs=0, 15 | negative=5) 16 | 17 | model.save(out_file) 18 | -------------------------------------------------------------------------------- /text_similarity/dssm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class
DSSM(nn.Module): 6 | def __init__(self, 7 | char_vocab_size, 8 | char_dim=100, 9 | hidden_size=128): 10 | super(DSSM, self).__init__() 11 | 12 | self.char_embedding = nn.Embedding(char_vocab_size, char_dim) 13 | 14 | self.fc1 = nn.Linear(100, hidden_size) 15 | self.fc2 = nn.Linear(hidden_size, hidden_size) 16 | 17 | self.dropout = nn.Dropout(0.2) 18 | 19 | def forward(self, char_p, char_q): 20 | p_embedding = self.char_embedding(char_p.long()) 21 | q_embedding = self.char_embedding(char_q.long()) 22 | 23 | p = torch.tanh(self.fc1(p_embedding)) 24 | q = torch.tanh(self.fc1(q_embedding)) 25 | p = self.dropout(p) 26 | q = self.dropout(q) 27 | p = self.fc2(p) 28 | q = self.fc2(q) 29 | 30 | p = torch.mean(p, dim=1) 31 | q = torch.mean(q, dim=1) 32 | 33 | cosine = torch.cosine_similarity(p, q) 34 | cosine[cosine < 0] = 0 35 | 36 | return cosine 37 | -------------------------------------------------------------------------------- /text_similarity/esim.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class ESIM(nn.Module): 6 | def __init__(self, 7 | vocab_size, 8 | embedding_size=100, 9 | hidden_size=128, 10 | max_len=10): 11 | super(ESIM, self).__init__() 12 | 13 | # Word Representation Layer 14 | self.char_embedding = nn.Embedding(vocab_size, embedding_size) 15 | # self.word_embedding = nn.Embedding(word_vocab_size, word_dim) 16 | 17 | self.char_LSTM = nn.LSTM( 18 | input_size=embedding_size, 19 | hidden_size=hidden_size, 20 | num_layers=1, 21 | bidirectional=True, 22 | batch_first=True) 23 | 24 | # Context Representation Layer 25 | self.context_LSTM = nn.LSTM( 26 | input_size=hidden_size * 8, 27 | hidden_size=hidden_size, 28 | num_layers=1, 29 | bidirectional=True, 30 | batch_first=True) 31 | 32 | # ----- Prediction Layer ----- 33 | self.max_pool1 = nn.MaxPool2d((max_len, 1)) 34 | self.max_pool2 = nn.MaxPool2d((max_len, 1)) 35 | 36 | self.fc1 = nn.Linear(hidden_size * 8, hidden_size) 37 | self.fc2 = nn.Linear(hidden_size, 2) 38 | 39 | self.dropout = nn.Dropout(0.2) 40 | 41 | def forward(self, char_p, char_q): 42 | p_embedding, _ = self.char_LSTM(self.char_embedding(char_p.long())) 43 | q_embedding, _ = self.char_LSTM(self.char_embedding(char_q.long())) 44 | 45 | p_embedding = self.dropout(p_embedding) 46 | q_embedding = self.dropout(q_embedding) 47 | 48 | # attention 49 | e = torch.matmul(p_embedding, torch.transpose(q_embedding, 1, 2)) 50 | p_hat = torch.matmul(torch.softmax(e, dim=2), q_embedding) 51 | q_hat = torch.matmul(torch.transpose(torch.softmax(e, dim=1), 1, 2), p_embedding) 52 | 53 | p_cat = torch.cat([p_embedding, p_hat, p_embedding - p_hat, p_embedding * p_hat], dim=2) 54 | q_cat = torch.cat([q_embedding, q_hat, q_embedding - q_hat, q_embedding * q_hat], dim=2) 55 | 56 | p, _ = self.context_LSTM(p_cat) 57 | q, _ = self.context_LSTM(q_cat) 58 | 59 | p_max = self.max_pool1(p).squeeze(dim=1) 60 | q_max = self.max_pool2(q).squeeze(dim=1) 61 | 62 | p_mean = torch.mean(p, dim=1) 63 | q_mean = torch.mean(q, dim=1) 64 | 65 | x = torch.cat([p_max, q_max, p_mean, q_mean], dim=1) 66 | x = self.dropout(x) 67 | 68 | # ----- Prediction Layer ----- 69 | x = torch.tanh(self.fc1(x)) 70 | x = self.dropout(x) 71 | x = self.fc2(x) 72 | return x 73 | -------------------------------------------------------------------------------- /text_similarity/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from collections import 
defaultdict 4 | from torch.utils.data import DataLoader, TensorDataset 5 | from sklearn.metrics import accuracy_score 6 | from text_similarity.esim import ESIM 7 | from text_similarity.dssm import DSSM 8 | from utils import * 9 | import pandas as pd 10 | from tqdm import tqdm 11 | 12 | 13 | def load_data(batch_size=32): 14 | train = pd.read_csv('../data/LCQMC/lcqmc_train.csv') 15 | dev = pd.read_csv('../data/LCQMC/lcqmc_dev.csv') 16 | 17 | text = train['sentence1'].tolist() 18 | text.extend(train['sentence2'].tolist()) 19 | text.extend(dev['sentence1'].tolist()) 20 | text.extend(dev['sentence2'].tolist()) 21 | 22 | # 生成词典 23 | segment = [tokenize(t) for t in text] 24 | 25 | word_frequency = defaultdict(int) 26 | for row in segment: 27 | for i in row: 28 | word_frequency[i] += 1 29 | 30 | word_sort = sorted(word_frequency.items(), key=lambda x: x[1], reverse=True) # 根据词频降序排序 31 | 32 | vocab = {'[PAD]': 0, '[UNK]': 1} 33 | for d in word_sort: 34 | vocab[d[0]] = len(vocab) 35 | 36 | train_x1 = padding_seq([seq2index(t, vocab) for t in train['sentence1'].tolist()]) 37 | train_x2 = padding_seq([seq2index(t, vocab) for t in train['sentence2'].tolist()]) 38 | train_data_set = TensorDataset(torch.from_numpy(train_x1), 39 | torch.from_numpy(train_x2), 40 | torch.from_numpy(train['label'].values)) 41 | train_data_loader = DataLoader(dataset=train_data_set, batch_size=batch_size) 42 | 43 | dev_x1 = padding_seq([seq2index(t, vocab) for t in dev['sentence1'].tolist()]) 44 | dev_x2 = padding_seq([seq2index(t, vocab) for t in dev['sentence2'].tolist()]) 45 | dev_data_set = TensorDataset(torch.from_numpy(dev_x1), 46 | torch.from_numpy(dev_x2), 47 | torch.from_numpy(dev['label'].values)) 48 | dev_data_loader = DataLoader(dataset=dev_data_set, batch_size=batch_size) 49 | 50 | return train_data_loader, dev_data_loader, vocab 51 | 52 | 53 | # 训练模型 54 | def train(): 55 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 56 | train_data_loader, dev_data_loader, vocab = load_data(32) 57 | model = ESIM(vocab_size=len(vocab), 58 | embedding_size=100, 59 | hidden_size=128, 60 | max_len=10) 61 | # model = DSSM(vocab_len=len(vocab), 62 | # embedding_size=100, 63 | # hidden_size=128) 64 | model = model.to(device) 65 | optimizer = torch.optim.Adam(model.parameters(), lr=0.01) 66 | # loss_func = nn.BCELoss() 67 | loss_func = nn.CrossEntropyLoss() 68 | 69 | for epoch in range(5): 70 | pred = [] 71 | label = [] 72 | for step, (x1, x2, y) in tqdm(enumerate(train_data_loader)): 73 | x1 = x1.to(device) 74 | x2 = x2.to(device) 75 | y = y.to(device) 76 | 77 | # 前向传播 78 | output = model(x1.long(), x2.long()) 79 | loss = loss_func(output, y) 80 | optimizer.zero_grad() 81 | 82 | pred.extend(torch.argmax(output.detach().cpu(), dim=1).numpy()) 83 | label.extend(y.cpu().numpy()) 84 | 85 | # 反向传播 86 | loss.backward() 87 | # 更新我们的权重 88 | optimizer.step() 89 | acc = accuracy_score(pred, label) 90 | print('train acc:', acc) 91 | 92 | pred = [] 93 | label = [] 94 | for step, (x1, x2, y) in tqdm(enumerate(dev_data_loader)): 95 | x1 = x1.to(device) 96 | x2 = x2.to(device) 97 | y = y.to(device) 98 | with torch.no_grad(): 99 | output = model(x1.long(), x2.long()) 100 | pred.extend(torch.argmax(output.detach().cpu(), dim=1).numpy()) 101 | label.extend(y.cpu().numpy()) 102 | acc = accuracy_score(pred, label) 103 | print('dev acc:', acc) 104 | 105 | 106 | if __name__ == '__main__': 107 | train() 108 | -------------------------------------------------------------------------------- /utils.py: 
-------------------------------------------------------------------------------- 1 | import jieba 2 | import numpy as np 3 | import os 4 | import random 5 | import torch 6 | 7 | 8 | def tokenize(string): 9 | res = list(jieba.cut(string, cut_all=False)) 10 | return res 11 | 12 | 13 | # 把数据转换成index 14 | def seq2index(seq, vocab): 15 | seg = tokenize(seq) 16 | seg_index = [] 17 | for s in seg: 18 | seg_index.append(vocab.get(s, 1)) 19 | return seg_index 20 | 21 | 22 | # 统一长度 23 | def padding_seq(X, max_len=10): 24 | return np.array([ 25 | np.concatenate([x, [0] * (max_len - len(x))]) if len(x) < max_len else x[:max_len] for x in X 26 | ]) 27 | 28 | 29 | def fix_seed(seed=3407): 30 | random.seed(seed) 31 | os.environ['PYTHONHASHSEED'] = str(seed) 32 | np.random.seed(seed) 33 | torch.manual_seed(seed) 34 | torch.cuda.manual_seed(seed) 35 | torch.backends.cudnn.deterministic = True 36 | 37 | 38 | def random_mask(input_ids, tokenizer): 39 | length = len(input_ids) 40 | # 移除pad cls sep 41 | input_ids = input_ids[1:-1] 42 | prob = np.random.random(len(input_ids)) 43 | source, target = [], [] 44 | # cls:[101] 45 | source.append(101) 46 | target.append(-100) 47 | # p->[0:1] 48 | for p, ids in zip(prob, input_ids): 49 | if p < 0.15 * 0.8: 50 | source.append(tokenizer.mask_token_id) 51 | target.append(ids) 52 | elif p < 0.15 * 0.9: 53 | source.append(ids) 54 | target.append(ids) 55 | elif p < 0.15: 56 | source.append(np.random.choice(tokenizer.vocab_size)) 57 | target.append(ids) 58 | else: 59 | source.append(ids) 60 | target.append(-100) 61 | # sep:[102] 62 | source.append(102) 63 | target.append(-100) 64 | while len(source) < length: 65 | source.append(0) 66 | target.append(-100) 67 | return source, target 68 | 69 | 70 | def punctuation(): 71 | import string 72 | en_punctuation = list(string.punctuation) 73 | zh_punctuation = [',', '。', ':', '!', '?', '《', '》', '"', ';', "'"] 74 | return en_punctuation + zh_punctuation 75 | 76 | 77 | def get_device(): 78 | return torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 79 | 80 | 81 | def one_hot(x, n_class): 82 | return torch.nn.functional.one_hot(x, num_classes=n_class) 83 | 84 | 85 | def cos_sim(a, b): 86 | a = np.array(a) 87 | b = np.array(b) 88 | if len(a.shape) == 1: 89 | a = a[None, :] 90 | if len(b.shape) == 1: 91 | b = b[None, :] 92 | 93 | a_norm = a / np.linalg.norm(a, axis=-1)[:, None] 94 | b_norm = b / np.linalg.norm(b, axis=-1)[:, None] 95 | cosine = np.matmul(a_norm, b_norm.T) 96 | return cosine 97 | --------------------------------------------------------------------------------
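# Illustrative example (not from the original files): how utils.random_mask drives the
# masked-LM post-training script in ptm/. A minimal sketch; it assumes the repository root
# is on PYTHONPATH (the scripts above already import `from utils import ...` that way) and
# that the local checkpoint path 'E:\\ptm\\roberta' used throughout the repo points at a
# Chinese BERT/RoBERTa model, so substitute any compatible checkpoint. Of the ~15% of
# tokens random_mask selects, 80% become [MASK], 10% stay unchanged, 10% are replaced by a
# random token; every unselected position gets the ignore label -100 expected by
# BertForMaskedLM.
from transformers import BertTokenizer
from utils import random_mask

tokenizer = BertTokenizer.from_pretrained('E:\\ptm\\roberta')
input_ids = tokenizer('今天天气真不错')['input_ids']   # [CLS] ... [SEP]
source, target = random_mask(input_ids, tokenizer)
print(tokenizer.convert_ids_to_tokens(source))        # tokens fed to the model
print(target)                                         # label ids, -100 means "ignore"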
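# Illustrative example (not from the original files): utils.cos_sim on toy vectors.
# cos_sim L2-normalises its inputs and returns the full pairwise cosine matrix, which is
# how rank/main.py compares a query embedding against the precomputed question embeddings
# before taking the top-10 with np.argsort(-sim).
import numpy as np
from utils import cos_sim

a = np.array([1.0, 0.0])
b = np.array([[1.0, 0.0], [0.0, 1.0], [-1.0, 0.0]])
print(cos_sim(a, b))   # [[ 1.  0. -1.]], one row per vector in a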
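# Illustrative example (not from the original files): the preprocessing helpers defined in
# utils.py above and mirrored in text_classification/text_classification.py. seq2index maps
# jieba tokens to ids (1 is [UNK] for out-of-vocabulary words) and padding_seq pads or
# truncates to the fixed max_len=10 expected by the LSTM/ESIM models. The toy vocabulary
# below is made up for the demo.
from utils import seq2index, padding_seq

vocab = {'[PAD]': 0, '[UNK]': 1, '我': 2, '喜欢': 3, '自然语言': 4, '处理': 5}
ids = [seq2index(t, vocab) for t in ['我喜欢自然语言处理', '今天天气真不错']]
print(ids)              # e.g. [[2, 3, 4, 5], [1, 1, ...]], depending on jieba's segmentation
print(padding_seq(ids)) # 2 x 10 array, zero-padded on the right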
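# Illustrative example (not from the original files): shape check for the two similarity
# models, assuming the repository root is on PYTHONPATH as text_similarity/train.py already
# requires. Note that DSSM's constructor takes char_vocab_size / char_dim / hidden_size, so
# the commented-out call in train.py (vocab_len=..., embedding_size=...) would need those
# keyword names if re-enabled.
import torch
from text_similarity.esim import ESIM
from text_similarity.dssm import DSSM

p = torch.randint(0, 100, (4, 10))   # batch of 4 sequences padded to max_len=10
q = torch.randint(0, 100, (4, 10))

esim = ESIM(vocab_size=100, embedding_size=100, hidden_size=128, max_len=10)
print(esim(p, q).shape)              # torch.Size([4, 2]), logits for CrossEntropyLoss

dssm = DSSM(char_vocab_size=100, char_dim=100, hidden_size=128)
print(dssm(p, q).shape)              # torch.Size([4]), cosine clipped to [0, 1]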