├── .gitignore ├── LICENSE ├── README.md ├── adversarial_training ├── adversarial.py ├── bert_at.py └── lstm_at.py ├── data ├── LCQMC │ ├── lcqmc_dev.csv │ └── lcqmc_train.csv ├── rank │ ├── qa_data.csv │ └── sort_data.csv ├── sentiment │ ├── sentiment.test.data │ ├── sentiment.train.data │ └── sentiment.valid.data └── tnews_public │ ├── dev.csv │ └── train.csv ├── data_augmentation ├── bert_mixup.py ├── data_augmentation.py ├── deepl.py ├── feature.py └── feature_augmentation.py ├── distillation ├── distillation_student.py ├── train_student.py └── train_teacher.py ├── elmoformanylangs ├── __init__.py ├── __main__.py ├── biLM.py ├── configs │ ├── cnn_0_100_512_4096_sample.json │ └── cnn_50_100_512_4096_sample.json ├── dataloader.py ├── elmo.py ├── frontend.py ├── main.py ├── modules │ ├── __init__.py │ ├── classify_layer.py │ ├── elmo.py │ ├── embedding_layer.py │ ├── encoder_base.py │ ├── highway.py │ ├── lstm.py │ ├── lstm_cell_with_projection.py │ ├── token_embedder.py │ └── util.py └── utils.py ├── gpt ├── chat.py ├── chitchat │ ├── __init__.py │ ├── config │ │ └── model_config_dialogue_small.json │ ├── data │ │ └── .gitkeep │ ├── dataset.py │ ├── generate_dialogue_subset.py │ ├── interact.py │ ├── interact_mmi.py │ ├── train.py │ └── vocabulary │ │ └── vocab_small.txt └── gpt_lyric.py ├── pseudo ├── first_stage.py └── second_stage.py ├── ptm ├── .gitkeep ├── post train_bert.py ├── post train_gpt.py └── train_bert.py ├── rank ├── main.py ├── model.py ├── rank.py └── train_ndcg.py ├── text_classification ├── bert.py └── text_classification.py ├── text_representation ├── sentence_embedding.py ├── synonym.py ├── word2vec │ └── .gitkeep └── word2vec_gensim.py ├── text_similarity ├── dssm.py ├── esim.py └── train.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | .idea 132 | text_representation/word2vec/wiki.model 133 | text_representation/word2vec/wiki.model.trainables.syn1neg.npy 134 | text_representation/word2vec/wiki.model.wv.vectors.npy 135 | text_representation/word2vec/wiki.txt 136 | 137 | 138 | ptm/elmo 139 | 140 | gpt/chitchat/dialogue_model 141 | distillation/model.bin 142 | distillation/student.bin 143 | distillation/teacher.bin 144 | pseudo/pseudo.csv 145 | rank/best_model.bin 146 | rank/test.txt -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # nlp_tutorial 2 | 3 | ### 文本表示 4 | 5 | 1、词向量训练数据 6 | 链接:https://pan.baidu.com/s/1cWt2qqH5ym0vLcjihehfGg 提取码:z6dt 7 | 8 | 2、中文版本的ELMo模型 9 | https://pan.baidu.com/s/1RNKnj6hgL-2orQ7f38CauA 10 | 11 | 3、哈工大开源的多语言ELMo源码 12 | https://github.com/HIT-SCIR/ELMoForManyLangs 13 | 14 | 15 | ### 文本匹配模型 16 | 17 | 我自己整理的各种匹配模型 18 | https://blog.csdn.net/u012526436/article/details/90179466 19 | 20 | ### huggingface - transformers 21 | 22 | 模型仓库 23 | https://huggingface.co/models 24 | 25 | transformers 26 | https://github.com/huggingface/transformers 27 | 28 | ### gpt 29 | 30 | 闲聊 31 | https://github.com/yangjianxin1/GPT2-chitchat 32 | -------------------------------------------------------------------------------- /adversarial_training/adversarial.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class AT: 5 | def __init__(self, model): 6 | self.model = model 7 | self.backup = {} 8 | 9 | def attack(self, emb_name='emb.'): 10 | """ 11 | 备份embedding matrix 并添加我们的扰动项 12 | :param emb_name: embedding层的名字 13 | """ 14 | raise NotImplemented 15 | 16 | def restore(self, emb_name='emb.'): 17 | """ 18 | 把embedding matrix的参数恢复 19 | :param emb_name: embedding层的名字 20 | """ 21 | for name, param in self.model.named_parameters(): 22 | if param.requires_grad and emb_name in name: 23 | assert name in self.backup 24 | param.data = self.backup[name] 25 | self.backup = {} 26 | 27 | 28 | class FGM(AT): 29 | def attack(self, epsilon=1., emb_name='emb.'): 30 | for name, param in self.model.named_parameters(): 31 | if param.requires_grad and emb_name in name: 32 | self.backup[name] = param.data.clone() 33 | norm = torch.norm(param.grad) 34 | if norm != 0 and not torch.isnan(norm): 35 | r_at = epsilon * param.grad / norm 36 | param.data.add_(r_at) 37 | 38 | 39 | class FGSM(AT): 40 | def attack(self, epsilon=1., emb_name='emb.'): 41 | for name, param in self.model.named_parameters(): 42 | if param.requires_grad and emb_name in name: 43 | self.backup[name] = param.data.clone() 44 | norm = torch.norm(param.grad) 45 | if norm != 0 and not torch.isnan(norm): 46 | r_at = epsilon * torch.sign(param.grad) 47 | param.data.add_(r_at) 48 | 49 | 50 | class FreeAT(AT): 51 | 52 | def __init__(self, model): 53 | super().__init__(model) 54 | self.grad_backup = {} 55 | 56 | def attack(self, epsilon=0.3, alpha=0.01, emb_name='emb.', first_attack=False): 57 | for name, param in self.model.named_parameters(): 58 | if param.requires_grad and emb_name in name: 59 | if first_attack: 60 | self.backup[name] = param.data.clone() 61 | norm = torch.norm(param.grad) 62 | if norm != 0 and not torch.isnan(norm): 63 | # 得到新的扰动 64 | r_at = alpha * param.grad / norm 65 | r_at = torch.clamp(r_at, - epsilon, epsilon) 66 | # 加到输入上 67 | param.data.add_(r_at) 68 | 69 | def backup_grad(self): 70 | for name, param in self.model.named_parameters(): 71 | if param.requires_grad: 72 | self.grad_backup[name] = param.grad.clone() 73 | 74 | def restore_grad(self): 75 | for name, param in self.model.named_parameters(): 76 | if param.requires_grad: 77 | param.grad = self.grad_backup[name] 78 | 79 | 80 | class FreeLB(AT): 81 | 82 | def __init__(self, model): 83 | super().__init__(model) 84 | self.grad_backup = {} 85 | 86 | def attack(self, epsilon=0.01, alpha=5e-3, emb_name='emb.', first_attack=False): 87 | for name, param in 
self.model.named_parameters(): 88 | if param.requires_grad and emb_name in name: 89 | if first_attack: 90 | r_at = torch.Tensor(1).uniform_(-epsilon, epsilon) 91 | else: 92 | norm = torch.norm(param.grad) 93 | if norm != 0 and not torch.isnan(norm): 94 | r_at = alpha * param.grad / norm 95 | r_at = torch.clamp(r_at, - epsilon, epsilon) 96 | param.data.add_(r_at) 97 | 98 | def backup_grad(self): 99 | for name, param in self.model.named_parameters(): 100 | if param.requires_grad: 101 | self.grad_backup[name] = param.grad.clone() 102 | 103 | def restore_grad(self): 104 | for name, param in self.model.named_parameters(): 105 | if param.requires_grad: 106 | param.grad = self.grad_backup[name] 107 | -------------------------------------------------------------------------------- /adversarial_training/bert_at.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import accuracy_score 2 | from transformers import BertForSequenceClassification, BertTokenizer 3 | from torch.utils.data import DataLoader, Dataset 4 | from tqdm import tqdm 5 | from adversarial_training.adversarial import * 6 | from utils import fix_seed 7 | 8 | tokenizer = BertTokenizer.from_pretrained('E:\\ptm\\roberta') 9 | 10 | 11 | class BaseDataset(Dataset): 12 | def __init__(self, encodings, labels=None): 13 | self.encodings = encodings 14 | self.labels = labels 15 | 16 | def __getitem__(self, idx): 17 | item = {key: val[idx].clone().detach() for key, val in self.encodings.items()} 18 | if self.labels is not None: 19 | item['labels'] = torch.tensor(self.labels[idx]) 20 | return item 21 | 22 | def __len__(self): 23 | return len(self.encodings['input_ids']) 24 | 25 | 26 | def load_data(batch_size=32): 27 | train_text = [] 28 | train_label = [] 29 | with open('../data/sentiment/sentiment.train.data', encoding='utf-8')as file: 30 | for line in file.readlines(): 31 | t, l = line.strip().split('\t') 32 | train_text.append(t) 33 | train_label.append(int(l)) 34 | 35 | train_text = tokenizer(text=train_text, 36 | return_tensors='pt', 37 | truncation=True, 38 | padding=True, 39 | max_length=10) 40 | 41 | train_loader = DataLoader(BaseDataset(train_text, train_label), 42 | batch_size, 43 | pin_memory=True if torch.cuda.is_available() else False, 44 | shuffle=False) 45 | 46 | dev_text = [] 47 | dev_label = [] 48 | with open('../data/sentiment/sentiment.valid.data', encoding='utf-8')as file: 49 | for line in file.readlines(): 50 | t, l = line.strip().split('\t') 51 | dev_text.append(t) 52 | dev_label.append(int(l)) 53 | 54 | dev_text = tokenizer(text=dev_text, 55 | return_tensors='pt', 56 | truncation=True, 57 | padding=True, 58 | max_length=10) 59 | 60 | dev_loader = DataLoader(BaseDataset(dev_text, dev_label), 61 | batch_size, 62 | pin_memory=True if torch.cuda.is_available() else False, 63 | shuffle=False) 64 | 65 | return train_loader, dev_loader 66 | 67 | 68 | # 训练模型 69 | def train(): 70 | fix_seed() 71 | 72 | train_data_loader, dev_data_loader = load_data(32) 73 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 74 | model = BertForSequenceClassification.from_pretrained('E:\\ptm\\roberta', num_labels=2) 75 | model = model.to(device) 76 | optimizer = torch.optim.Adam(model.parameters(), lr=5e-5) 77 | 78 | attack = True 79 | 80 | if attack: 81 | # at = FGM(model) 82 | at = FreeAT(model) 83 | 84 | def adversarial(data): 85 | optimizer.zero_grad() 86 | # 添加扰动 87 | at.attack(emb_name='embeddings.word_embeddings.weight') 88 | # 重新计算梯度 89 | adv_loss = 
model(input_ids=data['input_ids'].to(device), 90 | attention_mask=data['attention_mask'].to(device), 91 | labels=data['labels'].to(device)).loss 92 | # bp得到新的梯度 93 | adv_loss.backward() 94 | at.restore(emb_name='embeddings.word_embeddings.weight') 95 | 96 | def adversarial_free(data, m=3): 97 | # 备份梯度 98 | at.backup_grad() 99 | for i in range(m): 100 | at.attack(emb_name='embeddings.word_embeddings.weight', first_attack=i == 0) 101 | if i == 0: 102 | optimizer.zero_grad() 103 | else: 104 | at.restore_grad() 105 | # fp 106 | adv_loss = model(input_ids=data['input_ids'].to(device), 107 | attention_mask=data['attention_mask'].to(device), 108 | labels=data['labels'].to(device)).loss 109 | # bp得到新的梯度 110 | adv_loss.backward() 111 | at.restore(emb_name='embeddings.word_embeddings.weight') 112 | 113 | for epoch in range(5): 114 | print('epoch:', epoch + 1) 115 | pred = [] 116 | label = [] 117 | pbar = tqdm(train_data_loader) 118 | for data in pbar: 119 | optimizer.zero_grad() 120 | 121 | input_ids = data['input_ids'].to(device) 122 | attention_mask = data['attention_mask'].to(device) 123 | labels = data['labels'].to(device).long() 124 | 125 | outputs = model(input_ids, attention_mask=attention_mask, labels=labels) 126 | output = outputs.logits.argmax(1).cpu().numpy() 127 | pred.extend(output) 128 | label.extend(labels.cpu().numpy()) 129 | loss = outputs.loss 130 | loss.backward() 131 | 132 | if attack: 133 | # adversarial(data) 134 | adversarial_free(data) 135 | 136 | optimizer.step() 137 | 138 | pbar.update() 139 | pbar.set_description(f'loss:{loss.item():.4f}') 140 | 141 | acc = accuracy_score(pred, label) 142 | print('train acc:', acc) 143 | 144 | pred = [] 145 | label = [] 146 | for data in tqdm(dev_data_loader): 147 | input_ids = data['input_ids'].to(device) 148 | attention_mask = data['attention_mask'].to(device) 149 | labels = data['labels'].to(device).long() 150 | with torch.no_grad(): 151 | outputs = model(input_ids, attention_mask=attention_mask, labels=labels) 152 | output = outputs.logits.argmax(1).cpu().numpy() 153 | pred.extend(output) 154 | label.extend(labels.cpu().numpy()) 155 | acc = accuracy_score(pred, label) 156 | print('dev acc:', acc) 157 | print() 158 | 159 | 160 | if __name__ == '__main__': 161 | train() 162 | -------------------------------------------------------------------------------- /adversarial_training/lstm_at.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import jieba 4 | import numpy as np 5 | from collections import defaultdict 6 | from torch.utils.data import DataLoader, TensorDataset 7 | from sklearn.metrics import accuracy_score 8 | 9 | 10 | class TextCLS(torch.nn.Module): 11 | # 准备我们需要用到的参数和layer 12 | def __init__(self, 13 | vocab_size, 14 | embedding_size): 15 | super().__init__() 16 | self.embedding = nn.Embedding(vocab_size, embedding_size) 17 | # [batch_size, seq_len, hidden_size] 18 | self.lstm = nn.LSTM(input_size=embedding_size, 19 | hidden_size=256, 20 | num_layers=2, 21 | batch_first=True) 22 | self.dense1 = nn.Linear(256, 100) 23 | self.dense2 = nn.Linear(100, 2) 24 | 25 | # 前向传播,那我们准备好的layer拼接在一起 26 | def forward(self, x): 27 | embedding = self.embedding(x) 28 | # [batch_size, seq_len, hidden_size] 29 | out, _ = self.lstm(embedding) 30 | out = self.dense1(out[:, -1, :]) 31 | out = self.dense2(out) 32 | return out 33 | 34 | 35 | def tokenize(string): 36 | res = list(jieba.cut(string, cut_all=False)) 37 | return res 38 | 39 | 40 | # 把数据转换成index 41 | def 
seq2index(seq, vocab): 42 | seg = tokenize(seq) 43 | seg_index = [] 44 | for s in seg: 45 | seg_index.append(vocab.get(s, 1)) 46 | return seg_index 47 | 48 | 49 | # 统一长度 50 | def padding_seq(X, max_len=10): 51 | return np.array([ 52 | np.concatenate([x, [0] * (max_len - len(x))]) if len(x) < max_len else x[:max_len] for x in X 53 | ]) 54 | 55 | 56 | def load_data(batch_size=32): 57 | train_text = [] 58 | train_label = [] 59 | with open('../data/sentiment/sentiment.train.data', encoding='utf-8')as file: 60 | for line in file.readlines(): 61 | t, l = line.strip().split('\t') 62 | train_text.append(t) 63 | train_label.append(int(l)) 64 | 65 | dev_text = [] 66 | dev_label = [] 67 | with open('../data/sentiment/sentiment.valid.data', encoding='utf-8')as file: 68 | for line in file.readlines(): 69 | t, l = line.strip().split('\t') 70 | dev_text.append(t) 71 | dev_label.append(int(l)) 72 | 73 | # 生成词典 74 | segment = [tokenize(t) for t in train_text] 75 | 76 | word_frequency = defaultdict(int) 77 | for row in segment: 78 | for i in row: 79 | word_frequency[i] += 1 80 | 81 | word_sort = sorted(word_frequency.items(), key=lambda x: x[1], reverse=True) # 根据词频降序排序 82 | 83 | vocab = {'[PAD]': 0, '[UNK]': 1} 84 | for d in word_sort: 85 | vocab[d[0]] = len(vocab) 86 | 87 | train_x = padding_seq([seq2index(t, vocab) for t in train_text]) 88 | train_y = np.array(train_label) 89 | train_data_set = TensorDataset(torch.from_numpy(train_x), 90 | torch.from_numpy(train_y)) 91 | train_data_loader = DataLoader(dataset=train_data_set, batch_size=batch_size) 92 | 93 | dev_x = padding_seq([seq2index(t, vocab) for t in dev_text]) 94 | dev_y = np.array(dev_label) 95 | dev_data_set = TensorDataset(torch.from_numpy(dev_x), 96 | torch.from_numpy(dev_y)) 97 | dev_data_loader = DataLoader(dataset=dev_data_set, batch_size=batch_size) 98 | 99 | return train_data_loader, dev_data_loader, vocab 100 | 101 | 102 | # 训练模型 103 | def train(): 104 | train_data_loader, dev_data_loader, vocab = load_data(128) 105 | model = TextCLS(vocab_size=len(vocab), 106 | embedding_size=100) 107 | 108 | optimizer = torch.optim.Adam(model.parameters(), lr=0.01) 109 | loss_func = nn.CrossEntropyLoss() 110 | 111 | if torch.cuda.is_available(): 112 | model = model.cuda() 113 | 114 | backup = {} 115 | epsilon = 1 116 | attack = False 117 | 118 | for epoch in range(5): 119 | print('epoch:', epoch + 1) 120 | pred = [] 121 | label = [] 122 | for step, (b_x, b_y) in enumerate(train_data_loader): 123 | optimizer.zero_grad() 124 | if torch.cuda.is_available(): 125 | b_x = b_x.cuda().long() 126 | b_y = b_y.cuda().long() 127 | output = model(b_x) 128 | pred.extend(torch.argmax(output, dim=1).cpu().numpy()) 129 | label.extend(b_y.cpu().numpy()) 130 | loss = loss_func(output, b_y) 131 | loss.backward() 132 | 133 | if attack: 134 | # 备份参数,添加扰动 135 | for name, param in model.embedding.named_parameters(): 136 | backup[name] = param.data.clone() 137 | norm = torch.norm(param.grad) 138 | if norm != 0 and not torch.isnan(norm): 139 | r_at = epsilon * param.grad / norm 140 | param.data.add_(r_at) 141 | 142 | # 第二次fp与bp 143 | optimizer.zero_grad() 144 | output = model(b_x) 145 | loss = loss_func(output, b_y) 146 | loss.backward() 147 | 148 | # 恢复参数 149 | for name, param in model.embedding.named_parameters(): 150 | param.data = backup[name] 151 | backup = {} 152 | 153 | # 更新权重 154 | optimizer.step() 155 | acc = accuracy_score(pred, label) 156 | print('train acc:', acc) 157 | 158 | pred = [] 159 | label = [] 160 | for step, (b_x, b_y) in enumerate(dev_data_loader): 161 | 
if torch.cuda.is_available(): 162 | b_x = b_x.cuda().long() 163 | b_y = b_y.cuda().long() 164 | with torch.no_grad(): 165 | output = model(b_x) 166 | pred.extend(torch.argmax(output, dim=1).cpu().numpy()) 167 | label.extend(b_y.cpu().numpy()) 168 | acc = accuracy_score(pred, label) 169 | print('dev acc:', acc) 170 | print() 171 | 172 | 173 | if __name__ == '__main__': 174 | train() 175 | -------------------------------------------------------------------------------- /data_augmentation/bert_mixup.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import accuracy_score 2 | from transformers import BertForSequenceClassification, BertTokenizer 3 | from torch.utils.data import DataLoader, Dataset 4 | from tqdm import tqdm 5 | from utils import fix_seed 6 | import torch 7 | import pandas as pd 8 | from feature_augmentation import MixUp 9 | 10 | path = 'E:\\ptm\\roberta' 11 | tokenizer = BertTokenizer.from_pretrained(path) 12 | 13 | 14 | class BaseDataset(Dataset): 15 | def __init__(self, encodings, labels=None): 16 | self.encodings = encodings 17 | self.labels = labels 18 | 19 | def __getitem__(self, idx): 20 | item = {key: val[idx].clone().detach() for key, val in self.encodings.items()} 21 | if self.labels is not None: 22 | item['labels'] = torch.tensor(self.labels[idx]) 23 | return item 24 | 25 | def __len__(self): 26 | return len(self.encodings['input_ids']) 27 | 28 | 29 | def load_data(batch_size=32): 30 | train_df = pd.read_csv('../data/tnews_public/train.csv') 31 | train_text = train_df['text'].tolist() 32 | train_label = train_df['label'].tolist() 33 | train_text = tokenizer(text=train_text, 34 | return_tensors='pt', 35 | truncation=True, 36 | padding=True, 37 | max_length=32) 38 | train_loader = DataLoader(BaseDataset(train_text, train_label), 39 | batch_size, 40 | pin_memory=True if torch.cuda.is_available() else False, 41 | shuffle=False) 42 | 43 | dev_df = pd.read_csv('../data/tnews_public/dev.csv') 44 | dev_text = dev_df['text'].tolist() 45 | dev_label = dev_df['label'].tolist() 46 | dev_text = tokenizer(text=dev_text, 47 | return_tensors='pt', 48 | truncation=True, 49 | padding=True, 50 | max_length=32) 51 | 52 | dev_loader = DataLoader(BaseDataset(dev_text, dev_label), 53 | batch_size, 54 | pin_memory=True if torch.cuda.is_available() else False, 55 | shuffle=False) 56 | 57 | return train_loader, dev_loader 58 | 59 | 60 | # 训练模型 61 | def train(): 62 | fix_seed() 63 | 64 | train_data_loader, dev_data_loader = load_data(128) 65 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 66 | 67 | model = BertForSequenceClassification.from_pretrained(path, num_labels=4) 68 | model = model.to(device) 69 | optimizer = torch.optim.Adam(model.parameters(), lr=5e-5) 70 | 71 | mixup = MixUp(model, tokenizer, 4) 72 | 73 | best_acc = 0 74 | for epoch in range(5): 75 | print('epoch:', epoch + 1) 76 | pred = [] 77 | label = [] 78 | pbar = tqdm(train_data_loader) 79 | for data in pbar: 80 | # zero_grad,backward之后梯度都会进行累加 81 | optimizer.zero_grad() 82 | 83 | input_ids = data['input_ids'].to(device) 84 | attention_mask = data['attention_mask'].to(device) 85 | labels = data['labels'].to(device).long() 86 | 87 | outputs = model(input_ids, attention_mask=attention_mask, labels=labels) 88 | output = outputs.logits.argmax(1).cpu().numpy() 89 | pred.extend(output) 90 | label.extend(labels.cpu().numpy()) 91 | loss = outputs.loss / 2 92 | loss.backward() 93 | 94 | mix_loss = mixup.augmentation(data) / 2 95 | 
mix_loss.backward() 96 | 97 | optimizer.step() 98 | 99 | pbar.update() 100 | pbar.set_description(f'loss:{loss.item():.4f}') 101 | 102 | acc = accuracy_score(pred, label) 103 | print('train acc:', acc) 104 | 105 | pred = [] 106 | label = [] 107 | for data in tqdm(dev_data_loader): 108 | input_ids = data['input_ids'].to(device) 109 | attention_mask = data['attention_mask'].to(device) 110 | labels = data['labels'].to(device).long() 111 | with torch.no_grad(): 112 | outputs = model(input_ids, attention_mask=attention_mask, labels=labels) 113 | output = outputs.logits.argmax(1).cpu().numpy() 114 | pred.extend(output) 115 | label.extend(labels.cpu().numpy()) 116 | acc = accuracy_score(pred, label) 117 | print('dev acc:', acc) 118 | print() 119 | if acc > best_acc: 120 | torch.save(model.state_dict(), 'teacher.bin') 121 | best_acc = acc 122 | 123 | 124 | if __name__ == '__main__': 125 | train() 126 | -------------------------------------------------------------------------------- /data_augmentation/data_augmentation.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | from transformers import BertTokenizer, BertForMaskedLM 4 | from utils import get_device, punctuation 5 | import jieba 6 | 7 | 8 | class EDA: 9 | """ 10 | 替换同意词、插入同意词、交换词的顺序、删除词 11 | """ 12 | 13 | def __init__(self): 14 | import synonyms 15 | self.synonyms = synonyms 16 | self.stop_words = synonyms.synonyms._stopwords 17 | self.word_dic = {} 18 | 19 | def augmentation(self, text, rate=0.3): 20 | 21 | replace_text = self.replace(text, rate=rate) 22 | insert_text = self.insert(text, rate=rate) 23 | swap_text = self.swap(text) 24 | delete_text = self.delete(text, rate=rate) 25 | all_text = list({text, replace_text, insert_text, swap_text, delete_text}) 26 | return all_text 27 | 28 | def replace(self, text, rate=0.2): 29 | segment = list(jieba.cut(text)) 30 | words_index = [] 31 | for i, s in enumerate(segment): 32 | if s not in self.stop_words: 33 | words_index.append(i) 34 | if not words_index: 35 | return text 36 | num = max(1, round(len(words_index) * rate)) 37 | index = [random.choice(words_index) for _ in range(num)] 38 | 39 | for i in index: 40 | try: 41 | if segment[i] in self.word_dic.keys(): 42 | segment[i] = self.word_dic[segment[i]] 43 | else: 44 | new_word = self.synonyms.nearby(segment[i])[0][1] 45 | segment[i] = new_word 46 | except: 47 | pass 48 | 49 | return ''.join(segment) 50 | 51 | def insert(self, text, rate=0.2): 52 | segment = list(jieba.cut(text)) 53 | words_index = [] 54 | for i, s in enumerate(segment): 55 | if s not in self.stop_words: 56 | words_index.append(i) 57 | if not words_index: 58 | return text 59 | num = max(1, round(len(words_index) * rate)) 60 | index = [random.choice(words_index) for _ in range(num)] 61 | 62 | for i in index: 63 | try: 64 | if segment[i] in self.word_dic.keys(): 65 | segment[i] = self.word_dic[segment[i]] 66 | else: 67 | new_word = self.synonyms.nearby(segment[i])[0][1] 68 | segment[i] = new_word 69 | except: 70 | pass 71 | 72 | return ''.join(segment) 73 | 74 | def swap(self, text): 75 | segment = list(jieba.cut(text)) 76 | if len(segment) <= 2: 77 | return text 78 | 79 | choice_word = [1, 1] 80 | while choice_word[0] == choice_word[1]: 81 | choice_word = random.choices(segment, k=2) 82 | 83 | segment[segment.index(choice_word[0])] = choice_word[1] 84 | segment[segment.index(choice_word[1])] = choice_word[0] 85 | 86 | return ''.join(segment) 87 | 88 | def delete(self, text, rate=0.2): 89 | segment = 
list(jieba.cut(text)) 90 | for i in range(len(segment)): 91 | if random.random() < rate: 92 | segment[i] = '' 93 | return ''.join(segment) 94 | 95 | 96 | class AEDA: 97 | """ 98 | 随机添加标点 99 | https://arxiv.org/pdf/2108.13230.pdf 100 | """ 101 | 102 | def __init__(self): 103 | self.punctuation = punctuation() 104 | 105 | def augmentation(self, text): 106 | length = int(len(text) * 0.3) 107 | if length < 2: 108 | return text 109 | punc_len = random.randint(1, length) 110 | puncs = random.choices(self.punctuation, k=punc_len) 111 | text = list(text) 112 | for p in puncs: 113 | text.insert(random.randint(0, len(text) - 1), p) 114 | return ''.join(text) 115 | 116 | 117 | class BackTranslation: 118 | """ 119 | 回译 120 | """ 121 | 122 | def __init__(self): 123 | from deepl import DeepL 124 | self.deep = DeepL() 125 | 126 | def augmentation(self, text): 127 | english = self.deep.translate('zh', 'en', text) 128 | translate = self.deep.translate('en', 'zh', english) 129 | return translate 130 | 131 | 132 | class WoTokenizer(BertTokenizer): 133 | def __init__(self, pre_tokenizer=lambda x: jieba.cut(x, HMM=False), *args, **kwargs): 134 | super().__init__(*args, **kwargs) 135 | self.pre_tokenizer = pre_tokenizer 136 | 137 | def _tokenize(self, text, *arg, **kwargs): 138 | split_tokens = [] 139 | for word in self.pre_tokenizer(text): 140 | if word in self.vocab: 141 | split_tokens.append(word) 142 | else: 143 | split_tokens.extend(super()._tokenize(word)) 144 | return split_tokens 145 | 146 | 147 | class LMAug: 148 | """ 149 | 基于mlm的数据增强,这里使用了wobert,对词mask 150 | """ 151 | 152 | def __init__(self): 153 | model_path = 'E:\\ptm\\wobert' 154 | self.tokenizer = WoTokenizer.from_pretrained(model_path) 155 | self.model = BertForMaskedLM.from_pretrained(model_path).eval().to(get_device()) 156 | 157 | def augmentation(self, text, topk=3): 158 | input_ids = self.tokenizer(text, return_tensors='pt')['input_ids'][0] 159 | random_index = random.randint(1, len(input_ids) - 2) 160 | input_ids[random_index] = 103 161 | # mask_text = ''.join(segment) 162 | # 163 | # input_ids = self.tokenizer(mask_text, return_tensors='pt', max_length=512)['input_ids'].to(get_device()) 164 | mask_index = [i for i, d in enumerate(input_ids) if d == 103] 165 | mask = input_ids == 103 166 | 167 | res = self.model(input_ids[None, :].to(get_device())).logits[0][mask] 168 | sort_res = torch.argsort(res, dim=1, descending=True) 169 | index = sort_res[:, 0:topk] 170 | 171 | out_text = [] 172 | for idx in index.T: 173 | new_input_ids = input_ids 174 | for i, m_idx in zip(idx, mask_index): 175 | new_input_ids[m_idx] = i 176 | text = self.tokenizer.convert_ids_to_tokens(new_input_ids) 177 | text = ''.join(text[1:-1]).replace('#', '') 178 | out_text.append(text) 179 | return out_text 180 | 181 | 182 | class Augmentation: 183 | """ 184 | 数据增强 185 | 前4条是EDA,第5条是AEDA,第6条是回译,第7-9条是MLM,最后一条是GPT 186 | """ 187 | 188 | def __init__(self, use_br=False, aug_list=None): 189 | if aug_list is None: 190 | aug_list = [ 191 | EDA(), 192 | AEDA(), 193 | LMAug() 194 | ] 195 | if use_br: 196 | aug_list.append(BackTranslation()) 197 | self.aug_list = aug_list 198 | 199 | def augmentation(self, text): 200 | aug_text = [] 201 | for aug in self.aug_list: 202 | text_res = aug.augmentation(text) 203 | if isinstance(text_res, str): 204 | aug_text.append(text_res) 205 | elif isinstance(text_res, list): 206 | aug_text.extend(text_res) 207 | 208 | return aug_text 209 | 210 | 211 | if __name__ == '__main__': 212 | # print(EDA().augmentation('今天天气真好啊')) 213 | # 
print(AEDA().augmentation('今天天气真好啊')) 214 | # print(BackTranslation().augmentation('今天天气真好啊')) 215 | print(LMAug().augmentation('今天天气真好啊', topk=5)) 216 | -------------------------------------------------------------------------------- /data_augmentation/deepl.py: -------------------------------------------------------------------------------- 1 | from webdriver_manager.chrome import ChromeDriverManager 2 | from selenium import webdriver 3 | from selenium.webdriver.chrome.options import Options 4 | import time 5 | from bs4 import BeautifulSoup 6 | import urllib.parse 7 | 8 | 9 | class DeepL: 10 | def __init__(self): 11 | options = Options() 12 | options.add_argument('--headless') 13 | 14 | self.driver = webdriver.Chrome(ChromeDriverManager().install(), options=options) 15 | 16 | def translate(self, from_lang: str, to_lang: str, from_text: str) -> str: 17 | sleep_time = 1 18 | from_text = urllib.parse.quote(from_text) 19 | url = 'https://www.deepl.com/translator#' \ 20 | + from_lang + '/' + to_lang + '/' + from_text 21 | self.driver.get(url) 22 | self.driver.implicitly_wait(10) 23 | to_text = None 24 | for i in range(30): 25 | time.sleep(sleep_time) 26 | html = self.driver.page_source 27 | to_text = self.get_text_from_page_source(html) 28 | 29 | if to_text: 30 | break 31 | return to_text 32 | 33 | def get_text_from_page_source(self, html: str) -> str: 34 | soup = BeautifulSoup(html, features='html.parser') 35 | target_elem = soup.find(class_="lmt__translations_as_text__text_btn") 36 | text = None 37 | if target_elem is not None: 38 | text = target_elem.text 39 | return text 40 | 41 | 42 | if __name__ == '__main__': 43 | content = """ 44 | We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models, BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be fine-tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task-specific architecture modifications. 45 | BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE score to 80.5% (7.7% point absolute improvement), MultiNLI accuracy to 86.7% (4.6% absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement). 
46 | """ 47 | res = DeepL().translate('en', 'zh', content) 48 | print(res) 49 | -------------------------------------------------------------------------------- /data_augmentation/feature.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | import numpy as np 4 | from utils import get_device, one_hot 5 | 6 | 7 | class MixUp: 8 | """ 9 | 1. normal training pass, then backprop 10 | 2. mixup pass, then backprop 11 | 3. accumulate the gradients and update the weights 12 | """ 13 | 14 | def __init__(self, model, tokenizer, num_labels, layer='embedding'): 15 | self.tokenizer = tokenizer 16 | self.model = model 17 | self.device = get_device() 18 | self.num_labels = num_labels 19 | self.layer = layer 20 | 21 | def cross_entropy(self, logits, label): 22 | exp_logits = torch.exp(logits) 23 | log_prob = logits - torch.log(torch.sum(exp_logits, dim=1, keepdim=True)) 24 | return -torch.mean(torch.sum(log_prob * label, dim=1)) 25 | 26 | def augmentation(self, data): 27 | # data is one batch from the DataLoader 28 | input_ids = data['input_ids'].to(self.device) 29 | attention_mask = data['attention_mask'].to(self.device) 30 | label = data['labels'].to(self.device).long() 31 | 32 | # shuffle the samples within the batch (e.g. batch_size = 4) 33 | batch_size = len(input_ids) 34 | # e.g. [1, 3, 2, 0], a random permutation of the batch indices 35 | index = torch.randperm(batch_size).to(self.device) 36 | lam = np.random.beta(0.5, 0.5) 37 | 38 | label_mix = one_hot(label, self.num_labels) * lam + one_hot(label[index], self.num_labels) * (1 - lam) 39 | 40 | def my_hook(module, inputs, outputs): 41 | x_mix = outputs * lam + outputs[index] * (1 - lam) 42 | return x_mix 43 | 44 | # PyTorch forward hook that mixes the embedding outputs 45 | hook = self.model.bert.embeddings.register_forward_hook(my_hook) 46 | 47 | outputs = self.model(input_ids, attention_mask) 48 | logits = outputs.logits 49 | hook.remove() 50 | 51 | loss = self.cross_entropy(logits, label_mix) 52 | return loss 53 | -------------------------------------------------------------------------------- /data_augmentation/feature_augmentation.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | import numpy as np 4 | from utils import get_device, one_hot 5 | 6 | 7 | class MixUp: 8 | 9 | def __init__(self, model, tokenizer, num_labels, layer='embedding'): 10 | self.tokenizer = tokenizer 11 | self.model = model 12 | self.device = get_device() 13 | self.num_labels = num_labels 14 | self.layer = layer 15 | 16 | def cross_entropy(self, logits, labels): 17 | exp_logits = torch.exp(logits) 18 | log_prob = logits - torch.log(torch.sum(exp_logits, dim=1, keepdim=True)) 19 | return -torch.mean(torch.sum(log_prob * labels, dim=1)) 20 | 21 | def augmentation(self, data): 22 | input_ids = data['input_ids'].to(self.device) 23 | attention_mask = data['attention_mask'].to(self.device) 24 | label = data['labels'].to(self.device).long() 25 | 26 | batch_size = len(input_ids) 27 | # shuffled indices used to build the mixed embedding 28 | index = torch.randperm(batch_size).to(self.device) 29 | lam = np.random.beta(0.5, 0.5) 30 | 31 | label_mix = one_hot(label, self.num_labels) * lam + one_hot(label[index], self.num_labels) * (1 - lam) 32 | hook = None 33 | 34 | def single_forward_hook(module, inputs, outputs): 35 | mix_input = outputs * lam + outputs[index] * (1 - lam) 36 | return mix_input 37 | 38 | def multi_forward_hook(module, inputs, outputs): 39 | mix_input = outputs[0] * lam + outputs[0][index] * (1 - lam) 40 | return tuple([mix_input]) 41 | 42 | if self.layer == 'embedding': 43 | hook = self.model.bert.embeddings.register_forward_hook(single_forward_hook) 44 | elif self.layer == 'pooler': 45 | 
hook = self.model.bert.pooler.register_forward_hook(single_forward_hook) 46 | elif self.layer == 'inner': 47 | # 随机选一层 48 | layer_num = random.randint(1, self.model.config.num_hidden_layers) - 1 49 | hook = self.model.bert.encoder.layer[layer_num].register_forward_hook(multi_forward_hook) 50 | 51 | outputs = self.model(input_ids=input_ids, 52 | attention_mask=attention_mask, 53 | labels=label.to(self.device)) 54 | logits = outputs.logits 55 | hook.remove() 56 | 57 | # 计算loss 58 | loss = self.cross_entropy(logits, label_mix) 59 | return loss 60 | -------------------------------------------------------------------------------- /distillation/distillation_student.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from sklearn.metrics import accuracy_score 3 | import pandas as pd 4 | from torch.utils.data import DataLoader, Dataset 5 | from tqdm import tqdm 6 | from utils import fix_seed 7 | import torch 8 | from transformers import BertForSequenceClassification, BertTokenizer 9 | from torch import softmax 10 | 11 | path = 'E:\\ptm\\roberta' 12 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 13 | tokenizer = BertTokenizer.from_pretrained(path) 14 | teacher = BertForSequenceClassification.from_pretrained(path, num_labels=4) 15 | # 在线蒸馏,不加载老师的权重 16 | teacher.load_state_dict(torch.load('teacher.bin', map_location=device)) 17 | teacher = teacher.to(device) 18 | teacher.eval() 19 | 20 | 21 | class TextCLS(torch.nn.Module): 22 | # 准备我们需要用到的参数和layer 23 | def __init__(self, 24 | embedding_size, 25 | vocab_size=21128): 26 | super().__init__() 27 | self.embedding = nn.Embedding(vocab_size, embedding_size) 28 | # [batch_size, seq_len, hidden_size] 29 | self.lstm = nn.LSTM(input_size=embedding_size, 30 | hidden_size=256, 31 | num_layers=2, 32 | batch_first=True) 33 | self.dense1 = nn.Linear(256, 100) 34 | self.dense2 = nn.Linear(100, 4) 35 | 36 | # 前向传播,那我们准备好的layer拼接在一起 37 | def forward(self, x): 38 | embedding = self.embedding(x) 39 | # [batch_size, seq_len, hidden_size] 40 | out, _ = self.lstm(embedding) 41 | # 计算mask的和 index = sum(mask)-1 42 | # out[:, index, :] 43 | out = self.dense1(out[:, -1, :]) 44 | out = self.dense2(out) 45 | return out 46 | 47 | 48 | class BaseDataset(Dataset): 49 | def __init__(self, encodings, labels=None): 50 | self.encodings = encodings 51 | self.labels = labels 52 | 53 | def __getitem__(self, idx): 54 | item = {key: val[idx].clone().detach() for key, val in self.encodings.items()} 55 | if self.labels is not None: 56 | item['labels'] = torch.tensor(self.labels[idx]) 57 | return item 58 | 59 | def __len__(self): 60 | return len(self.encodings['input_ids']) 61 | 62 | 63 | def load_data(batch_size=32): 64 | train_df = pd.read_csv('../data/tnews_public/train.csv') 65 | train_text = train_df['text'].tolist() 66 | train_label = train_df['label'].tolist() 67 | train_text = tokenizer(text=train_text, 68 | return_tensors='pt', 69 | truncation=True, 70 | padding=True, 71 | max_length=20) 72 | train_loader = DataLoader(BaseDataset(train_text, train_label), 73 | batch_size, 74 | pin_memory=True if torch.cuda.is_available() else False, 75 | shuffle=False) 76 | 77 | dev_df = pd.read_csv('../data/tnews_public/dev.csv') 78 | dev_text = dev_df['text'].tolist() 79 | dev_label = dev_df['label'].tolist() 80 | dev_text = tokenizer(text=dev_text, 81 | return_tensors='pt', 82 | truncation=True, 83 | padding=True, 84 | max_length=20) 85 | dev_loader = DataLoader(BaseDataset(dev_text, dev_label), 86 
| batch_size, 87 | pin_memory=True if torch.cuda.is_available() else False, 88 | shuffle=False) 89 | 90 | return train_loader, dev_loader 91 | 92 | 93 | def CE(pred, label, t=1): 94 | pred = softmax(pred / t, dim=-1) 95 | label = softmax(label / t, dim=-1) 96 | loss = -torch.sum(torch.log(pred) * label) 97 | return loss 98 | 99 | 100 | # 训练模型 101 | def train(): 102 | fix_seed() 103 | 104 | train_data_loader, dev_data_loader = load_data(64) 105 | student = TextCLS(embedding_size=100) 106 | student = student.to(device) 107 | # 优化器要保留老师和学生模型的参数 108 | optimizer = torch.optim.Adam(student.parameters(), lr=0.01) 109 | loss_func = nn.CrossEntropyLoss() 110 | 111 | best_acc = 0 112 | for epoch in range(10): 113 | print('epoch:', epoch + 1) 114 | pbar = tqdm(train_data_loader) 115 | for data in pbar: 116 | optimizer.zero_grad() 117 | 118 | input_ids = data['input_ids'].to(device) 119 | attention_mask = data['attention_mask'].to(device) 120 | labels = data['labels'].to(device).long() 121 | 122 | # 离线蒸馏 123 | # hard target 124 | # 学生模型学习真实的y标 125 | output = student(input_ids) 126 | loss1 = loss_func(output, labels) 127 | 128 | # soft target 129 | # 学生模型学习老师模型的输出结果,提升学生模型的泛化能力 130 | with torch.no_grad(): 131 | outputs = teacher(input_ids, attention_mask=attention_mask, labels=labels) 132 | # outputs = teacher(input_ids, attention_mask=attention_mask, labels=labels) 133 | teacher_out = outputs.logits 134 | loss2 = CE(output, teacher_out, t=2) 135 | 136 | # loss3 = loss2(teacher_out,labels) 137 | 138 | loss = loss1 + 0.25 * loss2 139 | loss.backward() 140 | 141 | optimizer.step() 142 | 143 | pbar.update() 144 | pbar.set_description(f'loss:{loss.item():.4f}') 145 | 146 | pred = [] 147 | label = [] 148 | for data in tqdm(dev_data_loader): 149 | input_ids = data['input_ids'].to(device) 150 | labels = data['labels'].to(device).long() 151 | with torch.no_grad(): 152 | output = student(input_ids) 153 | pred.extend(torch.argmax(output, dim=1).cpu().numpy()) 154 | label.extend(labels.cpu().numpy()) 155 | acc = accuracy_score(pred, label) 156 | print('dev acc:', acc) 157 | print() 158 | if acc > best_acc: 159 | torch.save(student.state_dict(), 'student.bin') 160 | best_acc = acc 161 | 162 | 163 | if __name__ == '__main__': 164 | train() 165 | -------------------------------------------------------------------------------- /distillation/train_student.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from sklearn.metrics import accuracy_score 3 | import pandas as pd 4 | from torch.utils.data import DataLoader, Dataset 5 | from tqdm import tqdm 6 | from utils import fix_seed 7 | import torch 8 | from transformers import BertTokenizer 9 | from torch import softmax 10 | 11 | path = 'E:\\ptm\\roberta' 12 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 13 | tokenizer = BertTokenizer.from_pretrained(path) 14 | 15 | 16 | class TextCLS(torch.nn.Module): 17 | # 准备我们需要用到的参数和layer 18 | def __init__(self, 19 | embedding_size, 20 | vocab_size=21128): 21 | super().__init__() 22 | self.embedding = nn.Embedding(vocab_size, embedding_size) 23 | # [batch_size, seq_len, hidden_size] 24 | self.lstm = nn.LSTM(input_size=embedding_size, 25 | hidden_size=256, 26 | num_layers=2, 27 | batch_first=True) 28 | self.dense1 = nn.Linear(256, 100) 29 | self.dense2 = nn.Linear(100, 4) 30 | 31 | # 前向传播,那我们准备好的layer拼接在一起 32 | def forward(self, x): 33 | embedding = self.embedding(x) 34 | # [batch_size, seq_len, hidden_size] 35 | out, _ = 
self.lstm(embedding) 36 | out = self.dense1(out[:, -1, :]) 37 | out = self.dense2(out) 38 | return out 39 | 40 | 41 | class BaseDataset(Dataset): 42 | def __init__(self, encodings, labels=None): 43 | self.encodings = encodings 44 | self.labels = labels 45 | 46 | def __getitem__(self, idx): 47 | item = {key: val[idx].clone().detach() for key, val in self.encodings.items()} 48 | if self.labels is not None: 49 | item['labels'] = torch.tensor(self.labels[idx]) 50 | return item 51 | 52 | def __len__(self): 53 | return len(self.encodings['input_ids']) 54 | 55 | 56 | def load_data(batch_size=32): 57 | train_df = pd.read_csv('../data/tnews_public/train.csv') 58 | train_text = train_df['text'].tolist() 59 | train_label = train_df['label'].tolist() 60 | train_text = tokenizer(text=train_text, 61 | return_tensors='pt', 62 | truncation=True, 63 | padding=True, 64 | max_length=32) 65 | train_loader = DataLoader(BaseDataset(train_text, train_label), 66 | batch_size, 67 | pin_memory=True if torch.cuda.is_available() else False, 68 | shuffle=False) 69 | 70 | dev_df = pd.read_csv('../data/tnews_public/dev.csv') 71 | dev_text = dev_df['text'].tolist() 72 | dev_label = dev_df['label'].tolist() 73 | dev_text = tokenizer(text=dev_text, 74 | return_tensors='pt', 75 | truncation=True, 76 | padding=True, 77 | max_length=32) 78 | dev_loader = DataLoader(BaseDataset(dev_text, dev_label), 79 | batch_size, 80 | pin_memory=True if torch.cuda.is_available() else False, 81 | shuffle=False) 82 | 83 | return train_loader, dev_loader 84 | 85 | 86 | # 训练模型 87 | def train(): 88 | fix_seed() 89 | 90 | train_data_loader, dev_data_loader = load_data(32) 91 | student = TextCLS(embedding_size=100) 92 | student = student.to(device) 93 | optimizer = torch.optim.Adam(student.parameters(), lr=0.01) 94 | loss_func = nn.CrossEntropyLoss() 95 | 96 | best_acc = 0 97 | for epoch in range(20): 98 | print('epoch:', epoch + 1) 99 | pred = [] 100 | label = [] 101 | pbar = tqdm(train_data_loader) 102 | for data in pbar: 103 | optimizer.zero_grad() 104 | 105 | input_ids = data['input_ids'].to(device) 106 | labels = data['labels'].to(device) 107 | 108 | output = student(input_ids) 109 | loss = loss_func(output, labels) 110 | 111 | pred.extend(torch.argmax(output, dim=1).cpu().numpy()) 112 | label.extend(labels) 113 | loss.backward() 114 | 115 | optimizer.step() 116 | 117 | pbar.update() 118 | pbar.set_description(f'loss:{loss.item():.4f}') 119 | 120 | pred = [] 121 | label = [] 122 | for data in tqdm(dev_data_loader): 123 | input_ids = data['input_ids'].to(device) 124 | labels = data['labels'].to(device).long() 125 | with torch.no_grad(): 126 | output = student(input_ids) 127 | pred.extend(torch.argmax(output, dim=1).cpu().numpy()) 128 | label.extend(labels.cpu().numpy()) 129 | acc = accuracy_score(pred, label) 130 | print('dev acc:', acc) 131 | print() 132 | if acc > best_acc: 133 | torch.save(student.state_dict(), 'model.bin') 134 | best_acc = acc 135 | 136 | 137 | if __name__ == '__main__': 138 | train() 139 | -------------------------------------------------------------------------------- /distillation/train_teacher.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import accuracy_score 2 | from transformers import BertForSequenceClassification, BertTokenizer 3 | from torch.utils.data import DataLoader, Dataset 4 | from tqdm import tqdm 5 | from utils import fix_seed 6 | import torch 7 | import pandas as pd 8 | 9 | path = 'E:\\ptm\\roberta' 10 | tokenizer = 
BertTokenizer.from_pretrained(path) 11 | 12 | 13 | class BaseDataset(Dataset): 14 | def __init__(self, encodings, labels=None): 15 | self.encodings = encodings 16 | self.labels = labels 17 | 18 | def __getitem__(self, idx): 19 | item = {key: val[idx].clone().detach() for key, val in self.encodings.items()} 20 | if self.labels is not None: 21 | item['labels'] = torch.tensor(self.labels[idx]) 22 | return item 23 | 24 | def __len__(self): 25 | return len(self.encodings['input_ids']) 26 | 27 | 28 | def load_data(batch_size=32): 29 | train_df = pd.read_csv('../data/tnews_public/train.csv') 30 | train_text = train_df['text'].tolist() 31 | train_label = train_df['label'].tolist() 32 | train_text = tokenizer(text=train_text, 33 | return_tensors='pt', 34 | truncation=True, 35 | padding=True, 36 | max_length=32) 37 | train_loader = DataLoader(BaseDataset(train_text, train_label), 38 | batch_size, 39 | pin_memory=True if torch.cuda.is_available() else False, 40 | shuffle=False) 41 | 42 | dev_df = pd.read_csv('../data/tnews_public/dev.csv') 43 | dev_text = dev_df['text'].tolist() 44 | dev_label = dev_df['label'].tolist() 45 | dev_text = tokenizer(text=dev_text, 46 | return_tensors='pt', 47 | truncation=True, 48 | padding=True, 49 | max_length=32) 50 | 51 | dev_loader = DataLoader(BaseDataset(dev_text, dev_label), 52 | batch_size, 53 | pin_memory=True if torch.cuda.is_available() else False, 54 | shuffle=False) 55 | 56 | return train_loader, dev_loader 57 | 58 | 59 | # 训练模型 60 | def train(): 61 | fix_seed() 62 | 63 | train_data_loader, dev_data_loader = load_data(32) 64 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 65 | 66 | model = BertForSequenceClassification.from_pretrained(path, num_labels=4) 67 | model = model.to(device) 68 | optimizer = torch.optim.Adam(model.parameters(), lr=5e-5) 69 | 70 | best_acc = 0 71 | for epoch in range(5): 72 | print('epoch:', epoch + 1) 73 | pred = [] 74 | label = [] 75 | pbar = tqdm(train_data_loader) 76 | for data in pbar: 77 | optimizer.zero_grad() 78 | 79 | input_ids = data['input_ids'].to(device) 80 | attention_mask = data['attention_mask'].to(device) 81 | labels = data['labels'].to(device).long() 82 | 83 | outputs = model(input_ids, attention_mask=attention_mask, labels=labels) 84 | output = outputs.logits.argmax(1).cpu().numpy() 85 | pred.extend(output) 86 | label.extend(labels.cpu().numpy()) 87 | loss = outputs.loss 88 | loss.backward() 89 | 90 | optimizer.step() 91 | 92 | pbar.update() 93 | pbar.set_description(f'loss:{loss.item():.4f}') 94 | 95 | acc = accuracy_score(pred, label) 96 | print('train acc:', acc) 97 | 98 | pred = [] 99 | label = [] 100 | for data in tqdm(dev_data_loader): 101 | input_ids = data['input_ids'].to(device) 102 | attention_mask = data['attention_mask'].to(device) 103 | labels = data['labels'].to(device).long() 104 | with torch.no_grad(): 105 | outputs = model(input_ids, attention_mask=attention_mask, labels=labels) 106 | output = outputs.logits.argmax(1).cpu().numpy() 107 | pred.extend(output) 108 | label.extend(labels.cpu().numpy()) 109 | acc = accuracy_score(pred, label) 110 | print('dev acc:', acc) 111 | print() 112 | if acc > best_acc: 113 | torch.save(model.state_dict(), 'teacher.bin') 114 | best_acc = acc 115 | 116 | 117 | if __name__ == '__main__': 118 | train() 119 | -------------------------------------------------------------------------------- /elmoformanylangs/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 
from .elmo import Embedder 3 | 4 | 5 | import logging 6 | logger = logging.getLogger('elmoformanylangs') 7 | 8 | # if the client application hasn't set the log level, we set it 9 | # ourselves to INFO 10 | if logger.level == 0: 11 | logger.setLevel(logging.INFO) 12 | 13 | log_handler = logging.StreamHandler() 14 | log_formatter = logging.Formatter(fmt="%(asctime)-15s %(levelname)s: %(message)s") 15 | log_handler.setFormatter(log_formatter) 16 | 17 | # also, if the client hasn't added any handlers for this logger 18 | # (or a default handler), we add a handler of our own 19 | # 20 | # client can later do 21 | # logger.removeHandler(stanza.log_handler) 22 | if not logger.hasHandlers(): 23 | logger.addHandler(log_handler) 24 | -------------------------------------------------------------------------------- /elmoformanylangs/__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | import os 5 | import sys 6 | import codecs 7 | import argparse 8 | import logging 9 | import json 10 | import torch 11 | from .modules.embedding_layer import EmbeddingLayer 12 | from .utils import dict2namedtuple 13 | from .frontend import Model 14 | from .frontend import create_batches 15 | import numpy as np 16 | import h5py 17 | 18 | logger = logging.getLogger('elmoformanylangs') 19 | 20 | 21 | def read_corpus(path, max_chars=None): 22 | """ 23 | read raw text file. The format of the input is like, one sentence per line 24 | words are separated by '\t' 25 | 26 | :param path: 27 | :param max_chars: int, the number of maximum characters in a word, this 28 | parameter is used when the ptm is configured with CNN word encoder. 29 | :return: 30 | """ 31 | dataset = [] 32 | textset = [] 33 | with codecs.open(path, 'r', encoding='utf-8') as fin: 34 | for line in fin.read().strip().split('\n'): 35 | data = [''] 36 | text = [] 37 | for token in line.split('\t'): 38 | text.append(token) 39 | if max_chars is not None and len(token) + 2 > max_chars: 40 | token = token[:max_chars - 2] 41 | data.append(token) 42 | data.append('') 43 | dataset.append(data) 44 | textset.append(text) 45 | return dataset, textset 46 | 47 | 48 | def read_conll_corpus(path, max_chars=None): 49 | """ 50 | read text in CoNLL-U format. 51 | 52 | :param path: 53 | :param max_chars: 54 | :return: 55 | """ 56 | dataset = [] 57 | textset = [] 58 | with codecs.open(path, 'r', encoding='utf-8') as fin: 59 | for payload in fin.read().strip().split('\n\n'): 60 | data = [''] 61 | text = [] 62 | lines = payload.splitlines() 63 | body = [line for line in lines if not line.startswith('#')] 64 | for line in body: 65 | fields = line.split('\t') 66 | num, token = fields[0], fields[1] 67 | if '-' in num or '.' 
in num: 68 | continue 69 | text.append(token) 70 | if max_chars is not None and len(token) + 2 > max_chars: 71 | token = token[:max_chars - 2] 72 | data.append(token) 73 | data.append('') 74 | dataset.append(data) 75 | textset.append(text) 76 | return dataset, textset 77 | 78 | 79 | def read_conll_char_corpus(path, max_chars=None): 80 | """ 81 | 82 | :param path: 83 | :param max_chars: 84 | :return: 85 | """ 86 | dataset = [] 87 | textset = [] 88 | with codecs.open(path, 'r', encoding='utf-8') as fin: 89 | for payload in fin.read().strip().split('\n\n'): 90 | data = [''] 91 | text = [] 92 | lines = payload.splitlines() 93 | body = [line for line in lines if not line.startswith('#')] 94 | for line in body: 95 | fields = line.split('\t') 96 | num, token = fields[0], fields[1] 97 | if '-' in num or '.' in num: 98 | continue 99 | for ch in token: 100 | text.append(ch) 101 | if max_chars is not None and len(ch) + 2 > max_chars: 102 | ch = ch[:max_chars - 2] 103 | data.append(ch) 104 | data.append('') 105 | dataset.append(data) 106 | textset.append(text) 107 | return dataset, textset 108 | 109 | 110 | def read_conll_char_vi_corpus(path, max_chars=None): 111 | """ 112 | 113 | :param path: 114 | :param max_chars: 115 | :return: 116 | """ 117 | dataset = [] 118 | textset = [] 119 | with codecs.open(path, 'r', encoding='utf-8') as fin: 120 | for payload in fin.read().strip().split('\n\n'): 121 | data = [''] 122 | text = [] 123 | lines = payload.splitlines() 124 | body = [line for line in lines if not line.startswith('#')] 125 | for line in body: 126 | fields = line.split('\t') 127 | num, token = fields[0], fields[1] 128 | if '-' in num or '.' in num: 129 | continue 130 | for ch in token.split(): 131 | text.append(ch) 132 | if max_chars is not None and len(ch) + 2 > max_chars: 133 | ch = ch[:max_chars - 2] 134 | data.append(ch) 135 | data.append('') 136 | dataset.append(data) 137 | textset.append(text) 138 | return dataset, textset 139 | 140 | 141 | def test_main(): 142 | # Configurations 143 | cmd = argparse.ArgumentParser('The testing components of') 144 | cmd.add_argument('--gpu', default=-1, type=int, help='use id of gpu, -1 if cpu.') 145 | cmd.add_argument('--input_format', default='plain', choices=('plain', 'conll', 'conll_char', 'conll_char_vi'), 146 | help='the input format.') 147 | cmd.add_argument("--input", help="the path to the raw text file.") 148 | cmd.add_argument("--output_format", default='hdf5', help='the output format. Supported format includes (hdf5, txt).' 149 | ' Use comma to separate the format identifiers,' 150 | ' like \'--output_format=hdf5,plain\'') 151 | cmd.add_argument("--output_prefix", help='the prefix of the output file. The output file is in the format of ' 152 | '..') 153 | cmd.add_argument("--output_layer", help='the target layer to output. 
0 for the word encoder, 1 for the first LSTM ' 154 | 'hidden layer, 2 for the second LSTM hidden layer, -1 for an average' 155 | 'of 3 layers.') 156 | cmd.add_argument("--ptm", required=True, help="the path to the ptm.") 157 | cmd.add_argument("--batch_size", "--batch", type=int, default=1, help='the batch size.') 158 | args = cmd.parse_args(sys.argv[2:]) 159 | 160 | if args.gpu >= 0: 161 | torch.cuda.set_device(args.gpu) 162 | use_cuda = args.gpu >= 0 and torch.cuda.is_available() 163 | # load the ptm configurations 164 | args2 = dict2namedtuple(json.load(codecs.open(os.path.join(args.model, 'config.json'), 'r', encoding='utf-8'))) 165 | 166 | with open(os.path.join(args.model, args2.config_path), 'r') as fin: 167 | config = json.load(fin) 168 | 169 | # For the ptm trained with character-based word encoder. 170 | if config['token_embedder']['char_dim'] > 0: 171 | char_lexicon = {} 172 | with codecs.open(os.path.join(args.model, 'char.dic'), 'r', encoding='utf-8') as fpi: 173 | for line in fpi: 174 | tokens = line.strip().split('\t') 175 | if len(tokens) == 1: 176 | tokens.insert(0, '\u3000') 177 | token, i = tokens 178 | char_lexicon[token] = int(i) 179 | char_emb_layer = EmbeddingLayer(config['token_embedder']['char_dim'], char_lexicon, fix_emb=False, embs=None) 180 | logger.info('char embedding size: ' + str(len(char_emb_layer.word2id))) 181 | else: 182 | char_lexicon = None 183 | char_emb_layer = None 184 | 185 | # For the ptm trained with word form word encoder. 186 | if config['token_embedder']['word_dim'] > 0: 187 | word_lexicon = {} 188 | with codecs.open(os.path.join(args.model, 'word.dic'), 'r', encoding='utf-8') as fpi: 189 | for line in fpi: 190 | tokens = line.strip().split('\t') 191 | if len(tokens) == 1: 192 | tokens.insert(0, '\u3000') 193 | token, i = tokens 194 | word_lexicon[token] = int(i) 195 | word_emb_layer = EmbeddingLayer(config['token_embedder']['word_dim'], word_lexicon, fix_emb=False, embs=None) 196 | logger.info('word embedding size: ' + str(len(word_emb_layer.word2id))) 197 | else: 198 | word_lexicon = None 199 | word_emb_layer = None 200 | 201 | # instantiate the ptm 202 | model = Model(config, word_emb_layer, char_emb_layer, use_cuda) 203 | 204 | if use_cuda: 205 | model.cuda() 206 | 207 | logger.info(str(model)) 208 | model.load_model(args.model) 209 | 210 | # read test data according to input format 211 | read_function = read_corpus if args.input_format == 'plain' else ( 212 | read_conll_corpus if args.input_format == 'conll' else ( 213 | read_conll_char_corpus if args.input_format == 'conll_char' else read_conll_char_vi_corpus)) 214 | 215 | if config['token_embedder']['name'].lower() == 'cnn': 216 | test, text = read_function(args.input, config['token_embedder']['max_characters_per_token']) 217 | else: 218 | test, text = read_function(args.input) 219 | 220 | # create test batches from the input data. 221 | test_w, test_c, test_lens, test_masks, test_text = create_batches( 222 | test, args.batch_size, word_lexicon, char_lexicon, config, text=text) 223 | 224 | # configure the ptm to evaluation mode. 
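    # Note: eval() only switches dropout and similar layers to inference behaviour;
    # it does not disable autograd. The embedding loop below never calls backward(),
    # so results are unaffected, but wrapping `model.forward(w, c, masks)` in a
    # `with torch.no_grad():` block would additionally skip building the computation
    # graph and reduce memory use during feature extraction.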
225 | model.eval() 226 | 227 | sent_set = set() 228 | cnt = 0 229 | 230 | output_formats = args.output_format.split(',') 231 | output_layers = map(int, args.output_layer.split(',')) 232 | 233 | handlers = {} 234 | for output_format in output_formats: 235 | if output_format not in ('hdf5', 'txt'): 236 | print('Unknown output_format: {0}'.format(output_format)) 237 | continue 238 | for output_layer in output_layers: 239 | filename = '{0}.ly{1}.{2}'.format(args.output_prefix, output_layer, output_format) 240 | handlers[output_format, output_layer] = \ 241 | h5py.File(filename, 'w') if output_format == 'hdf5' else open(filename, 'w') 242 | 243 | for w, c, lens, masks, texts in zip(test_w, test_c, test_lens, test_masks, test_text): 244 | output = model.forward(w, c, masks) 245 | for i, text in enumerate(texts): 246 | sent = '\t'.join(text) 247 | sent = sent.replace('.', '$period$') 248 | sent = sent.replace('/', '$backslash$') 249 | if sent in sent_set: 250 | continue 251 | sent_set.add(sent) 252 | if config['encoder']['name'].lower() == 'lstm': 253 | data = output[i, 1:lens[i]-1, :].data 254 | if use_cuda: 255 | data = data.cpu() 256 | data = data.numpy() 257 | elif config['encoder']['name'].lower() == 'elmo': 258 | data = output[:, i, 1:lens[i]-1, :].data 259 | if use_cuda: 260 | data = data.cpu() 261 | data = data.numpy() 262 | 263 | for (output_format, output_layer) in handlers: 264 | fout = handlers[output_format, output_layer] 265 | if output_layer == -1: 266 | payload = np.average(data, axis=0) 267 | else: 268 | payload = data[output_layer] 269 | if output_format == 'hdf5': 270 | fout.create_dataset(sent, payload.shape, dtype='float32', data=payload) 271 | else: 272 | for word, row in zip(text, payload): 273 | print('{0}\t{1}'.format(word, '\t'.join(['{0:.8f}'.format(elem) for elem in row])), file=fout) 274 | print('', file=fout) 275 | 276 | cnt += 1 277 | if cnt % 1000 == 0: 278 | logger.info('Finished {0} sentences.'.format(cnt)) 279 | for _, handler in handlers.items(): 280 | handler.close() 281 | 282 | 283 | if __name__ == "__main__": 284 | if len(sys.argv) > 1 and sys.argv[1] == 'test': 285 | test_main() 286 | else: 287 | print('Usage: {0} [test] [options]'.format(sys.argv[0]), file=sys.stderr) 288 | -------------------------------------------------------------------------------- /elmoformanylangs/configs/cnn_0_100_512_4096_sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "encoder": { 3 | "name": "elmo", 4 | "projection_dim": 512, 5 | "cell_clip": 3, 6 | "proj_clip": 3, 7 | "dim": 4096, 8 | "n_layers": 2 9 | }, 10 | 11 | "token_embedder": { 12 | "name": "cnn", 13 | "activation": "relu", 14 | "filters": [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]], 15 | "n_highway": 2, 16 | "word_dim": 100, 17 | "char_dim": 50, 18 | "max_characters_per_token": 50 19 | }, 20 | 21 | "classifier": { 22 | "name": "sampled_softmax", 23 | "n_samples": 8192 24 | }, 25 | "dropout": 0.1 26 | } 27 | -------------------------------------------------------------------------------- /elmoformanylangs/configs/cnn_50_100_512_4096_sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "encoder": { 3 | "name": "elmo", 4 | "projection_dim": 512, 5 | "cell_clip": 3, 6 | "proj_clip": 3, 7 | "dim": 4096, 8 | "n_layers": 2 9 | }, 10 | 11 | "token_embedder": { 12 | "name": "cnn", 13 | "activation": "relu", 14 | "filters": [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]], 15 | 
"n_highway": 2, 16 | "word_dim": 100, 17 | "char_dim": 50, 18 | "max_characters_per_token": 50 19 | }, 20 | 21 | "classifier": { 22 | "name": "sampled_softmax", 23 | "n_samples": 8192 24 | }, 25 | "dropout": 0.1 26 | } 27 | -------------------------------------------------------------------------------- /elmoformanylangs/dataloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import unicode_literals 3 | import codecs 4 | import numpy as np 5 | 6 | 7 | def pad(sequences, pad_token='', pad_left=False): 8 | """ 9 | input sequences is a list of text sequence [[str]] 10 | pad each text sequence to the length of the longest 11 | 12 | :param sequences: 13 | :param pad_token: 14 | :param pad_left: 15 | :return: 16 | """ 17 | # max_len = max(5,max(len(seq) for seq in sequences)) 18 | max_len = max(len(seq) for seq in sequences) 19 | if pad_left: 20 | return [[pad_token]*(max_len-len(seq)) + seq for seq in sequences] 21 | return [seq + [pad_token]*(max_len-len(seq)) for seq in sequences] 22 | 23 | 24 | def load_embedding_npz(path): 25 | data = np.load(path) 26 | return [str(w) for w in data['words']], data['vals'] 27 | 28 | 29 | def load_embedding_txt(path): 30 | words = [] 31 | vals = [] 32 | with codecs.open(path, 'r', encoding='utf-8') as fin: 33 | fin.readline() 34 | for line in fin: 35 | line = line.strip() 36 | if line: 37 | parts = line.split() 38 | words.append(parts[0]) 39 | vals += [float(x) for x in parts[1:]] # equal to append 40 | return words, np.asarray(vals).reshape(len(words), -1) # reshape 41 | 42 | 43 | def load_embedding(path): 44 | if path.endswith(".npz"): 45 | return load_embedding_npz(path) 46 | else: 47 | return load_embedding_txt(path) 48 | -------------------------------------------------------------------------------- /elmoformanylangs/elmo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | import os 5 | import codecs 6 | import random 7 | import logging 8 | import json 9 | import torch 10 | from .modules.embedding_layer import EmbeddingLayer 11 | from .utils import dict2namedtuple 12 | from .frontend import create_one_batch 13 | from .frontend import Model 14 | import numpy as np 15 | 16 | logger = logging.getLogger('elmoformanylangs') 17 | 18 | 19 | def read_list(sents, max_chars=None): 20 | """ 21 | read raw text file. The format of the input is like, one sentence per line 22 | words are separated by '\t' 23 | 24 | :param path: 25 | :param max_chars: int, the number of maximum characters in a word, this 26 | parameter is used when the ptm is configured with CNN word encoder. 
27 | :return: 28 | """ 29 | dataset = [] 30 | textset = [] 31 | for sent in sents: 32 | data = [''] 33 | text = [] 34 | for token in sent: 35 | text.append(token) 36 | if max_chars is not None and len(token) + 2 > max_chars: 37 | token = token[:max_chars - 2] 38 | data.append(token) 39 | data.append('') 40 | dataset.append(data) 41 | textset.append(text) 42 | return dataset, textset 43 | 44 | 45 | def recover(li, ind): 46 | # li[piv], ind = torch.sort(li[piv], dim=0, descending=(not unsort)) 47 | dummy = list(range(len(ind))) 48 | dummy.sort(key=lambda l: ind[l]) 49 | li = [li[i] for i in dummy] 50 | return li 51 | 52 | 53 | # shuffle training examples and create mini-batches 54 | def create_batches(x, batch_size, word2id, char2id, config, perm=None, shuffle=False, sort=True, text=None): 55 | ind = list(range(len(x))) 56 | lst = perm or list(range(len(x))) 57 | if shuffle: 58 | random.shuffle(lst) 59 | 60 | if sort: 61 | lst.sort(key=lambda l: -len(x[l])) 62 | 63 | x = [x[i] for i in lst] 64 | ind = [ind[i] for i in lst] 65 | if text is not None: 66 | text = [text[i] for i in lst] 67 | 68 | sum_len = 0.0 69 | batches_w, batches_c, batches_lens, batches_masks, batches_text, batches_ind = [], [], [], [], [], [] 70 | size = batch_size 71 | nbatch = (len(x) - 1) // size + 1 72 | for i in range(nbatch): 73 | start_id, end_id = i * size, (i + 1) * size 74 | bw, bc, blens, bmasks = create_one_batch(x[start_id: end_id], word2id, char2id, config, sort=sort) 75 | sum_len += sum(blens) 76 | batches_w.append(bw) 77 | batches_c.append(bc) 78 | batches_lens.append(blens) 79 | batches_masks.append(bmasks) 80 | batches_ind.append(ind[start_id: end_id]) 81 | if text is not None: 82 | batches_text.append(text[start_id: end_id]) 83 | 84 | if sort: 85 | perm = list(range(nbatch)) 86 | random.shuffle(perm) 87 | batches_w = [batches_w[i] for i in perm] 88 | batches_c = [batches_c[i] for i in perm] 89 | batches_lens = [batches_lens[i] for i in perm] 90 | batches_masks = [batches_masks[i] for i in perm] 91 | batches_ind = [batches_ind[i] for i in perm] 92 | if text is not None: 93 | batches_text = [batches_text[i] for i in perm] 94 | 95 | logger.info("{} batches, avg len: {:.1f}".format( 96 | nbatch, sum_len / len(x))) 97 | recover_ind = [item for sublist in batches_ind for item in sublist] 98 | if text is not None: 99 | return batches_w, batches_c, batches_lens, batches_masks, batches_text, recover_ind 100 | return batches_w, batches_c, batches_lens, batches_masks, recover_ind 101 | 102 | 103 | class Embedder(object): 104 | def __init__(self, model_dir, batch_size=64): 105 | self.model_dir = model_dir 106 | self.model, self.config = self.get_model() 107 | self.batch_size = batch_size 108 | 109 | def get_model(self): 110 | # torch.cuda.set_device(1) 111 | self.use_cuda = torch.cuda.is_available() 112 | # load the ptm configurations 113 | args2 = dict2namedtuple(json.load(codecs.open( 114 | os.path.join(self.model_dir, 'config.json'), 'r', encoding='utf-8'))) 115 | 116 | config_path = os.path.join(self.model_dir, args2.config_path) 117 | # Some of the available models may have the config in the 118 | # ptm dir, but the path given in the config directory was an 119 | # absolute path. 120 | if not os.path.exists(config_path): 121 | config_path = os.path.join(self.model_dir, 122 | os.path.split(config_path)[1]) 123 | logger.warning("Could not find config. 
Trying " + config_path) 124 | # In many cases, such as the publicly available English ptm, 125 | # the config is one of the default provided configs in 126 | # elmoformanylangs/configs 127 | if not os.path.exists(config_path): 128 | config_path = os.path.join(os.path.split(__file__)[0], "configs", 129 | os.path.split(config_path)[1]) 130 | logger.warning("Could not find config. Trying " + config_path) 131 | 132 | if not os.path.exists(config_path): 133 | raise FileNotFoundError("Could not find the ptm config in either the ptm directory " 134 | "or the default configs. Path in config file: %s" % args2.config_path) 135 | 136 | with open(config_path, 'r') as fin: 137 | config = json.load(fin) 138 | 139 | # For the ptm trained with character-based word encoder. 140 | if config['token_embedder']['char_dim'] > 0: 141 | self.char_lexicon = {} 142 | with codecs.open(os.path.join(self.model_dir, 'char.dic'), 'r', encoding='utf-8') as fpi: 143 | for line in fpi: 144 | tokens = line.strip().split('\t') 145 | if len(tokens) == 1: 146 | tokens.insert(0, '\u3000') 147 | token, i = tokens 148 | self.char_lexicon[token] = int(i) 149 | char_emb_layer = EmbeddingLayer( 150 | config['token_embedder']['char_dim'], self.char_lexicon, fix_emb=False, embs=None) 151 | logger.info('char embedding size: ' + 152 | str(len(char_emb_layer.word2id))) 153 | else: 154 | self.char_lexicon = None 155 | char_emb_layer = None 156 | 157 | # For the ptm trained with word form word encoder. 158 | if config['token_embedder']['word_dim'] > 0: 159 | self.word_lexicon = {} 160 | with codecs.open(os.path.join(self.model_dir, 'word.dic'), 'r', encoding='utf-8') as fpi: 161 | for line in fpi: 162 | tokens = line.strip().split('\t') 163 | if len(tokens) == 1: 164 | tokens.insert(0, '\u3000') 165 | token, i = tokens 166 | self.word_lexicon[token] = int(i) 167 | word_emb_layer = EmbeddingLayer( 168 | config['token_embedder']['word_dim'], self.word_lexicon, fix_emb=False, embs=None) 169 | logger.info('word embedding size: ' + 170 | str(len(word_emb_layer.word2id))) 171 | else: 172 | self.word_lexicon = None 173 | word_emb_layer = None 174 | 175 | # instantiate the ptm 176 | model = Model(config, word_emb_layer, char_emb_layer, self.use_cuda) 177 | 178 | if self.use_cuda: 179 | model.cuda() 180 | 181 | logger.info(str(model)) 182 | model.load_model(self.model_dir) 183 | 184 | # read test data according to input format 185 | 186 | # configure the ptm to evaluation mode. 187 | model.eval() 188 | return model, config 189 | 190 | def sents2elmo(self, sents, output_layer=-1): 191 | read_function = read_list 192 | 193 | if self.config['token_embedder']['name'].lower() == 'cnn': 194 | test, text = read_function(sents, self.config['token_embedder']['max_characters_per_token']) 195 | else: 196 | test, text = read_function(sents) 197 | 198 | # create test batches from the input data. 
199 | test_w, test_c, test_lens, test_masks, test_text, recover_ind = create_batches( 200 | test, self.batch_size, self.word_lexicon, self.char_lexicon, self.config, text=text) 201 | 202 | cnt = 0 203 | 204 | after_elmo = [] 205 | for w, c, lens, masks, texts in zip(test_w, test_c, test_lens, test_masks, test_text): 206 | output = self.model.forward(w, c, masks) 207 | for i, text in enumerate(texts): 208 | 209 | if self.config['encoder']['name'].lower() == 'lstm': 210 | data = output[i, 1:lens[i]-1, :].data 211 | if self.use_cuda: 212 | data = data.cpu() 213 | data = data.numpy() 214 | elif self.config['encoder']['name'].lower() == 'elmo': 215 | data = output[:, i, 1:lens[i]-1, :].data 216 | if self.use_cuda: 217 | data = data.cpu() 218 | data = data.numpy() 219 | 220 | if output_layer == -1: 221 | payload = np.average(data, axis=0) 222 | elif output_layer == -2: 223 | payload = data 224 | else: 225 | payload = data[output_layer] 226 | after_elmo.append(payload) 227 | 228 | cnt += 1 229 | if cnt % 1000 == 0: 230 | logger.info('Finished {0} sentences.'.format(cnt)) 231 | 232 | after_elmo = recover(after_elmo, recover_ind) 233 | return after_elmo 234 | -------------------------------------------------------------------------------- /elmoformanylangs/frontend.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import random 4 | import torch 5 | import torch.nn as nn 6 | import logging 7 | from torch.autograd import Variable 8 | from .modules.elmo import ElmobiLm 9 | from .modules.lstm import LstmbiLm 10 | from .modules.token_embedder import ConvTokenEmbedder, LstmTokenEmbedder 11 | 12 | logger = logging.getLogger('elmoformanylangs') 13 | 14 | def create_one_batch(x, word2id, char2id, config, oov='', pad='', sort=True): 15 | """ 16 | Create one batch of input. 17 | 18 | :param x: List[List[str]] 19 | :param word2id: Dict | None 20 | :param char2id: Dict | None 21 | :param config: Dict 22 | :param oov: str, the form of OOV token. 23 | :param pad: str, the form of padding token. 24 | :param sort: bool, specify whether sorting the sentences by their lengths. 
25 | :return: 26 | """ 27 | batch_size = len(x) 28 | # lst represents the order of sentences 29 | lst = list(range(batch_size)) 30 | if sort: 31 | lst.sort(key=lambda l: -len(x[l])) 32 | 33 | # shuffle the sentences by 34 | x = [x[i] for i in lst] 35 | lens = [len(x[i]) for i in lst] 36 | max_len = max(lens) 37 | 38 | # get a batch of word id whose size is (batch x max_len) 39 | if word2id is not None: 40 | oov_id, pad_id = word2id.get(oov, None), word2id.get(pad, None) 41 | assert oov_id is not None and pad_id is not None 42 | batch_w = torch.LongTensor(batch_size, max_len).fill_(pad_id) 43 | for i, x_i in enumerate(x): 44 | for j, x_ij in enumerate(x_i): 45 | batch_w[i][j] = word2id.get(x_ij, oov_id) 46 | else: 47 | batch_w = None 48 | 49 | # get a batch of character id whose size is (batch x max_len x max_chars) 50 | if char2id is not None: 51 | bow_id, eow_id, oov_id, pad_id = [char2id.get(key, None) for key in ('', '', oov, pad)] 52 | 53 | assert bow_id is not None and eow_id is not None and oov_id is not None and pad_id is not None 54 | 55 | if config['token_embedder']['name'].lower() == 'cnn': 56 | max_chars = config['token_embedder']['max_characters_per_token'] 57 | assert max([len(w) for i in lst for w in x[i]]) + 2 <= max_chars 58 | elif config['token_embedder']['name'].lower() == 'lstm': 59 | # counting the and 60 | max_chars = max([len(w) for i in lst for w in x[i]]) + 2 61 | else: 62 | raise ValueError('Unknown token_embedder: {0}'.format(config['token_embedder']['name'])) 63 | 64 | batch_c = torch.LongTensor(batch_size, max_len, max_chars).fill_(pad_id) 65 | 66 | for i, x_i in enumerate(x): 67 | for j, x_ij in enumerate(x_i): 68 | batch_c[i][j][0] = bow_id 69 | if x_ij == '' or x_ij == '': 70 | batch_c[i][j][1] = char2id.get(x_ij) 71 | batch_c[i][j][2] = eow_id 72 | else: 73 | for k, c in enumerate(x_ij): 74 | batch_c[i][j][k + 1] = char2id.get(c, oov_id) 75 | batch_c[i][j][len(x_ij) + 1] = eow_id 76 | else: 77 | batch_c = None 78 | 79 | # mask[0] is the matrix (batch x max_len) indicating whether 80 | # there is an id is valid (not a padding) in this batch. 
81 | # mask[1] stores the flattened ids indicating whether there is a valid 82 | # previous token 83 | # mask[2] stores the flattened ids indicating whether there is a valid 84 | # next token 85 | masks = [torch.LongTensor(batch_size, max_len).fill_(0), [], []] 86 | 87 | for i, x_i in enumerate(x): 88 | for j in range(len(x_i)): 89 | masks[0][i][j] = 1 90 | if j + 1 < len(x_i): 91 | masks[1].append(i * max_len + j) 92 | if j > 0: 93 | masks[2].append(i * max_len + j) 94 | 95 | assert len(masks[1]) <= batch_size * max_len 96 | assert len(masks[2]) <= batch_size * max_len 97 | 98 | masks[1] = torch.LongTensor(masks[1]) 99 | masks[2] = torch.LongTensor(masks[2]) 100 | 101 | return batch_w, batch_c, lens, masks 102 | 103 | 104 | # shuffle training examples and create mini-batches 105 | def create_batches(x, batch_size, word2id, char2id, config, perm=None, shuffle=True, sort=True, text=None): 106 | """ 107 | 108 | :param x: List[List[str]] 109 | :param batch_size: 110 | :param word2id: 111 | :param char2id: 112 | :param config: 113 | :param perm: 114 | :param shuffle: 115 | :param sort: 116 | :param text: 117 | :return: 118 | """ 119 | lst = perm or list(range(len(x))) 120 | if shuffle: 121 | random.shuffle(lst) 122 | 123 | if sort: 124 | lst.sort(key=lambda l: -len(x[l])) 125 | 126 | x = [x[i] for i in lst] 127 | if text is not None: 128 | text = [text[i] for i in lst] 129 | 130 | sum_len = 0.0 131 | batches_w, batches_c, batches_lens, batches_masks, batches_text = [], [], [], [], [] 132 | size = batch_size 133 | nbatch = (len(x) - 1) // size + 1 134 | for i in range(nbatch): 135 | start_id, end_id = i * size, (i + 1) * size 136 | bw, bc, blens, bmasks = create_one_batch(x[start_id: end_id], word2id, char2id, config, sort=sort) 137 | sum_len += sum(blens) 138 | batches_w.append(bw) 139 | batches_c.append(bc) 140 | batches_lens.append(blens) 141 | batches_masks.append(bmasks) 142 | if text is not None: 143 | batches_text.append(text[start_id: end_id]) 144 | 145 | if sort: 146 | perm = list(range(nbatch)) 147 | random.shuffle(perm) 148 | batches_w = [batches_w[i] for i in perm] 149 | batches_c = [batches_c[i] for i in perm] 150 | batches_lens = [batches_lens[i] for i in perm] 151 | batches_masks = [batches_masks[i] for i in perm] 152 | if text is not None: 153 | batches_text = [batches_text[i] for i in perm] 154 | 155 | logger.info("{} batches, avg len: {:.1f}".format(nbatch, sum_len / len(x))) 156 | if text is not None: 157 | return batches_w, batches_c, batches_lens, batches_masks, batches_text 158 | return batches_w, batches_c, batches_lens, batches_masks 159 | 160 | 161 | class Model(nn.Module): 162 | def __init__(self, config, word_emb_layer, char_emb_layer, use_cuda=False): 163 | super(Model, self).__init__() 164 | self.use_cuda = use_cuda 165 | self.config = config 166 | 167 | if config['token_embedder']['name'].lower() == 'cnn': 168 | self.token_embedder = ConvTokenEmbedder( 169 | config, word_emb_layer, char_emb_layer, use_cuda) 170 | elif config['token_embedder']['name'].lower() == 'lstm': 171 | self.token_embedder = LstmTokenEmbedder( 172 | config, word_emb_layer, char_emb_layer, use_cuda) 173 | 174 | if config['encoder']['name'].lower() == 'elmo': 175 | self.encoder = ElmobiLm(config, use_cuda) 176 | elif config['encoder']['name'].lower() == 'lstm': 177 | self.encoder = LstmbiLm(config, use_cuda) 178 | 179 | self.output_dim = config['encoder']['projection_dim'] 180 | 181 | def forward(self, word_inp, chars_package, mask_package): 182 | """ 183 | 184 | :param word_inp: 185 | :param 
chars_package: 186 | :param mask_package: 187 | :return: 188 | """ 189 | token_embedding = self.token_embedder(word_inp, chars_package, (mask_package[0].size(0), mask_package[0].size(1))) 190 | if self.config['encoder']['name'] == 'elmo': 191 | mask = Variable(mask_package[0]).cuda() if self.use_cuda else Variable(mask_package[0]) 192 | encoder_output = self.encoder(token_embedding, mask) 193 | sz = encoder_output.size() 194 | token_embedding = torch.cat( 195 | [token_embedding, token_embedding], dim=2).view(1, sz[1], sz[2], sz[3]) 196 | encoder_output = torch.cat( 197 | [token_embedding, encoder_output], dim=0) 198 | elif self.config['encoder']['name'] == 'lstm': 199 | encoder_output = self.encoder(token_embedding) 200 | else: 201 | raise ValueError('Unknown encoder: {0}'.format(self.config['encoder']['name'])) 202 | 203 | return encoder_output 204 | 205 | def load_model(self, path): 206 | self.token_embedder.load_state_dict(torch.load(os.path.join(path, 'token_embedder.pkl'), 207 | map_location=lambda storage, loc: storage)) 208 | self.encoder.load_state_dict(torch.load(os.path.join(path, 'encoder.pkl'), 209 | map_location=lambda storage, loc: storage)) 210 | -------------------------------------------------------------------------------- /elmoformanylangs/main.py: -------------------------------------------------------------------------------- 1 | from elmoformanylangs import Embedder 2 | import jieba 3 | 4 | sentence = '我爱自然语言处理' 5 | 6 | segment = list(jieba.cut(sentence)) 7 | print(segment) 8 | model = Embedder('../ptm/elmo') 9 | vec = model.sents2elmo([segment]) 10 | print(vec) 11 | -------------------------------------------------------------------------------- /elmoformanylangs/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/terrifyzhao/nlp_tutorial/fa5cfdf732972469bfce2c452c07bec2077ba407/elmoformanylangs/modules/__init__.py -------------------------------------------------------------------------------- /elmoformanylangs/modules/classify_layer.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | 7 | 8 | class SoftmaxLayer(nn.Module): 9 | """ Naive softmax-layer """ 10 | def __init__(self, output_dim, n_class): 11 | """ 12 | 13 | :param output_dim: int 14 | :param n_class: int 15 | """ 16 | super(SoftmaxLayer, self).__init__() 17 | self.hidden2tag = nn.Linear(output_dim, n_class) 18 | self.criterion = nn.CrossEntropyLoss(size_average=False) 19 | 20 | def forward(self, x, y): 21 | """ 22 | 23 | :param x: torch.Tensor 24 | :param y: torch.Tensor 25 | :return: 26 | """ 27 | tag_scores = self.hidden2tag(x) 28 | return self.criterion(tag_scores, y) 29 | 30 | 31 | class SampledSoftmaxLayer(nn.Module): 32 | """ 33 | 34 | """ 35 | def __init__(self, output_dim, n_class, n_samples, use_cuda): 36 | """ 37 | 38 | :param output_dim: 39 | :param n_class: 40 | :param n_samples: 41 | :param use_cuda: 42 | """ 43 | super(SampledSoftmaxLayer, self).__init__() 44 | self.n_samples = n_samples 45 | self.n_class = n_class 46 | self.use_cuda = use_cuda 47 | self.criterion = nn.CrossEntropyLoss(size_average=False) 48 | self.negative_samples = [] 49 | self.word_to_column = {0: 0} 50 | 51 | self.all_word = [] 52 | self.all_word_to_column = {0: 0} 53 | 54 | self.column_emb = nn.Embedding(n_class, output_dim) 55 | 
self.column_emb.weight.data.uniform_(-0.25, 0.25) 56 | 57 | self.column_bias = nn.Embedding(n_class, 1) 58 | self.column_bias.weight.data.uniform_(-0.25, 0.25) 59 | 60 | self.oov_column = nn.Parameter(torch.Tensor(output_dim, 1)) 61 | self.oov_column.data.uniform_(-0.25, 0.25) 62 | 63 | def forward(self, x, y): 64 | if self.training: 65 | for i in range(y.size(0)): 66 | y[i] = self.word_to_column.get(y[i].tolist()) 67 | samples = torch.LongTensor(len(self.word_to_column)).fill_(0) 68 | for word in self.negative_samples: 69 | samples[self.word_to_column[word]] = word 70 | else: 71 | for i in range(y.size(0)): 72 | y[i] = self.all_word_to_column.get(y[i].tolist(), 0) 73 | samples = torch.LongTensor(len(self.all_word_to_column)).fill_(0) 74 | for word in self.all_word: 75 | samples[self.all_word_to_column[word]] = word 76 | 77 | if self.use_cuda: 78 | samples = samples.cuda() 79 | 80 | tag_scores = (x.matmul(self.embedding_matrix)).view(y.size(0), -1) + \ 81 | (self.column_bias.forward(samples)).view(1, -1) 82 | return self.criterion(tag_scores, y) 83 | 84 | def update_embedding_matrix(self): 85 | word_inp, chars_inp = [], [] 86 | if self.training: 87 | columns = torch.LongTensor(len(self.negative_samples) + 1) 88 | samples = self.negative_samples 89 | for i, word in enumerate(samples): 90 | columns[self.word_to_column[word]] = word 91 | columns[0] = 0 92 | else: 93 | columns = torch.LongTensor(len(self.all_word) + 1) 94 | samples = self.all_word 95 | for i, word in enumerate(samples): 96 | columns[self.all_word_to_column[word]] = word 97 | columns[0] = 0 98 | 99 | if self.use_cuda: 100 | columns = columns.cuda() 101 | self.embedding_matrix = self.column_emb.forward(columns).transpose(0, 1) 102 | 103 | def update_negative_samples(self, word_inp, chars_inp, mask): 104 | batch_size, seq_len = word_inp.size(0), word_inp.size(1) 105 | in_batch = set() 106 | for i in range(batch_size): 107 | for j in range(seq_len): 108 | if mask[i][j] == 0: 109 | continue 110 | word = word_inp[i][j].tolist() 111 | in_batch.add(word) 112 | for i in range(batch_size): 113 | for j in range(seq_len): 114 | if mask[i][j] == 0: 115 | continue 116 | word = word_inp[i][j].tolist() 117 | if word not in self.all_word_to_column: 118 | self.all_word.append(word) 119 | self.all_word_to_column[word] = len(self.all_word_to_column) 120 | 121 | if word not in self.word_to_column: 122 | if len(self.negative_samples) < self.n_samples: 123 | self.negative_samples.append(word) 124 | self.word_to_column[word] = len(self.word_to_column) 125 | else: 126 | while self.negative_samples[0] in in_batch: 127 | self.negative_samples = self.negative_samples[1:] + [self.negative_samples[0]] 128 | self.word_to_column[word] = self.word_to_column.pop(self.negative_samples[0]) 129 | self.negative_samples = self.negative_samples[1:] + [word] 130 | 131 | 132 | class CNNSoftmaxLayer(nn.Module): 133 | def __init__(self, token_embedder, output_dim, n_class, n_samples, corr_dim, use_cuda): 134 | super(CNNSoftmaxLayer, self).__init__() 135 | self.token_embedder = token_embedder 136 | self.n_samples = n_samples 137 | self.use_cuda = use_cuda 138 | self.criterion = nn.CrossEntropyLoss(size_average=False) 139 | self.negative_samples = [] 140 | self.word_to_column = {0: 0} 141 | 142 | self.all_word = [] 143 | self.all_word_to_column = {0: 0} 144 | 145 | self.M = nn.Parameter(torch.Tensor(output_dim, corr_dim)) 146 | stdv = 1. 
/ math.sqrt(self.M.size(1)) 147 | self.M.data.uniform_(-stdv, stdv) 148 | 149 | self.corr = nn.Embedding(n_class, corr_dim) 150 | self.corr.weight.data.uniform_(-0.25, 0.25) 151 | 152 | self.oov_column = nn.Parameter(torch.Tensor(output_dim, 1)) 153 | self.oov_column.data.uniform_(-0.25, 0.25) 154 | 155 | def forward(self, x, y): 156 | if self.training: 157 | for i in range(y.size(0)): 158 | y[i] = self.word_to_column.get(y[i].tolist()) 159 | samples = torch.LongTensor(len(self.word_to_column)).fill_(0) 160 | for package in self.negative_samples: 161 | samples[self.word_to_column[package[0]]] = package[0] 162 | else: 163 | for i in range(y.size(0)): 164 | y[i] = self.all_word_to_column.get(y[i].tolist(), 0) 165 | samples = torch.LongTensor(len(self.all_word_to_column)).fill_(0) 166 | for package in self.all_word: 167 | samples[self.all_word_to_column[package[0]]] = package[0] 168 | 169 | if self.use_cuda: 170 | samples = samples.cuda() 171 | 172 | tag_scores = (x.matmul(self.embedding_matrix)).view(y.size(0), -1) + \ 173 | (x.matmul(self.M).matmul(self.corr.forward(samples).transpose(0, 1))).view(y.size(0), -1) 174 | return self.criterion(tag_scores, y) 175 | 176 | def update_embedding_matrix(self): 177 | batch_size = 2048 178 | word_inp, chars_inp = [], [] 179 | if self.training: 180 | sub_matrices = [self.oov_column] 181 | samples = self.negative_samples 182 | id2pack = {} 183 | for i, package in enumerate(samples): 184 | id2pack[self.word_to_column[package[0]]] = i 185 | else: 186 | sub_matrices = [self.oov_column] 187 | samples = self.all_word 188 | id2pack = {} 189 | for i, package in enumerate(samples): 190 | id2pack[self.all_word_to_column[package[0]]] = i 191 | 192 | for i in range(len(samples)): 193 | # [n_samples, 1], [n_samples, 1, x], [n_samples, 1] 194 | word_inp.append(samples[id2pack[i + 1]][0]) 195 | chars_inp.append(samples[id2pack[i + 1]][1]) 196 | if len(word_inp) == batch_size or i == len(samples) - 1: 197 | sub_matrices.append(self.token_embedder.forward(torch.LongTensor(word_inp).view(len(word_inp), 1), 198 | None if chars_inp[0] is None else torch.LongTensor(chars_inp).view(len(word_inp), 1, len(package[1])), 199 | (len(word_inp), 1)).squeeze(1).transpose(0, 1)) 200 | if not self.training: 201 | sub_matrices[-1] = sub_matrices[-1].detach() 202 | word_inp, chars_inp = [], [] 203 | 204 | sum = 0 205 | for mat in sub_matrices: 206 | sum += mat.size(1) 207 | #print(sum, len(self.word_to_column)) 208 | self.embedding_matrix = torch.cat(sub_matrices, dim=1) 209 | 210 | def update_negative_samples(self, word_inp, chars_inp, mask): 211 | batch_size, seq_len = word_inp.size(0), word_inp.size(1) 212 | in_batch = set() 213 | for i in range(batch_size): 214 | for j in range(seq_len): 215 | if mask[i][j] == 0: 216 | continue 217 | word = word_inp[i][j].tolist() 218 | in_batch.add(word) 219 | for i in range(batch_size): 220 | for j in range(seq_len): 221 | if mask[i][j] == 0: 222 | continue 223 | package = (word_inp[i][j].tolist(), None if chars_inp is None else chars_inp[i][j].tolist()) 224 | if package[0] not in self.all_word_to_column: 225 | self.all_word.append(package) 226 | self.all_word_to_column[package[0]] = len(self.all_word_to_column) 227 | 228 | if package[0] not in self.word_to_column: 229 | if len(self.negative_samples) < self.n_samples: 230 | self.negative_samples.append(package) 231 | self.word_to_column[package[0]] = len(self.word_to_column) 232 | else: 233 | while self.negative_samples[0][0] in in_batch: 234 | self.negative_samples = self.negative_samples[1:] + 
[self.negative_samples[0]] 235 | self.word_to_column[package[0]] = self.word_to_column.pop(self.negative_samples[0][0]) 236 | self.negative_samples = self.negative_samples[1:] + [package] 237 | -------------------------------------------------------------------------------- /elmoformanylangs/modules/elmo.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple, List, Callable, Union 2 | 3 | import h5py 4 | import numpy 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.nn.utils.rnn import PackedSequence, pad_packed_sequence, pack_padded_sequence 9 | from torch.autograd import Variable 10 | 11 | from .encoder_base import _EncoderBase 12 | from .lstm_cell_with_projection import LstmCellWithProjection 13 | 14 | RnnState = Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]] # pylint: disable=invalid-name 15 | RnnStateStorage = Tuple[torch.Tensor, ...] # pylint: disable=invalid-name 16 | 17 | 18 | class ElmobiLm(_EncoderBase): 19 | def __init__(self, config, use_cuda=False): 20 | super(ElmobiLm, self).__init__(stateful=True) 21 | self.config = config 22 | self.use_cuda = use_cuda 23 | input_size = config['encoder']['projection_dim'] 24 | hidden_size = config['encoder']['projection_dim'] 25 | cell_size = config['encoder']['dim'] 26 | num_layers = config['encoder']['n_layers'] 27 | memory_cell_clip_value = config['encoder']['cell_clip'] 28 | state_projection_clip_value = config['encoder']['proj_clip'] 29 | recurrent_dropout_probability = config['dropout'] 30 | 31 | self.input_size = input_size 32 | self.hidden_size = hidden_size 33 | self.num_layers = num_layers 34 | self.cell_size = cell_size 35 | 36 | forward_layers = [] 37 | backward_layers = [] 38 | 39 | lstm_input_size = input_size 40 | go_forward = True 41 | for layer_index in range(num_layers): 42 | forward_layer = LstmCellWithProjection(lstm_input_size, 43 | hidden_size, 44 | cell_size, 45 | go_forward, 46 | recurrent_dropout_probability, 47 | memory_cell_clip_value, 48 | state_projection_clip_value) 49 | backward_layer = LstmCellWithProjection(lstm_input_size, 50 | hidden_size, 51 | cell_size, 52 | not go_forward, 53 | recurrent_dropout_probability, 54 | memory_cell_clip_value, 55 | state_projection_clip_value) 56 | lstm_input_size = hidden_size 57 | 58 | self.add_module('forward_layer_{}'.format(layer_index), forward_layer) 59 | self.add_module('backward_layer_{}'.format(layer_index), backward_layer) 60 | forward_layers.append(forward_layer) 61 | backward_layers.append(backward_layer) 62 | self.forward_layers = forward_layers 63 | self.backward_layers = backward_layers 64 | 65 | def forward(self, inputs, mask): 66 | batch_size, total_sequence_length = mask.size() 67 | stacked_sequence_output, final_states, restoration_indices = \ 68 | self.sort_and_run_forward(self._lstm_forward, inputs, mask) 69 | 70 | num_layers, num_valid, returned_timesteps, encoder_dim = stacked_sequence_output.size() 71 | # Add back invalid rows which were removed in the call to sort_and_run_forward. 72 | if num_valid < batch_size: 73 | zeros = stacked_sequence_output.data.new(num_layers, 74 | batch_size - num_valid, 75 | returned_timesteps, 76 | encoder_dim).fill_(0) 77 | zeros = Variable(zeros) 78 | stacked_sequence_output = torch.cat([stacked_sequence_output, zeros], 1) 79 | 80 | # The states also need to have invalid rows added back. 
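        # ("Invalid" rows are sentences whose mask is entirely padding; sort_and_run_forward
        # drops them before running the stacked LSTM, so rows of zeros are appended here to
        # bring the per-layer states back to the full batch size before they are cached via
        # _update_states and the output is reordered with restoration_indices below.)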
81 | new_states = [] 82 | for state in final_states: 83 | state_dim = state.size(-1) 84 | zeros = state.data.new(num_layers, batch_size - num_valid, state_dim).fill_(0) 85 | zeros = Variable(zeros) 86 | new_states.append(torch.cat([state, zeros], 1)) 87 | final_states = new_states 88 | 89 | # It's possible to need to pass sequences which are padded to longer than the 90 | # max length of the sequence to a Seq2StackEncoder. However, packing and unpacking 91 | # the sequences mean that the returned tensor won't include these dimensions, because 92 | # the RNN did not need to process them. We add them back on in the form of zeros here. 93 | sequence_length_difference = total_sequence_length - returned_timesteps 94 | if sequence_length_difference > 0: 95 | zeros = stacked_sequence_output.data.new(num_layers, 96 | batch_size, 97 | sequence_length_difference, 98 | stacked_sequence_output[0].size(-1)).fill_(0) 99 | zeros = Variable(zeros) 100 | stacked_sequence_output = torch.cat([stacked_sequence_output, zeros], 2) 101 | 102 | self._update_states(final_states, restoration_indices) 103 | 104 | # Restore the original indices and return the sequence. 105 | # Has shape (num_layers, batch_size, sequence_length, hidden_size) 106 | return stacked_sequence_output.index_select(1, restoration_indices) 107 | 108 | 109 | def _lstm_forward(self, 110 | inputs: PackedSequence, 111 | initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None) -> \ 112 | Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: 113 | """ 114 | Parameters 115 | ---------- 116 | inputs : ``PackedSequence``, required. 117 | A batch first ``PackedSequence`` to run the stacked LSTM over. 118 | initial_state : ``Tuple[torch.Tensor, torch.Tensor]``, optional, (default = None) 119 | A tuple (state, memory) representing the initial hidden state and memory 120 | of the LSTM, with shape (num_layers, batch_size, 2 * hidden_size) and 121 | (num_layers, batch_size, 2 * cell_size) respectively. 122 | Returns 123 | ------- 124 | output_sequence : ``torch.FloatTensor`` 125 | The encoded sequence of shape (num_layers, batch_size, sequence_length, hidden_size) 126 | final_states: ``Tuple[torch.FloatTensor, torch.FloatTensor]`` 127 | The per-layer final (state, memory) states of the LSTM, with shape 128 | (num_layers, batch_size, 2 * hidden_size) and (num_layers, batch_size, 2 * cell_size) 129 | respectively. The last dimension is duplicated because it contains the state/memory 130 | for both the forward and backward layers. 
131 | """ 132 | 133 | if initial_state is None: 134 | hidden_states: List[Optional[Tuple[torch.Tensor, 135 | torch.Tensor]]] = [None] * len(self.forward_layers) 136 | elif initial_state[0].size()[0] != len(self.forward_layers): 137 | raise Exception("Initial states were passed to forward() but the number of " 138 | "initial states does not match the number of layers.") 139 | else: 140 | hidden_states = list(zip(initial_state[0].split(1, 0), initial_state[1].split(1, 0))) 141 | 142 | inputs, batch_lengths = pad_packed_sequence(inputs, batch_first=True) 143 | forward_output_sequence = inputs 144 | backward_output_sequence = inputs 145 | 146 | final_states = [] 147 | sequence_outputs = [] 148 | for layer_index, state in enumerate(hidden_states): 149 | forward_layer = getattr(self, 'forward_layer_{}'.format(layer_index)) 150 | backward_layer = getattr(self, 'backward_layer_{}'.format(layer_index)) 151 | 152 | forward_cache = forward_output_sequence 153 | backward_cache = backward_output_sequence 154 | 155 | if state is not None: 156 | forward_hidden_state, backward_hidden_state = state[0].split(self.hidden_size, 2) 157 | forward_memory_state, backward_memory_state = state[1].split(self.cell_size, 2) 158 | forward_state = (forward_hidden_state, forward_memory_state) 159 | backward_state = (backward_hidden_state, backward_memory_state) 160 | else: 161 | forward_state = None 162 | backward_state = None 163 | 164 | forward_output_sequence, forward_state = forward_layer(forward_output_sequence, 165 | batch_lengths, 166 | forward_state) 167 | backward_output_sequence, backward_state = backward_layer(backward_output_sequence, 168 | batch_lengths, 169 | backward_state) 170 | # Skip connections, just adding the input to the output. 171 | if layer_index != 0: 172 | forward_output_sequence += forward_cache 173 | backward_output_sequence += backward_cache 174 | 175 | sequence_outputs.append(torch.cat([forward_output_sequence, 176 | backward_output_sequence], -1)) 177 | # Append the state tuples in a list, so that we can return 178 | # the final states for all the layers. 179 | final_states.append((torch.cat([forward_state[0], backward_state[0]], -1), 180 | torch.cat([forward_state[1], backward_state[1]], -1))) 181 | 182 | stacked_sequence_outputs: torch.FloatTensor = torch.stack(sequence_outputs) 183 | # Stack the hidden state and memory for each layer into 2 tensors of shape 184 | # (num_layers, batch_size, hidden_size) and (num_layers, batch_size, cell_size) 185 | # respectively. 
186 | final_hidden_states, final_memory_states = zip(*final_states) 187 | final_state_tuple: Tuple[torch.FloatTensor, 188 | torch.FloatTensor] = (torch.cat(final_hidden_states, 0), 189 | torch.cat(final_memory_states, 0)) 190 | return stacked_sequence_outputs, final_state_tuple 191 | -------------------------------------------------------------------------------- /elmoformanylangs/modules/embedding_layer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import logging 6 | 7 | logger = logging.getLogger('elmoformanylangs') 8 | 9 | 10 | class EmbeddingLayer(nn.Module): 11 | def __init__(self, n_d, word2id, embs=None, fix_emb=True, oov='', pad='', normalize=True): 12 | super(EmbeddingLayer, self).__init__() 13 | if embs is not None: 14 | embwords, embvecs = embs 15 | # for word in embwords: 16 | # assert word not in word2id, "Duplicate words in pre-trained embeddings" 17 | # word2id[word] = len(word2id) 18 | 19 | logger.info("{} pre-trained word embeddings loaded.".format(len(word2id))) 20 | if n_d != len(embvecs[0]): 21 | logger.warning("[WARNING] n_d ({}) != word vector size ({}). Use {} for embeddings.".format( 22 | n_d, len(embvecs[0]), len(embvecs[0]))) 23 | n_d = len(embvecs[0]) 24 | 25 | self.word2id = word2id 26 | self.id2word = {i: word for word, i in word2id.items()} 27 | self.n_V, self.n_d = len(word2id), n_d 28 | self.oovid = word2id[oov] 29 | self.padid = word2id[pad] 30 | self.embedding = nn.Embedding(self.n_V, n_d, padding_idx=self.padid) 31 | self.embedding.weight.data.uniform_(-0.25, 0.25) 32 | 33 | if embs is not None: 34 | weight = self.embedding.weight 35 | weight.data[:len(embwords)].copy_(torch.from_numpy(embvecs)) 36 | logger.info("embedding shape: {}".format(weight.size())) 37 | 38 | if normalize: 39 | weight = self.embedding.weight 40 | norms = weight.data.norm(2, 1) 41 | if norms.dim() == 1: 42 | norms = norms.unsqueeze(1) 43 | weight.data.div_(norms.expand_as(weight.data)) 44 | 45 | if fix_emb: 46 | self.embedding.weight.requires_grad = False 47 | 48 | def forward(self, input_): 49 | return self.embedding(input_) 50 | -------------------------------------------------------------------------------- /elmoformanylangs/modules/highway.py: -------------------------------------------------------------------------------- 1 | """ 2 | A `Highway layer `_ that does a gated combination of a linear 3 | transformation and a non-linear transformation of its input. 4 | """ 5 | 6 | from typing import Callable 7 | 8 | import torch 9 | from overrides import overrides 10 | 11 | 12 | class Highway(torch.nn.Module): 13 | """ 14 | A `Highway layer `_ does a gated combination of a linear 15 | transformation and a non-linear transformation of its input. :math:`y = g * x + (1 - g) * 16 | f(A(x))`, where :math:`A` is a linear transformation, :math:`f` is an element-wise 17 | non-linearity, and :math:`g` is an element-wise gate, computed as :math:`sigmoid(B(x))`. 18 | This module will apply a fixed number of highway layers to its input, returning the final 19 | result. 20 | Parameters 21 | ---------- 22 | input_dim : ``int`` 23 | The dimensionality of :math:`x`. We assume the input has shape ``(batch_size, 24 | input_dim)``. 25 | num_layers : ``int``, optional (default=``1``) 26 | The number of highway layers to apply to the input. 
27 | activation : ``Callable[[torch.Tensor], torch.Tensor]``, optional (default=``torch.nn.functional.relu``) 28 | The non-linearity to use in the highway layers. 29 | """ 30 | def __init__(self, 31 | input_dim: int, 32 | num_layers: int = 1, 33 | activation: Callable[[torch.Tensor], torch.Tensor] = torch.nn.functional.relu) -> None: 34 | super(Highway, self).__init__() 35 | self._input_dim = input_dim 36 | self._layers = torch.nn.ModuleList([torch.nn.Linear(input_dim, input_dim * 2) 37 | for _ in range(num_layers)]) 38 | self._activation = activation 39 | for layer in self._layers: 40 | # We should bias the highway layer to just carry its input forward. We do that by 41 | # setting the bias on `B(x)` to be positive, because that means `g` will be biased to 42 | # be high, to we will carry the input forward. The bias on `B(x)` is the second half 43 | # of the bias vector in each Linear layer. 44 | layer.bias[input_dim:].data.fill_(1) 45 | 46 | @overrides 47 | def forward(self, inputs: torch.Tensor) -> torch.Tensor: # pylint: disable=arguments-differ 48 | current_input = inputs 49 | for layer in self._layers: 50 | projected_input = layer(current_input) 51 | linear_part = current_input 52 | # NOTE: if you modify this, think about whether you should modify the initialization 53 | # above, too. 54 | nonlinear_part = projected_input[:, (0 * self._input_dim):(1 * self._input_dim)] 55 | gate = projected_input[:, (1 * self._input_dim):(2 * self._input_dim)] 56 | nonlinear_part = self._activation(nonlinear_part) 57 | gate = torch.sigmoid(gate) 58 | current_input = gate * linear_part + (1 - gate) * nonlinear_part 59 | return current_input 60 | -------------------------------------------------------------------------------- /elmoformanylangs/modules/lstm.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import unicode_literals 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | import copy 8 | 9 | 10 | class LstmbiLm(nn.Module): 11 | def __init__(self, config, use_cuda=False): 12 | super(LstmbiLm, self).__init__() 13 | self.config = config 14 | self.use_cuda = use_cuda 15 | 16 | self.encoder = nn.LSTM(self.config['encoder']['projection_dim'], 17 | self.config['encoder']['dim'], 18 | num_layers=self.config['encoder']['n_layers'], 19 | bidirectional=True, 20 | batch_first=True, 21 | dropout=self.config['dropout']) 22 | self.projection = nn.Linear(self.config['encoder']['dim'], self.config['encoder']['projection_dim'], bias=True) 23 | 24 | def forward(self, inputs): 25 | forward, backward = self.encoder(inputs)[0].split(self.config['encoder']['dim'], 2) 26 | return torch.cat([self.projection(forward), self.projection(backward)], dim=2) 27 | -------------------------------------------------------------------------------- /elmoformanylangs/modules/token_embedder.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import unicode_literals 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | import copy 8 | from .highway import Highway 9 | 10 | 11 | class LstmTokenEmbedder(nn.Module): 12 | def __init__(self, config, word_emb_layer, char_emb_layer, use_cuda=False): 13 | super(LstmTokenEmbedder, self).__init__() 14 | self.config = config 15 | self.use_cuda = use_cuda 16 | 
self.word_emb_layer = word_emb_layer 17 | self.char_emb_layer = char_emb_layer 18 | self.output_dim = config['encoder']['projection_dim'] 19 | emb_dim = 0 20 | if word_emb_layer is not None: 21 | emb_dim += word_emb_layer.n_d 22 | 23 | if char_emb_layer is not None: 24 | emb_dim += char_emb_layer.n_d * 2 25 | self.char_lstm = nn.LSTM(char_emb_layer.n_d, char_emb_layer.n_d, num_layers=1, bidirectional=True, 26 | batch_first=True, dropout=config['dropout']) 27 | 28 | self.projection = nn.Linear(emb_dim, self.output_dim, bias=True) 29 | 30 | def forward(self, word_inp, chars_inp, shape): 31 | embs = [] 32 | batch_size, seq_len = shape 33 | if self.word_emb_layer is not None: 34 | word_emb = self.word_emb_layer(Variable(word_inp).cuda() if self.use_cuda else Variable(word_inp)) 35 | embs.append(word_emb) 36 | 37 | if self.char_emb_layer is not None: 38 | chars_inp = chars_inp.view(batch_size * seq_len, -1) 39 | chars_emb = self.char_emb_layer(Variable(chars_inp).cuda() if self.use_cuda else Variable(chars_inp)) 40 | _, (chars_outputs, __) = self.char_lstm(chars_emb) 41 | chars_outputs = chars_outputs.contiguous().view(-1, self.config['token_embedder']['char_dim'] * 2) 42 | embs.append(chars_outputs) 43 | 44 | token_embedding = torch.cat(embs, dim=2) 45 | 46 | return self.projection(token_embedding) 47 | 48 | 49 | class ConvTokenEmbedder(nn.Module): 50 | def __init__(self, config, word_emb_layer, char_emb_layer, use_cuda): 51 | super(ConvTokenEmbedder, self).__init__() 52 | self.config = config 53 | self.use_cuda = use_cuda 54 | 55 | self.word_emb_layer = word_emb_layer 56 | self.char_emb_layer = char_emb_layer 57 | 58 | self.output_dim = config['encoder']['projection_dim'] 59 | self.emb_dim = 0 60 | if word_emb_layer is not None: 61 | self.emb_dim += word_emb_layer.n_d 62 | 63 | if char_emb_layer is not None: 64 | self.convolutions = [] 65 | cnn_config = config['token_embedder'] 66 | filters = cnn_config['filters'] 67 | char_embed_dim = cnn_config['char_dim'] 68 | 69 | for i, (width, num) in enumerate(filters): 70 | conv = torch.nn.Conv1d( 71 | in_channels=char_embed_dim, 72 | out_channels=num, 73 | kernel_size=width, 74 | bias=True 75 | ) 76 | self.convolutions.append(conv) 77 | 78 | self.convolutions = nn.ModuleList(self.convolutions) 79 | 80 | self.n_filters = sum(f[1] for f in filters) 81 | self.n_highway = cnn_config['n_highway'] 82 | 83 | self.highways = Highway(self.n_filters, self.n_highway, activation=torch.nn.functional.relu) 84 | self.emb_dim += self.n_filters 85 | 86 | self.projection = nn.Linear(self.emb_dim, self.output_dim, bias=True) 87 | 88 | def forward(self, word_inp, chars_inp, shape): 89 | embs = [] 90 | batch_size, seq_len = shape 91 | if self.word_emb_layer is not None: 92 | batch_size, seq_len = word_inp.size(0), word_inp.size(1) 93 | word_emb = self.word_emb_layer(Variable(word_inp).cuda() if self.use_cuda else Variable(word_inp)) 94 | embs.append(word_emb) 95 | 96 | if self.char_emb_layer is not None: 97 | chars_inp = chars_inp.view(batch_size * seq_len, -1) 98 | 99 | character_embedding = self.char_emb_layer(Variable(chars_inp).cuda() if self.use_cuda else Variable(chars_inp)) 100 | 101 | character_embedding = torch.transpose(character_embedding, 1, 2) 102 | 103 | cnn_config = self.config['token_embedder'] 104 | if cnn_config['activation'] == 'tanh': 105 | activation = torch.nn.functional.tanh 106 | elif cnn_config['activation'] == 'relu': 107 | activation = torch.nn.functional.relu 108 | else: 109 | raise Exception("Unknown activation") 110 | 111 | convs = [] 112 | 
for i in range(len(self.convolutions)): 113 | convolved = self.convolutions[i](character_embedding) 114 | # (batch_size * sequence_length, n_filters for this width) 115 | convolved, _ = torch.max(convolved, dim=-1) 116 | convolved = activation(convolved) 117 | convs.append(convolved) 118 | char_emb = torch.cat(convs, dim=-1) 119 | char_emb = self.highways(char_emb) 120 | 121 | embs.append(char_emb.view(batch_size, -1, self.n_filters)) 122 | 123 | token_embedding = torch.cat(embs, dim=2) 124 | 125 | return self.projection(token_embedding) 126 | -------------------------------------------------------------------------------- /elmoformanylangs/modules/util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Assorted utilities for working with neural networks in AllenNLP. 3 | """ 4 | from collections import defaultdict 5 | from typing import Dict, List, Optional, Any, Tuple, Callable 6 | import itertools 7 | import math 8 | import torch 9 | from torch.autograd import Variable 10 | 11 | def get_lengths_from_binary_sequence_mask(mask: torch.Tensor): 12 | """ 13 | Compute sequence lengths for each batch element in a tensor using a 14 | binary mask. 15 | Parameters 16 | ---------- 17 | mask : torch.Tensor, required. 18 | A 2D binary mask of shape (batch_size, sequence_length) to 19 | calculate the per-batch sequence lengths from. 20 | Returns 21 | ------- 22 | A torch.LongTensor of shape (batch_size,) representing the lengths 23 | of the sequences in the batch. 24 | """ 25 | return mask.long().sum(-1) 26 | 27 | 28 | def sort_batch_by_length(tensor: torch.autograd.Variable, 29 | sequence_lengths: torch.autograd.Variable): 30 | """ 31 | Sort a batch first tensor by some specified lengths. 32 | Parameters 33 | ---------- 34 | tensor : Variable(torch.FloatTensor), required. 35 | A batch first Pytorch tensor. 36 | sequence_lengths : Variable(torch.LongTensor), required. 37 | A tensor representing the lengths of some dimension of the tensor which 38 | we want to sort by. 39 | Returns 40 | ------- 41 | sorted_tensor : Variable(torch.FloatTensor) 42 | The original tensor sorted along the batch dimension with respect to sequence_lengths. 43 | sorted_sequence_lengths : Variable(torch.LongTensor) 44 | The original sequence_lengths sorted by decreasing size. 45 | restoration_indices : Variable(torch.LongTensor) 46 | Indices into the sorted_tensor such that 47 | ``sorted_tensor.index_select(0, restoration_indices) == original_tensor`` 48 | permuation_index : Variable(torch.LongTensor) 49 | The indices used to sort the tensor. This is useful if you want to sort many 50 | tensors using the same ordering. 51 | """ 52 | 53 | if not isinstance(tensor, Variable) or not isinstance(sequence_lengths, Variable): 54 | raise Exception("Both the tensor and sequence lengths must be torch.autograd.Variables.") 55 | 56 | sorted_sequence_lengths, permutation_index = sequence_lengths.sort(0, descending=True) 57 | sorted_tensor = tensor.index_select(0, permutation_index) 58 | 59 | # This is ugly, but required - we are creating a new variable at runtime, so we 60 | # must ensure it has the correct CUDA vs non-CUDA type. We do this by cloning and 61 | # refilling one of the inputs to the function. 62 | index_range = sequence_lengths.data.clone().copy_(torch.arange(0, len(sequence_lengths))) 63 | # This is the equivalent of zipping with index, sorting by the original 64 | # sequence lengths and returning the now sorted indices. 
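# Worked example (illustrative values): for sequence_lengths = [3, 7, 5] the descending
# sort gives permutation_index = [1, 2, 0] and sorted_tensor = tensor[[1, 2, 0]]; sorting
# permutation_index back yields reverse_mapping = [2, 0, 1], so restoration_indices
# becomes [2, 0, 1] and sorted_tensor.index_select(0, restoration_indices) recovers the
# original batch order.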
65 | index_range = Variable(index_range.long()) 66 | _, reverse_mapping = permutation_index.sort(0, descending=False) 67 | restoration_indices = index_range.index_select(0, reverse_mapping) 68 | return sorted_tensor, sorted_sequence_lengths, restoration_indices, permutation_index 69 | 70 | 71 | def get_final_encoder_states(encoder_outputs: torch.Tensor, 72 | mask: torch.Tensor, 73 | bidirectional: bool = False) -> torch.Tensor: 74 | """ 75 | Given the output from a ``Seq2SeqEncoder``, with shape ``(batch_size, sequence_length, 76 | encoding_dim)``, this method returns the final hidden state for each element of the batch, 77 | giving a tensor of shape ``(batch_size, encoding_dim)``. This is not as simple as 78 | ``encoder_outputs[:, -1]``, because the sequences could have different lengths. We use the 79 | mask (which has shape ``(batch_size, sequence_length)``) to find the final state for each batch 80 | instance. 81 | Additionally, if ``bidirectional`` is ``True``, we will split the final dimension of the 82 | ``encoder_outputs`` into two and assume that the first half is for the forward direction of the 83 | encoder and the second half is for the backward direction. We will concatenate the last state 84 | for each encoder dimension, giving ``encoder_outputs[:, -1, :encoding_dim/2]`` concated with 85 | ``encoder_outputs[:, 0, encoding_dim/2:]``. 86 | """ 87 | # These are the indices of the last words in the sequences (i.e. length sans padding - 1). We 88 | # are assuming sequences are right padded. 89 | # Shape: (batch_size,) 90 | last_word_indices = mask.sum(1).long() - 1 91 | batch_size, _, encoder_output_dim = encoder_outputs.size() 92 | expanded_indices = last_word_indices.view(-1, 1, 1).expand(batch_size, 1, encoder_output_dim) 93 | # Shape: (batch_size, 1, encoder_output_dim) 94 | final_encoder_output = encoder_outputs.gather(1, expanded_indices) 95 | final_encoder_output = final_encoder_output.squeeze(1) # (batch_size, encoder_output_dim) 96 | if bidirectional: 97 | final_forward_output = final_encoder_output[:, :(encoder_output_dim // 2)] 98 | final_backward_output = encoder_outputs[:, 0, (encoder_output_dim // 2):] 99 | final_encoder_output = torch.cat([final_forward_output, final_backward_output], dim=-1) 100 | return final_encoder_output 101 | 102 | 103 | def get_dropout_mask(dropout_probability: float, tensor_for_masking: torch.autograd.Variable): 104 | """ 105 | Computes and returns an element-wise dropout mask for a given tensor, where 106 | each element in the mask is dropped out with probability dropout_probability. 107 | Note that the mask is NOT applied to the tensor - the tensor is passed to retain 108 | the correct CUDA tensor type for the mask. 109 | Parameters 110 | ---------- 111 | dropout_probability : float, required. 112 | Probability of dropping a dimension of the input. 113 | tensor_for_masking : torch.Variable, required. 114 | Returns 115 | ------- 116 | A torch.FloatTensor consisting of the binary mask scaled by 1/ (1 - dropout_probability). 117 | This scaling ensures expected values and variances of the output of applying this mask 118 | and the original tensor are the same. 119 | """ 120 | binary_mask = tensor_for_masking.clone() 121 | binary_mask.data.copy_(torch.rand(tensor_for_masking.size()) > dropout_probability) 122 | # Scale mask by 1/keep_prob to preserve output statistics. 
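# For example, with dropout_probability = 0.25 about 75% of the entries survive and each
# surviving entry is scaled by 1 / (1 - 0.25), so the expected value of
# dropout_mask * tensor equals tensor (the usual "inverted dropout" convention).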
123 | dropout_mask = binary_mask.float().div(1.0 - dropout_probability) 124 | return dropout_mask 125 | 126 | def block_orthogonal(tensor: torch.Tensor, 127 | split_sizes: List[int], 128 | gain: float = 1.0) -> None: 129 | """ 130 | An initializer which allows initializing model parameters in "blocks". This is helpful 131 | in the case of recurrent models which use multiple gates applied to linear projections, 132 | which can be computed efficiently if they are concatenated together. However, they are 133 | separate parameters which should be initialized independently. 134 | Parameters 135 | ---------- 136 | tensor : ``torch.Tensor``, required. 137 | A tensor to initialize. 138 | split_sizes : List[int], required. 139 | A list of length ``tensor.ndim()`` specifying the size of the 140 | blocks along that particular dimension. E.g. ``[10, 20]`` would 141 | result in the tensor being split into chunks of size 10 along the 142 | first dimension and 20 along the second. 143 | gain : float, optional (default = 1.0) 144 | The gain (scaling) applied to the orthogonal initialization. 145 | """ 146 | 147 | if isinstance(tensor, Variable): 148 | # in pytorch 4.0, Variable equals Tensor 149 | # block_orthogonal(tensor.data, split_sizes, gain) 150 | #else: 151 | sizes = list(tensor.size()) 152 | if any([a % b != 0 for a, b in zip(sizes, split_sizes)]): 153 | # ValueError here: the AllenNLP ConfigurationError class is not available in this module. 154 | raise ValueError("tensor dimensions must be divisible by their respective " 155 | "split_sizes. Found size: {} and split_sizes: {}".format(sizes, split_sizes)) 156 | indexes = [list(range(0, max_size, split)) 157 | for max_size, split in zip(sizes, split_sizes)] 158 | # Iterate over all possible blocks within the tensor. 159 | for block_start_indices in itertools.product(*indexes): 160 | # A list of tuples containing the index to start at for this block 161 | # and the appropriate step size (i.e. split_size[i] for dimension i). 162 | index_and_step_tuples = zip(block_start_indices, split_sizes) 163 | # This is a tuple of slices corresponding to: 164 | # tensor[index: index + step_size, ...]. This is 165 | # required because we could have an arbitrary number 166 | # of dimensions. The actual slices we need are the 167 | # start_index: start_index + step for each dimension in the tensor.
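# Worked example (illustrative shapes): for sizes = [10, 20] and split_sizes = [5, 10],
# indexes = [[0, 5], [0, 10]] and itertools.product yields the block origins (0, 0),
# (0, 10), (5, 0) and (5, 10); each origin paired with its step becomes the slice
# start:start + step, i.e. one of the four 5 x 10 sub-blocks, and every sub-block is
# orthogonally initialized independently.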
167 | block_slice = tuple([slice(start_index, start_index + step) 168 | for start_index, step in index_and_step_tuples]) 169 | tensor[block_slice] = torch.nn.init.orthogonal_(tensor[block_slice].contiguous(), gain=gain) 170 | -------------------------------------------------------------------------------- /elmoformanylangs/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import unicode_literals 3 | import collections 4 | import itertools 5 | 6 | 7 | def flatten(lst): 8 | return list(itertools.chain.from_iterable(lst)) 9 | 10 | 11 | def deep_iter(x): 12 | if isinstance(x, list) or isinstance(x, tuple): 13 | for u in x: 14 | for v in deep_iter(u): 15 | yield v 16 | else: 17 | yield x 18 | 19 | 20 | def dict2namedtuple(dic): 21 | return collections.namedtuple('Namespace', dic.keys())(**dic) 22 | -------------------------------------------------------------------------------- /gpt/chat.py: -------------------------------------------------------------------------------- 1 | from gpt.chitchat.interact import chitchat 2 | 3 | while 1: 4 | text = input('text:') 5 | r = chitchat(text) 6 | print(r) 7 | -------------------------------------------------------------------------------- /gpt/chitchat/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/terrifyzhao/nlp_tutorial/fa5cfdf732972469bfce2c452c07bec2077ba407/gpt/chitchat/__init__.py -------------------------------------------------------------------------------- /gpt/chitchat/config/model_config_dialogue_small.json: -------------------------------------------------------------------------------- 1 | { 2 | "initializer_range": 0.02, 3 | "layer_norm_epsilon": 1e-05, 4 | "n_ctx": 300, 5 | "n_embd": 768, 6 | "n_head": 12, 7 | "n_layer": 10, 8 | "n_positions": 300, 9 | "vocab_size": 13317 10 | } -------------------------------------------------------------------------------- /gpt/chitchat/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/terrifyzhao/nlp_tutorial/fa5cfdf732972469bfce2c452c07bec2077ba407/gpt/chitchat/data/.gitkeep -------------------------------------------------------------------------------- /gpt/chitchat/dataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | import torch 3 | 4 | 5 | class MyDataset(Dataset): 6 | """ 7 | Wraps pre-tokenized dialogue samples stored as whitespace-separated token-id strings. 8 | """ 9 | 10 | def __init__(self, data_list): 11 | self.data_list = data_list 12 | 13 | def __getitem__(self, index): 14 | input_ids = self.data_list[index].strip() 15 | input_ids = [int(token_id) for token_id in input_ids.split()] 16 | return input_ids 17 | 18 | def __len__(self): 19 | return len(self.data_list) 20 | -------------------------------------------------------------------------------- /gpt/chitchat/generate_dialogue_subset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from os.path import join 3 | import numpy as np 4 | from collections import Counter 5 | import matplotlib.pyplot as plt 6 | from matplotlib.pyplot import MultipleLocator 7 | 8 | 9 | def generate_subset(): 10 | """ 11 | Generate a subset of the training dialogues. 12 | :return: 13 | """ 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--raw_data_path', default='data/train.txt', type=str, required=False, help='raw training corpus') 16 | parser.add_argument('--subset_size', default=500000,
type=int, required=False, help='要获取的对话数据子集的规模') 17 | parser.add_argument('--subset_data_path', default='data', type=str, required=False, 18 | help='数据子集文件路径,指定文件的父目录') 19 | args = parser.parse_args() 20 | with open(args.raw_data_path, "r", encoding="utf8") as f: 21 | data = f.read() 22 | dialogues = data.split("\n\n") 23 | subset_size = min(len(dialogues), args.subset_size) 24 | 25 | with open(join(args.subset_data_path, "train_{}w.txt".format(int(subset_size / 10000))), "w", encoding="utf8") as f: 26 | print("generating subset,please wait a few seconds ") 27 | for dialogue_index, dialogue in enumerate(dialogues): 28 | if dialogue_index >= subset_size: 29 | break 30 | for utterance in dialogue.split("\n"): 31 | f.writelines(utterance + "\n") 32 | f.writelines("\n") 33 | 34 | 35 | def compute_dialogue_length(): 36 | """ 37 | 查看聊天语料中的dialogue的长度分布 38 | :return: 39 | """ 40 | parser = argparse.ArgumentParser() 41 | parser.add_argument('--raw_data_path', default='data/train.txt', type=str, required=False, help='原始训练语料') 42 | args = parser.parse_args() 43 | with open(args.raw_data_path, "r", encoding="utf8") as f: 44 | data = f.read() 45 | dialogues = data.split("\n\n") 46 | # 统计各个dialogue的长度 47 | dialogues_lengths = [len(dialogue.replace("\n", "")) for dialogue in dialogues] 48 | counter = Counter(dialogues_lengths) # {label:sum(label)} 49 | dialogue_length_arr = list(counter) 50 | num_arr = [counter[element] for element in list(counter)] 51 | print(counter[300]) 52 | 53 | x_major_locator = MultipleLocator(100) # MultipleLocator用于设置刻度间隔 54 | # y_major_locator = MultipleLocator(20000) 55 | ax = plt.gca() # ax为两条坐标轴的实例 56 | ax.xaxis.set_major_locator(x_major_locator) # 把x轴的主刻度设置为10的倍数 57 | # ax.yaxis.set_major_locator(y_major_locator) 58 | 59 | plt.xlabel('dialogue length') 60 | plt.ylabel('number of dialogue') 61 | # plt.plot(dialogue_length_arr, num_arr, c='green') 62 | plt.scatter(dialogue_length_arr, num_arr) 63 | plt.show() 64 | 65 | 66 | if __name__ == '__main__': 67 | compute_dialogue_length() 68 | -------------------------------------------------------------------------------- /gpt/chitchat/interact.py: -------------------------------------------------------------------------------- 1 | import transformers 2 | import torch 3 | import argparse 4 | import logging 5 | from transformers import GPT2Config, GPT2LMHeadModel 6 | from transformers import BertTokenizer 7 | import torch.nn.functional as F 8 | 9 | PAD = '[PAD]' 10 | pad_id = 0 11 | 12 | 13 | def set_interact_args(): 14 | """ 15 | Sets up the training arguments. 
16 | """ 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--device', default='0', type=str, required=False, help='生成设备') 19 | parser.add_argument('--temperature', default=1, type=float, required=False, help='生成的temperature') 20 | parser.add_argument('--topk', default=8, type=int, required=False, help='最高k选1') 21 | parser.add_argument('--topp', default=0, type=float, required=False, help='最高积累概率') 22 | parser.add_argument('--model_config', default='chitchat/config/model_config_dialogue_small.json', type=str, required=False, 23 | help='模型参数') 24 | parser.add_argument('--log_path', default='chitchat/data/interacting.log', type=str, required=False, help='interact日志存放位置') 25 | parser.add_argument('--voca_path', default='chitchat/vocabulary/vocab_small.txt', type=str, required=False, help='选择词库') 26 | parser.add_argument('--dialogue_model_path', default='chitchat/dialogue_model/', type=str, required=False, help='对话模型路径') 27 | parser.add_argument('--save_samples_path', default="chitchat/sample/", type=str, required=False, help="保存聊天记录的文件路径") 28 | parser.add_argument('--repetition_penalty', default=1.0, type=float, required=False, 29 | help="重复惩罚参数,若生成的对话重复性较高,可适当提高该参数") 30 | parser.add_argument('--seed', type=int, default=None, help='设置种子用于生成随机数,以使得训练的结果是确定的') 31 | parser.add_argument('--max_len', type=int, default=25, help='每个utterance的最大长度,超过指定长度则进行截断') 32 | parser.add_argument('--max_history_len', type=int, default=5, help="dialogue history的最大长度") 33 | parser.add_argument('--no_cuda', action='store_true', help='不使用GPU进行预测') 34 | return parser.parse_args() 35 | 36 | 37 | def create_logger(args): 38 | """ 39 | 将日志输出到日志文件和控制台 40 | """ 41 | logger = logging.getLogger(__name__) 42 | logger.setLevel(logging.INFO) 43 | 44 | formatter = logging.Formatter( 45 | '%(asctime)s - %(levelname)s - %(message)s') 46 | 47 | # 创建一个handler,用于写入日志文件 48 | file_handler = logging.FileHandler( 49 | filename=args.log_path) 50 | file_handler.setFormatter(formatter) 51 | file_handler.setLevel(logging.INFO) 52 | logger.addHandler(file_handler) 53 | 54 | # 创建一个handler,用于将日志输出到控制台 55 | console = logging.StreamHandler() 56 | console.setLevel(logging.DEBUG) 57 | console.setFormatter(formatter) 58 | logger.addHandler(console) 59 | 60 | return logger 61 | 62 | 63 | def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')): 64 | """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering 65 | Args: 66 | logits: logits distribution shape (vocabulary size) 67 | top_k > 0: keep only top k tokens with highest probability (top-k filtering). 68 | top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). 69 | Nucleus filtering is described in Holtzman et al. 
(http://arxiv.org/abs/1904.09751) 70 | From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 71 | """ 72 | assert logits.dim() == 1 # batch size 1 for now - could be updated for more but the code would be less clear 73 | top_k = min(top_k, logits.size(-1)) # Safety check 74 | if top_k > 0: 75 | # Remove all tokens with a probability less than the last token of the top-k 76 | # torch.topk()返回最后一维最大的top_k个元素,返回值为二维(values,indices) 77 | # ...表示其他维度由计算机自行推断 78 | indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] 79 | logits[indices_to_remove] = filter_value # 对于topk之外的其他元素的logits值设为负无穷 80 | 81 | if top_p > 0.0: 82 | sorted_logits, sorted_indices = torch.sort(logits, descending=True) # 对logits进行递减排序 83 | cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) 84 | 85 | # Remove tokens with cumulative probability above the threshold 86 | sorted_indices_to_remove = cumulative_probs > top_p 87 | # Shift the indices to the right to keep also the first token above the threshold 88 | sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() 89 | sorted_indices_to_remove[..., 0] = 0 90 | 91 | indices_to_remove = sorted_indices[sorted_indices_to_remove] 92 | logits[indices_to_remove] = filter_value 93 | return logits 94 | 95 | 96 | args = set_interact_args() 97 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 98 | tokenizer = BertTokenizer(vocab_file=args.voca_path) 99 | model = GPT2LMHeadModel.from_pretrained(args.dialogue_model_path) 100 | model.to(device) 101 | model.eval() 102 | 103 | history = [] 104 | 105 | 106 | def chitchat(text): 107 | history.append(tokenizer.encode(text)) 108 | input_ids = [] # 每个input以[CLS]为开头 109 | 110 | for history_id, history_utr in enumerate(history[-args.max_history_len:]): 111 | input_ids.extend(history_utr) 112 | # input_ids.append(tokenizer.sep_token_id) 113 | curr_input_tensor = torch.tensor(input_ids).long().to(device) 114 | generated = [] 115 | # 最多生成max_len个token 116 | for _ in range(args.max_len): 117 | outputs = model(input_ids=curr_input_tensor) 118 | next_token_logits = outputs[0][-1, :] 119 | # 对于已生成的结果generated中的每个token添加一个重复惩罚项,降低其生成概率 120 | for id in set(generated): 121 | next_token_logits[id] /= args.repetition_penalty 122 | next_token_logits = next_token_logits / args.temperature 123 | # 对于[UNK]的概率设为无穷小,也就是说模型的预测结果不可能是[UNK]这个token 124 | next_token_logits[tokenizer.convert_tokens_to_ids('[UNK]')] = -float('Inf') 125 | filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=args.topk, top_p=args.topp) 126 | # torch.multinomial表示从候选集合中无放回地进行抽取num_samples个元素,权重越高,抽到的几率越高,返回元素的下标 127 | next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1) 128 | if next_token == tokenizer.sep_token_id: # 遇到[SEP]则表明response生成结束 129 | break 130 | generated.append(next_token.item()) 131 | curr_input_tensor = torch.cat((curr_input_tensor, next_token), dim=0) 132 | # his_text = tokenizer.convert_ids_to_tokens(curr_input_tensor.tolist()) 133 | # print("his_text:{}".format(his_text)) 134 | history.append(generated) 135 | text = tokenizer.convert_ids_to_tokens(generated) 136 | return ''.join(text) 137 | 138 | 139 | if __name__ == '__main__': 140 | # main() 141 | while 1: 142 | text = input('user:') 143 | print(chitchat(text)) 144 | -------------------------------------------------------------------------------- /gpt/chitchat/interact_mmi.py: -------------------------------------------------------------------------------- 1 | import transformers 2 | import 
torch 3 | import os 4 | import json 5 | import random 6 | import numpy as np 7 | import argparse 8 | from torch.utils.tensorboard import SummaryWriter 9 | from datetime import datetime 10 | from tqdm import tqdm 11 | from torch.nn import DataParallel 12 | import logging 13 | from transformers.modeling_gpt2 import GPT2Config, GPT2LMHeadModel 14 | from transformers import BertTokenizer 15 | from os.path import join, exists 16 | from itertools import zip_longest, chain 17 | from dataset import MyDataset 18 | from torch.utils.data import Dataset, DataLoader 19 | from torch.nn import CrossEntropyLoss 20 | from sklearn.model_selection import train_test_split 21 | from train import create_model 22 | import torch.nn.functional as F 23 | import copy 24 | 25 | PAD = '[PAD]' 26 | pad_id = 0 27 | 28 | 29 | def set_interact_args(): 30 | """ 31 | Sets up the training arguments. 32 | """ 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument('--device', default='0', type=str, required=False, help='生成设备') 35 | parser.add_argument('--temperature', default=1, type=float, required=False, help='生成的temperature') 36 | parser.add_argument('--topk', default=8, type=int, required=False, help='最高k选1') 37 | parser.add_argument('--topp', default=0, type=float, required=False, help='最高积累概率') 38 | parser.add_argument('--model_config', default='config/model_config_dialogue_small.json', type=str, required=False, 39 | help='模型参数') 40 | parser.add_argument('--log_path', default='data/interacting_mmi.log', type=str, required=False, 41 | help='interact_mmi日志存放位置') 42 | parser.add_argument('--voca_path', default='vocabulary/vocab_small.txt', type=str, required=False, help='选择词库') 43 | parser.add_argument('--dialogue_model_path', default='dialogue_model/', type=str, required=False, 44 | help='dialogue_model路径') 45 | parser.add_argument('--mmi_model_path', default='mmi_model/', type=str, required=False, 46 | help='互信息mmi_model路径') 47 | parser.add_argument('--save_samples_path', default="sample/", type=str, required=False, help="保存聊天记录的文件路径") 48 | parser.add_argument('--repetition_penalty', default=1.0, type=float, required=False, 49 | help="重复惩罚参数,若生成的对话重复性较高,可适当提高该参数") 50 | parser.add_argument('--seed', type=int, default=None, help='设置种子用于生成随机数,以使得训练的结果是确定的') 51 | parser.add_argument('--max_len', type=int, default=25, help='每个utterance的最大长度,超过指定长度则进行截断') 52 | parser.add_argument('--max_history_len', type=int, default=5, help="dialogue history的最大长度") 53 | parser.add_argument('--no_cuda', action='store_true', help='不使用GPU进行预测') 54 | parser.add_argument('--batch_size', type=int, default=5, help='批量生成response,然后经过MMI模型进行筛选') 55 | parser.add_argument('--debug', action='store_true', help='指定该参数,可以查看生成的所有候选的reponse,及其loss') 56 | return parser.parse_args() 57 | 58 | 59 | def create_logger(args): 60 | """ 61 | 将日志输出到日志文件和控制台 62 | """ 63 | logger = logging.getLogger(__name__) 64 | logger.setLevel(logging.INFO) 65 | 66 | formatter = logging.Formatter( 67 | '%(asctime)s - %(levelname)s - %(message)s') 68 | 69 | # 创建一个handler,用于写入日志文件 70 | file_handler = logging.FileHandler( 71 | filename=args.log_path) 72 | file_handler.setFormatter(formatter) 73 | file_handler.setLevel(logging.INFO) 74 | logger.addHandler(file_handler) 75 | 76 | # 创建一个handler,用于将日志输出到控制台 77 | console = logging.StreamHandler() 78 | console.setLevel(logging.DEBUG) 79 | console.setFormatter(formatter) 80 | logger.addHandler(console) 81 | 82 | return logger 83 | 84 | 85 | def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')): 86 | """ 
Filter a distribution of logits using top-k and/or nucleus (top-p) filtering 87 | Args: 88 | logits: logits distribution shape (vocabulary size) 89 | top_k > 0: keep only top k tokens with highest probability (top-k filtering). 90 | top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). 91 | Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751) 92 | """ 93 | assert logits.dim() == 2 94 | top_k = min(top_k, logits[0].size(-1)) # Safety check 95 | if top_k > 0: 96 | # Remove all tokens with a probability less than the last token of the top-k 97 | # torch.topk()返回最后一维最大的top_k个元素,返回值为二维(values,indices) 98 | # ...表示其他维度由计算机自行推断 99 | for logit in logits: 100 | indices_to_remove = logit < torch.topk(logit, top_k)[0][..., -1, None] 101 | logit[indices_to_remove] = filter_value # 对于topk之外的其他元素的logits值设为负无穷 102 | 103 | if top_p > 0.0: 104 | sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1) # 对logits进行递减排序 105 | cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) 106 | 107 | # Remove tokens with cumulative probability above the threshold 108 | sorted_indices_to_remove = cumulative_probs > top_p 109 | # Shift the indices to the right to keep also the first token above the threshold 110 | sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() 111 | sorted_indices_to_remove[..., 0] = 0 112 | for index, logit in enumerate(logits): 113 | indices_to_remove = sorted_indices[index][sorted_indices_to_remove[index]] 114 | logit[indices_to_remove] = filter_value 115 | return logits 116 | 117 | 118 | def main(): 119 | args = set_interact_args() 120 | logger = create_logger(args) 121 | # 当用户使用GPU,并且GPU可用时 122 | args.cuda = torch.cuda.is_available() and not args.no_cuda 123 | device = 'cuda' if args.cuda else 'cpu' 124 | logger.info('using device:{}'.format(device)) 125 | os.environ["CUDA_VISIBLE_DEVICES"] = args.device 126 | tokenizer = BertTokenizer(vocab_file=args.voca_path) 127 | # 对话model 128 | dialogue_model = GPT2LMHeadModel.from_pretrained(args.dialogue_model_path) 129 | dialogue_model.to(device) 130 | dialogue_model.eval() 131 | # 互信息mmi model 132 | mmi_model = GPT2LMHeadModel.from_pretrained(args.mmi_model_path) 133 | mmi_model.to(device) 134 | mmi_model.eval() 135 | if args.save_samples_path: 136 | if not os.path.exists(args.save_samples_path): 137 | os.makedirs(args.save_samples_path) 138 | samples_file = open(args.save_samples_path + '/mmi_samples.txt', 'a', encoding='utf8') 139 | samples_file.write("聊天记录{}:\n".format(datetime.now())) 140 | # 存储聊天记录,每个utterance以token的id的形式进行存储 141 | history = [] 142 | print('开始和chatbot聊天,输入CTRL + Z以退出') 143 | 144 | while True: 145 | try: 146 | text = input("user:") 147 | if args.save_samples_path: 148 | samples_file.write("user:{}\n".format(text)) 149 | history.append(tokenizer.encode(text)) 150 | input_ids = [tokenizer.cls_token_id] # 每个input以[CLS]为开头 151 | for history_id, history_utr in enumerate(history[-args.max_history_len:]): 152 | input_ids.extend(history_utr) 153 | input_ids.append(tokenizer.sep_token_id) 154 | # 用于批量生成response,维度为(batch_size,token_len) 155 | input_ids = [copy.deepcopy(input_ids) for _ in range(args.batch_size)] 156 | 157 | curr_input_tensors = torch.tensor(input_ids).long().to(device) 158 | generated = [] # 二维数组,维度为(生成的response的最大长度,batch_size),generated[i,j]表示第j个response的第i个token的id 159 | finish_set = set() # 标记是否所有response均已生成结束,若第i个response生成结束,即生成了sep_token_id,则将i放入finish_set 160 | # 最多生成max_len个token 
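# Batched sampling loop (summary of the steps below): at each of the up-to-max_len steps
# the dialogue model scores the next token for all batch_size candidate responses at once;
# logits of tokens already generated are divided by repetition_penalty, all logits are
# divided by temperature, [UNK] is masked out, top-k / top-p filtering is applied, and one
# token per candidate is drawn with torch.multinomial. A candidate is marked finished once
# it samples [SEP], and the loop stops early when every candidate has finished.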
161 | for _ in range(args.max_len): 162 | outputs = dialogue_model(input_ids=curr_input_tensors) 163 | next_token_logits = outputs[0][:, -1, :] 164 | # 对于已生成的结果generated中的每个token添加一个重复惩罚项,降低其生成概率 165 | for index in range(args.batch_size): 166 | for token_id in set([token_ids[index] for token_ids in generated]): 167 | next_token_logits[index][token_id] /= args.repetition_penalty 168 | next_token_logits = next_token_logits / args.temperature 169 | # 对于[UNK]的概率设为无穷小,也就是说模型的预测结果不可能是[UNK]这个token 170 | for next_token_logit in next_token_logits: 171 | next_token_logit[tokenizer.convert_tokens_to_ids('[UNK]')] = -float('Inf') 172 | filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=args.topk, top_p=args.topp) 173 | # torch.multinomial表示从候选集合中无放回地进行抽取num_samples个元素,权重越高,抽到的几率越高,返回元素的下标 174 | next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1) 175 | # 判断是否有response生成了[SEP],将已生成了[SEP]的resposne进行标记 176 | for index, token_id in enumerate(next_token[:, 0]): 177 | if token_id == tokenizer.sep_token_id: 178 | finish_set.add(index) 179 | # 检验是否所有的response均已生成[SEP] 180 | finish_flag = True # 是否所有的response均已生成[SEP]的token 181 | for index in range(args.batch_size): 182 | if index not in finish_set: # response批量生成未完成 183 | finish_flag = False 184 | break 185 | if finish_flag: 186 | break 187 | generated.append([token.item() for token in next_token[:, 0]]) 188 | # 将新生成的token与原来的token进行拼接 189 | curr_input_tensors = torch.cat((curr_input_tensors, next_token), dim=-1) 190 | candidate_responses = [] # 生成的所有候选response 191 | for batch_index in range(args.batch_size): 192 | response = [] 193 | for token_index in range(len(generated)): 194 | if generated[token_index][batch_index] != tokenizer.sep_token_id: 195 | response.append(generated[token_index][batch_index]) 196 | else: 197 | break 198 | candidate_responses.append(response) 199 | 200 | # mmi模型的输入 201 | if args.debug: 202 | print("candidate response:") 203 | samples_file.write("candidate response:\n") 204 | min_loss = float('Inf') 205 | best_response = "" 206 | for response in candidate_responses: 207 | mmi_input_id = [tokenizer.cls_token_id] # 每个input以[CLS]为开头 208 | mmi_input_id.extend(response) 209 | mmi_input_id.append(tokenizer.sep_token_id) 210 | for history_utr in reversed(history[-args.max_history_len:]): 211 | mmi_input_id.extend(history_utr) 212 | mmi_input_id.append(tokenizer.sep_token_id) 213 | mmi_input_tensor = torch.tensor(mmi_input_id).long().to(device) 214 | out = mmi_model(input_ids=mmi_input_tensor, labels=mmi_input_tensor) 215 | loss = out[0].item() 216 | if args.debug: 217 | text = tokenizer.convert_ids_to_tokens(response) 218 | print("{} loss:{}".format("".join(text), loss)) 219 | samples_file.write("{} loss:{}\n".format("".join(text), loss)) 220 | if loss < min_loss: 221 | best_response = response 222 | min_loss = loss 223 | history.append(best_response) 224 | text = tokenizer.convert_ids_to_tokens(best_response) 225 | print("chatbot:" + "".join(text)) 226 | if args.save_samples_path: 227 | samples_file.write("chatbot:{}\n".format("".join(text))) 228 | except KeyboardInterrupt: 229 | if args.save_samples_path: 230 | samples_file.close() 231 | break 232 | 233 | 234 | if __name__ == '__main__': 235 | main() 236 | -------------------------------------------------------------------------------- /gpt/gpt_lyric.py: -------------------------------------------------------------------------------- 1 | from transformers import BertTokenizer, GPT2LMHeadModel, TextGenerationPipeline 2 | 3 | tokenizer = 
BertTokenizer.from_pretrained("uer/gpt2-chinese-lyric") 4 | model = GPT2LMHeadModel.from_pretrained("uer/gpt2-chinese-lyric") 5 | 6 | text_generator = TextGenerationPipeline(model, tokenizer) 7 | res = text_generator("最美的不是下雨天", max_length=100, do_sample=True) 8 | print(res) 9 | -------------------------------------------------------------------------------- /pseudo/first_stage.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import jieba 4 | import numpy as np 5 | from collections import defaultdict 6 | from torch.utils.data import DataLoader, TensorDataset 7 | from sklearn.metrics import accuracy_score 8 | from utils import fix_seed 9 | import pandas as pd 10 | 11 | 12 | class TextCLS(torch.nn.Module): 13 | # 准备我们需要用到的参数和layer 14 | def __init__(self, 15 | vocab_size, 16 | embedding_size): 17 | super().__init__() 18 | self.embedding = nn.Embedding(vocab_size, embedding_size) 19 | # [batch_size, seq_len, hidden_size] 20 | self.lstm = nn.LSTM(input_size=embedding_size, 21 | hidden_size=256, 22 | num_layers=2, 23 | batch_first=True) 24 | self.dense1 = nn.Linear(256, 100) 25 | self.dense2 = nn.Linear(100, 5) 26 | 27 | # 前向传播,那我们准备好的layer拼接在一起 28 | def forward(self, x): 29 | embedding = self.embedding(x) 30 | # [batch_size, seq_len, hidden_size] 31 | out, _ = self.lstm(embedding) 32 | out = self.dense1(out[:, -1, :]) 33 | out = self.dense2(out) 34 | return out 35 | 36 | 37 | def tokenize(string): 38 | res = list(jieba.cut(string, cut_all=False)) 39 | return res 40 | 41 | 42 | # 把数据转换成index 43 | def seq2index(seq, vocab): 44 | seg = tokenize(seq) 45 | seg_index = [] 46 | for s in seg: 47 | seg_index.append(vocab.get(s, 1)) 48 | return seg_index 49 | 50 | 51 | # 统一长度 52 | def padding_seq(X, max_len=10): 53 | return np.array([ 54 | np.concatenate([x, [0] * (max_len - len(x))]) if len(x) < max_len else x[:max_len] for x in X 55 | ]) 56 | 57 | 58 | def load_data(batch_size=32): 59 | df = pd.read_csv('../data/tnews_public/train.csv') 60 | train_text = df['text'].values 61 | train_label = df['label'].values 62 | 63 | df = pd.read_csv('../data/tnews_public/dev.csv') 64 | dev_text = df['text'].values 65 | dev_label = df['label'].values 66 | 67 | # 生成词典 68 | segment = [tokenize(t) for t in train_text] 69 | 70 | word_frequency = defaultdict(int) 71 | for row in segment: 72 | for i in row: 73 | word_frequency[i] += 1 74 | 75 | word_sort = sorted(word_frequency.items(), key=lambda x: x[1], reverse=True) # 根据词频降序排序 76 | 77 | vocab = {'[PAD]': 0, '[UNK]': 1} 78 | for d in word_sort: 79 | vocab[d[0]] = len(vocab) 80 | 81 | train_x = padding_seq([seq2index(t, vocab) for t in train_text]) 82 | train_y = np.array(train_label) 83 | train_data_set = TensorDataset(torch.from_numpy(train_x), 84 | torch.from_numpy(train_y)) 85 | train_data_loader = DataLoader(dataset=train_data_set, batch_size=batch_size) 86 | 87 | dev_x = padding_seq([seq2index(t, vocab) for t in dev_text]) 88 | dev_y = np.array(dev_label) 89 | dev_data_set = TensorDataset(torch.from_numpy(dev_x), 90 | torch.from_numpy(dev_y)) 91 | dev_data_loader = DataLoader(dataset=dev_data_set, batch_size=batch_size) 92 | 93 | return train_data_loader, dev_data_loader, vocab 94 | 95 | 96 | def pseudo_data(model, data): 97 | pseudo = [] 98 | pseudo_label = [] 99 | for step, (b_x, b_y) in enumerate(data): 100 | if torch.cuda.is_available(): 101 | b_x = b_x.cuda().long() 102 | with torch.no_grad(): 103 | # logits 104 | output = model(b_x) 105 | pred = torch.argmax(output, dim=1) 106 | 
# 拿到对应的置信度 107 | out = torch.softmax(output, dim=1) 108 | 109 | for i, (p, o) in enumerate(zip(pred, out)): 110 | if o[p] > 0.95: 111 | index = step * 128 + i 112 | pseudo.append(index) 113 | pseudo_label.append(p.item()) 114 | df = pd.read_csv('../data/tnews_public/dev.csv') 115 | dev_text = df['text'].values 116 | pseudo_text = dev_text[pseudo] 117 | df = pd.DataFrame({'text': pseudo_text, 'label': pseudo_label}) 118 | df.to_csv('pseudo.csv', index=False, encoding='utf_8_sig') 119 | 120 | 121 | # 训练模型 122 | def train(): 123 | fix_seed() 124 | 125 | train_data_loader, dev_data_loader, vocab = load_data(128) 126 | model = TextCLS(vocab_size=len(vocab), 127 | embedding_size=100) 128 | 129 | optimizer = torch.optim.Adam(model.parameters(), lr=0.01) 130 | loss_func = nn.CrossEntropyLoss() 131 | 132 | if torch.cuda.is_available(): 133 | model = model.cuda() 134 | 135 | for epoch in range(5): 136 | print('epoch:', epoch + 1) 137 | pred = [] 138 | label = [] 139 | for step, (b_x, b_y) in enumerate(train_data_loader): 140 | if torch.cuda.is_available(): 141 | b_x = b_x.cuda().long() 142 | b_y = b_y.cuda().long() 143 | output = model(b_x) 144 | pred.extend(torch.argmax(output, dim=1).cpu().numpy()) 145 | label.extend(b_y.cpu().numpy()) 146 | loss = loss_func(output, b_y) 147 | optimizer.zero_grad() 148 | # 求解梯度 149 | loss.backward() 150 | # 更新我们的权重 151 | optimizer.step() 152 | acc = accuracy_score(pred, label) 153 | print('train acc:', acc) 154 | 155 | pred = [] 156 | label = [] 157 | for step, (b_x, b_y) in enumerate(dev_data_loader): 158 | if torch.cuda.is_available(): 159 | b_x = b_x.cuda().long() 160 | b_y = b_y.cuda().long() 161 | with torch.no_grad(): 162 | output = model(b_x) 163 | pred.extend(torch.argmax(output, dim=1).cpu().numpy()) 164 | label.extend(b_y.cpu().numpy()) 165 | acc = accuracy_score(pred, label) 166 | print('dev acc:', acc) 167 | print() 168 | 169 | pseudo_data(model, dev_data_loader) 170 | 171 | 172 | if __name__ == '__main__': 173 | train() 174 | -------------------------------------------------------------------------------- /pseudo/second_stage.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import jieba 4 | import numpy as np 5 | from collections import defaultdict 6 | from torch.utils.data import DataLoader, TensorDataset 7 | from sklearn.metrics import accuracy_score 8 | from utils import fix_seed 9 | import pandas as pd 10 | 11 | 12 | class TextCLS(torch.nn.Module): 13 | # 准备我们需要用到的参数和layer 14 | def __init__(self, 15 | vocab_size, 16 | embedding_size): 17 | super().__init__() 18 | self.embedding = nn.Embedding(vocab_size, embedding_size) 19 | # [batch_size, seq_len, hidden_size] 20 | self.lstm = nn.LSTM(input_size=embedding_size, 21 | hidden_size=256, 22 | num_layers=2, 23 | batch_first=True) 24 | self.dense1 = nn.Linear(256, 100) 25 | self.dense2 = nn.Linear(100, 5) 26 | 27 | # 前向传播,那我们准备好的layer拼接在一起 28 | def forward(self, x): 29 | embedding = self.embedding(x) 30 | # [batch_size, seq_len, hidden_size] 31 | out, _ = self.lstm(embedding) 32 | out = self.dense1(out[:, -1, :]) 33 | out = self.dense2(out) 34 | return out 35 | 36 | 37 | def tokenize(string): 38 | res = list(jieba.cut(string, cut_all=False)) 39 | return res 40 | 41 | 42 | # 把数据转换成index 43 | def seq2index(seq, vocab): 44 | seg = tokenize(seq) 45 | seg_index = [] 46 | for s in seg: 47 | seg_index.append(vocab.get(s, 1)) 48 | return seg_index 49 | 50 | 51 | # 统一长度 52 | def padding_seq(X, max_len=10): 53 | return np.array([ 54 
| np.concatenate([x, [0] * (max_len - len(x))]) if len(x) < max_len else x[:max_len] for x in X 55 | ]) 56 | 57 | 58 | def load_data(batch_size=32): 59 | df = pd.read_csv('../data/tnews_public/train.csv') 60 | train_text = df['text'].values 61 | train_label = df['label'].values 62 | 63 | df = pd.read_csv('../data/tnews_public/dev.csv') 64 | df2 = pd.read_csv('pseudo.csv') 65 | df = df.append(df2) 66 | dev_text = df['text'].values 67 | dev_label = df['label'].values 68 | 69 | # 生成词典 70 | segment = [tokenize(t) for t in train_text] 71 | 72 | word_frequency = defaultdict(int) 73 | for row in segment: 74 | for i in row: 75 | word_frequency[i] += 1 76 | 77 | word_sort = sorted(word_frequency.items(), key=lambda x: x[1], reverse=True) # 根据词频降序排序 78 | 79 | vocab = {'[PAD]': 0, '[UNK]': 1} 80 | for d in word_sort: 81 | vocab[d[0]] = len(vocab) 82 | 83 | train_x = padding_seq([seq2index(t, vocab) for t in train_text]) 84 | train_y = np.array(train_label) 85 | train_data_set = TensorDataset(torch.from_numpy(train_x), 86 | torch.from_numpy(train_y)) 87 | train_data_loader = DataLoader(dataset=train_data_set, batch_size=batch_size) 88 | 89 | dev_x = padding_seq([seq2index(t, vocab) for t in dev_text]) 90 | dev_y = np.array(dev_label) 91 | dev_data_set = TensorDataset(torch.from_numpy(dev_x), 92 | torch.from_numpy(dev_y)) 93 | dev_data_loader = DataLoader(dataset=dev_data_set, batch_size=batch_size) 94 | 95 | return train_data_loader, dev_data_loader, vocab 96 | 97 | 98 | def pseudo_data(model, data): 99 | pseudo = [] 100 | pseudo_label = [] 101 | for step, (b_x, b_y) in enumerate(data): 102 | if torch.cuda.is_available(): 103 | b_x = b_x.cuda().long() 104 | with torch.no_grad(): 105 | output = model(b_x) 106 | pred = torch.argmax(output, dim=1) 107 | out = torch.softmax(output, dim=1) 108 | for i, (p, o) in enumerate(zip(pred, out)): 109 | if o[p] > 0.95: 110 | index = step * 128 + i 111 | pseudo.append(index) 112 | pseudo_label.append(b_y[i]) 113 | df = pd.read_csv('../data/tnews_public/dev.csv') 114 | dev_text = df['text'].values 115 | pseudo_text = dev_text[pseudo] 116 | df = pd.DataFrame({'text': pseudo_text, 'label': pseudo_label}) 117 | df.to_csv('pseudo.csv', index=False, encoding='utf_8_sig') 118 | 119 | 120 | # 训练模型 121 | def train(): 122 | fix_seed() 123 | 124 | train_data_loader, dev_data_loader, vocab = load_data(128) 125 | model = TextCLS(vocab_size=len(vocab), 126 | embedding_size=100) 127 | 128 | optimizer = torch.optim.Adam(model.parameters(), lr=0.01) 129 | loss_func = nn.CrossEntropyLoss() 130 | 131 | if torch.cuda.is_available(): 132 | model = model.cuda() 133 | 134 | for epoch in range(5): 135 | print('epoch:', epoch + 1) 136 | pred = [] 137 | label = [] 138 | for step, (b_x, b_y) in enumerate(train_data_loader): 139 | if torch.cuda.is_available(): 140 | b_x = b_x.cuda().long() 141 | b_y = b_y.cuda().long() 142 | output = model(b_x) 143 | pred.extend(torch.argmax(output, dim=1).cpu().numpy()) 144 | label.extend(b_y.cpu().numpy()) 145 | loss = loss_func(output, b_y) 146 | optimizer.zero_grad() 147 | # 求解梯度 148 | loss.backward() 149 | # 更新我们的权重 150 | optimizer.step() 151 | acc = accuracy_score(pred, label) 152 | print('train acc:', acc) 153 | 154 | pred = [] 155 | label = [] 156 | for step, (b_x, b_y) in enumerate(dev_data_loader): 157 | if torch.cuda.is_available(): 158 | b_x = b_x.cuda().long() 159 | b_y = b_y.cuda().long() 160 | with torch.no_grad(): 161 | output = model(b_x) 162 | pred.extend(torch.argmax(output, dim=1).cpu().numpy()) 163 | label.extend(b_y.cpu().numpy()) 164 | 
acc = accuracy_score(pred, label) 165 | print('dev acc:', acc) 166 | print() 167 | 168 | 169 | if __name__ == '__main__': 170 | train() 171 | -------------------------------------------------------------------------------- /ptm/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/terrifyzhao/nlp_tutorial/fa5cfdf732972469bfce2c452c07bec2077ba407/ptm/.gitkeep -------------------------------------------------------------------------------- /ptm/post train_bert.py: -------------------------------------------------------------------------------- 1 | from transformers import BertTokenizer, BertForMaskedLM 2 | from torch.utils.data import DataLoader, Dataset 3 | from tqdm import tqdm 4 | from utils import fix_seed 5 | from annlp import ptm_path 6 | import torch 7 | from utils import random_mask 8 | import pandas as pd 9 | 10 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 11 | path = ptm_path('roberta') 12 | tokenizer = BertTokenizer.from_pretrained(path) 13 | 14 | 15 | class BaseDataset(Dataset): 16 | def __init__(self, encodings, labels=None): 17 | self.encodings = encodings 18 | self.labels = labels 19 | 20 | def __getitem__(self, idx): 21 | item = {key: val[idx] for key, val in self.encodings.items()} 22 | return item 23 | 24 | def __len__(self): 25 | return len(self.encodings['source']) 26 | 27 | 28 | def load_data(file_name, batch_size): 29 | df = pd.read_csv(file_name) 30 | encoding = tokenizer(text=df['text'].tolist(), 31 | return_tensors='np', 32 | truncation=True, 33 | padding='max_length', 34 | max_length=10) 35 | sources = [] 36 | targets = [] 37 | for input_ids in encoding['input_ids']: 38 | source, target = random_mask(input_ids, tokenizer) 39 | sources.append(source) 40 | targets.append(target) 41 | 42 | data = {'source': torch.Tensor(sources), 43 | 'attention_mask': encoding['attention_mask'], 44 | 'target': torch.Tensor(targets)} 45 | data_loader = DataLoader(BaseDataset(data), 46 | batch_size, 47 | pin_memory=True if torch.cuda.is_available() else False, 48 | shuffle=False) 49 | return data_loader 50 | 51 | 52 | # 训练模型 53 | def train(): 54 | fix_seed() 55 | 56 | train_data_loader = load_data('../data/tnews_public/train.csv', batch_size=32) 57 | dev_data_loader = load_data('../data/tnews_public/dev.csv', batch_size=32) 58 | 59 | model = BertForMaskedLM.from_pretrained(path) 60 | model = model.to(device) 61 | optimizer = torch.optim.Adam(model.parameters(), lr=5e-6) 62 | 63 | min_loss = 1000 64 | for epoch in range(5): 65 | print('epoch:', epoch + 1) 66 | pbar = tqdm(train_data_loader) 67 | for data in pbar: 68 | optimizer.zero_grad() 69 | 70 | input_ids = data['source'].to(device) 71 | attention_mask = data['attention_mask'].to(device) 72 | labels = data['target'].to(device).long() 73 | outputs = model(input_ids.long(), attention_mask=attention_mask, labels=labels) 74 | loss = outputs.loss 75 | loss.backward() 76 | optimizer.step() 77 | 78 | pbar.update() 79 | pbar.set_description(f'loss:{loss.item():.4f}') 80 | 81 | dev_loss = 0 82 | for data in tqdm(dev_data_loader): 83 | input_ids = data['source'].to(device) 84 | attention_mask = data['attention_mask'].to(device) 85 | labels = data['target'].to(device).long() 86 | with torch.no_grad(): 87 | outputs = model(input_ids.long(), attention_mask=attention_mask, labels=labels) 88 | dev_loss += outputs.loss.item() 89 | print('dev loss:', dev_loss / len(dev_data_loader)) 90 | print() 91 | 92 | if min_loss > dev_loss: 93 | 
min_loss = dev_loss 94 | torch.save(model, 'model.bin') 95 | 96 | 97 | if __name__ == '__main__': 98 | train() 99 | -------------------------------------------------------------------------------- /ptm/post train_gpt.py: -------------------------------------------------------------------------------- 1 | from transformers import BertTokenizer, GPT2LMHeadModel 2 | from torch.utils.data import DataLoader, Dataset 3 | from tqdm import tqdm 4 | from utils import fix_seed 5 | import torch 6 | import pandas as pd 7 | 8 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 9 | path = 'E:\\ptm\\gpt' 10 | tokenizer = BertTokenizer.from_pretrained(path) 11 | 12 | 13 | class BaseDataset(Dataset): 14 | def __init__(self, encodings, labels=None): 15 | self.encodings = encodings 16 | self.labels = labels 17 | 18 | def __getitem__(self, idx): 19 | item = {key: val[idx] for key, val in self.encodings.items()} 20 | return item 21 | 22 | def __len__(self): 23 | return len(self.encodings['source']) 24 | 25 | 26 | def load_data(file_name, batch_size): 27 | df = pd.read_csv(file_name) 28 | encoding = tokenizer(text=df['text'].tolist(), 29 | return_tensors='np', 30 | truncation=True, 31 | padding='max_length', 32 | max_length=10) 33 | sources = [] 34 | targets = [] 35 | for input_ids in encoding['input_ids']: 36 | sources.append(input_ids[0:-1]) 37 | targets.append(input_ids[1:]) 38 | 39 | # [101, 1, 2, 3, 102] 40 | # source:[101,1,2,3] 41 | # target:[1,2,3,102] 42 | 43 | data = {'source': torch.Tensor(sources), 44 | 'attention_mask': torch.Tensor([mask[:-1] for mask in encoding['attention_mask']]), 45 | 'target': torch.Tensor(targets)} 46 | data_loader = DataLoader(BaseDataset(data), 47 | batch_size, 48 | pin_memory=True if torch.cuda.is_available() else False, 49 | shuffle=False) 50 | return data_loader 51 | 52 | 53 | # 训练模型 54 | def train(): 55 | fix_seed() 56 | 57 | train_data_loader = load_data('../data/tnews_public/train.csv', batch_size=32) 58 | dev_data_loader = load_data('../data/tnews_public/dev.csv', batch_size=32) 59 | 60 | model = GPT2LMHeadModel.from_pretrained(path) 61 | model = model.to(device) 62 | optimizer = torch.optim.Adam(model.parameters(), lr=5e-6) 63 | 64 | for epoch in range(5): 65 | print('epoch:', epoch + 1) 66 | pbar = tqdm(train_data_loader) 67 | for data in pbar: 68 | optimizer.zero_grad() 69 | 70 | input_ids = data['source'].to(device) 71 | attention_mask = data['attention_mask'].to(device) 72 | labels = data['target'].to(device).long() 73 | outputs = model(input_ids.long(), attention_mask=attention_mask, labels=labels) 74 | loss = outputs.loss 75 | loss.backward() 76 | optimizer.step() 77 | 78 | pbar.update() 79 | pbar.set_description(f'loss:{loss.item():.4f}') 80 | 81 | dev_loss = 0 82 | for data in tqdm(dev_data_loader): 83 | input_ids = data['source'].to(device) 84 | attention_mask = data['attention_mask'].to(device) 85 | labels = data['target'].to(device).long() 86 | with torch.no_grad(): 87 | outputs = model(input_ids.long(), attention_mask=attention_mask, labels=labels) 88 | dev_loss += outputs.loss.item() 89 | print('dev loss:', dev_loss / len(dev_data_loader)) 90 | print() 91 | 92 | 93 | if __name__ == '__main__': 94 | train() 95 | -------------------------------------------------------------------------------- /ptm/train_bert.py: -------------------------------------------------------------------------------- 1 | from transformers import BertForMaskedLM, BertTokenizer, \ 2 | BertPreTrainedModel, BertForSequenceClassification, BertModel 3 
| import torch 4 | import pandas as pd 5 | from utils import random_mask 6 | from torch.utils.data import DataLoader, Dataset 7 | from tqdm import tqdm 8 | 9 | path = 'E:\\ptm\\roberta' 10 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 11 | model = BertForMaskedLM.from_pretrained(path) 12 | model = model.to(device) 13 | tokenizer = BertTokenizer.from_pretrained(path) 14 | 15 | 16 | class BaseDataSet(Dataset): 17 | def __init__(self, encoding): 18 | self.encoding = encoding 19 | 20 | def __len__(self): 21 | return len(self.encoding['source']) 22 | 23 | def __getitem__(self, ids): 24 | item = {k: v[ids] for k, v in self.encoding.items()} 25 | return item 26 | 27 | 28 | def load_data(file_name, batch_size): 29 | df = pd.read_csv(file_name) 30 | encoding = tokenizer(df['text'].tolist(), 31 | return_tensors='np', 32 | truncation=True, 33 | padding='max_length', 34 | max_length=10) 35 | sources = [] 36 | targets = [] 37 | for input_ids in encoding['input_ids']: 38 | source, target = random_mask(input_ids, tokenizer) 39 | sources.append(source) 40 | targets.append(target) 41 | data = {'source': torch.Tensor(sources), 42 | 'attention_mask': encoding['attention_mask'], 43 | 'target': torch.Tensor(targets)} 44 | 45 | data_loader = DataLoader(BaseDataSet(data), batch_size=batch_size) 46 | return data_loader 47 | 48 | 49 | def train(): 50 | bs = 32 51 | train_data = load_data('../data/tnews_public/train.csv', batch_size=bs) 52 | dev_data = load_data('../data/tnews_public/dev.csv', batch_size=bs) 53 | 54 | optimizer = torch.optim.Adam(model.parameters(), lr=5e-6) 55 | 56 | for epoch in range(3): 57 | pbar = tqdm(train_data) 58 | for data in pbar: 59 | optimizer.zero_grad() 60 | 61 | input_ids = data['source'].to(device) 62 | attention_mask = data['attention_mask'].to(device) 63 | labels = data['target'].to(device) 64 | 65 | outputs = model(input_ids.long(), attention_mask, labels=labels.long()) 66 | 67 | loss = outputs.loss 68 | loss.backward() 69 | optimizer.step() 70 | 71 | pbar.update() 72 | pbar.set_description(f'loss:{loss.item():.4f}') 73 | 74 | dev_loss = 0 75 | for data in tqdm(dev_data): 76 | input_ids = data['source'].to(device) 77 | attention_mask = data['attention_mask'].to(device) 78 | labels = data['target'].to(device) 79 | with torch.no_grad(): 80 | outputs = model(input_ids.long(), attention_mask, labels=labels.long()) 81 | dev_loss += outputs.loss.item() 82 | print('dev loss:', dev_loss / len(dev_data)) 83 | 84 | torch.save(model, 'model.bin') 85 | 86 | 87 | if __name__ == '__main__': 88 | train() 89 | -------------------------------------------------------------------------------- /rank/main.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from utils import cos_sim 3 | import numpy as np 4 | import jieba 5 | from collections import Counter 6 | from rank import rank 7 | from text_representation.sentence_embedding import SentenceEmbedding 8 | 9 | model = SentenceEmbedding() 10 | data = pd.read_csv('../data/rank/qa_data.csv') 11 | question = data['question'].values 12 | embedding = model.encode(data['question'].tolist()) 13 | 14 | 15 | class BM25: 16 | def __init__(self, documents_list, k1=2, k2=1, b=0.75): 17 | self.documents_list = documents_list 18 | self.documents_number = len(documents_list) 19 | self.avg_documents_len = sum([len(document) for document in documents_list]) / self.documents_number 20 | self.f = [] 21 | self.idf = {} 22 | self.k1 = k1 23 | self.k2 = k2 24 | self.b = b 
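# Scoring used by get_score() below, for a query q and document d:
#   score(q, d) = sum over query terms t of
#       idf(t) * f(t, d) * (k1 + 1) / (f(t, d) + k1 * (1 - b + b * len(d) / avg_len))
#              * qf(t) * (k2 + 1) / (qf(t) + k2)
# where f(t, d) is the term frequency in the document, qf(t) the term frequency in the
# query, and idf(t) = log((N - df(t) + 0.5) / (df(t) + 0.5)) over the N indexed documents;
# k1 and b control term-frequency saturation and length normalization, k2 weights qf.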
25 | self.init() 26 | 27 | def init(self): 28 | df = {} 29 | for document in self.documents_list: 30 | temp = {} 31 | for word in document: 32 | temp[word] = temp.get(word, 0) + 1 33 | self.f.append(temp) 34 | for key in temp.keys(): 35 | df[key] = df.get(key, 0) + 1 36 | for key, value in df.items(): 37 | self.idf[key] = np.log((self.documents_number - value + 0.5) / (value + 0.5)) 38 | 39 | def get_score(self, index, query): 40 | score = 0.0 41 | document_len = len(self.f[index]) 42 | qf = Counter(query) 43 | for q in query: 44 | if q not in self.f[index]: 45 | continue 46 | score += self.idf[q] * (self.f[index][q] * (self.k1 + 1) / ( 47 | self.f[index][q] + self.k1 * (1 - self.b + self.b * document_len / self.avg_documents_len))) * ( 48 | qf[q] * (self.k2 + 1) / (qf[q] + self.k2)) 49 | 50 | return score 51 | 52 | def get_documents_score(self, query): 53 | query = list(jieba.cut(query)) 54 | score_list = [] 55 | for i in range(self.documents_number): 56 | score_list.append(self.get_score(i, query)) 57 | return score_list 58 | 59 | 60 | question_seg = [] 61 | for q in question: 62 | seg = list(jieba.cut(q)) 63 | question_seg.append(seg) 64 | bm = BM25(question_seg) 65 | 66 | 67 | def word_recall(text): 68 | score = bm.get_documents_score(text) 69 | index = np.argsort(-np.array(score))[:10] 70 | candidate = question[index] 71 | return candidate 72 | 73 | 74 | def embedding_recall(text): 75 | e = model.encode(text) 76 | sim = cos_sim(e, embedding)[0] 77 | index = np.argsort(-sim)[:10] 78 | candidate = question[index] 79 | return candidate 80 | 81 | 82 | if __name__ == '__main__': 83 | 84 | while 1: 85 | text = input('text:') 86 | res1 = list(embedding_recall(text)) 87 | print(res1) 88 | res2 = list(word_recall(text)) 89 | print(res2) 90 | res1.extend(res2) 91 | recall_data = list(set(res1)) 92 | rank(text, recall_data) 93 | -------------------------------------------------------------------------------- /rank/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.utils.checkpoint 4 | from typing import Optional 5 | from transformers.models.bert.modeling_bert import BertPreTrainedModel, BertModel 6 | from pytorchltr.loss import LambdaNDCGLoss1, PairwiseLogisticLoss 7 | 8 | 9 | class BertForNDCG(BertPreTrainedModel): 10 | def __init__(self, config): 11 | super().__init__(config) 12 | self.num_labels = config.num_labels 13 | self.config = config 14 | 15 | self.bert = BertModel(config) 16 | classifier_dropout = ( 17 | config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob 18 | ) 19 | self.dropout = nn.Dropout(classifier_dropout) 20 | self.dense = nn.Linear(config.hidden_size, 1) 21 | self.post_init() 22 | 23 | def forward( 24 | self, 25 | input_ids: Optional[torch.Tensor] = None, 26 | attention_mask: Optional[torch.Tensor] = None, 27 | token_type_ids: Optional[torch.Tensor] = None, 28 | position_ids: Optional[torch.Tensor] = None, 29 | head_mask: Optional[torch.Tensor] = None, 30 | inputs_embeds: Optional[torch.Tensor] = None, 31 | output_attentions: Optional[bool] = None, 32 | output_hidden_states: Optional[bool] = None, 33 | return_dict: Optional[bool] = None, 34 | num=None): 35 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 36 | 37 | outputs = self.bert( 38 | input_ids, 39 | attention_mask=attention_mask, 40 | token_type_ids=token_type_ids, 41 | position_ids=position_ids, 42 | head_mask=head_mask, 43 | 
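            # The pooled [CLS] vector below is reduced to a single relevance logit per
            # (query, document) pair by the dense head. When `num` is given, every
            # consecutive group of `num` logits is viewed as one ranked list and scored
            # with LambdaNDCGLoss1 against the fixed relevance labels 5..1; with
            # num=None the raw logits are returned, which is what rank.py relies on at
            # inference time.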
inputs_embeds=inputs_embeds, 44 | output_attentions=output_attentions, 45 | output_hidden_states=output_hidden_states, 46 | return_dict=return_dict, 47 | ) 48 | 49 | pooled_output = outputs[1] 50 | 51 | pooled_output = self.dropout(pooled_output) 52 | logits = self.dense(pooled_output) 53 | 54 | if num is None: 55 | return logits 56 | 57 | loss_fct = LambdaNDCGLoss1() 58 | 59 | score = logits.view(-1, num) 60 | batch = score.shape[0] 61 | 62 | label = torch.arange(5, 0, -1).squeeze(0).repeat(batch, 1).to(logits.device) 63 | n = torch.Tensor([num] * batch).to(logits.device) 64 | loss = loss_fct(score, label, n) 65 | 66 | loss = torch.mean(loss) 67 | return loss, score 68 | -------------------------------------------------------------------------------- /rank/rank.py: -------------------------------------------------------------------------------- 1 | from utils import get_device 2 | import torch 3 | from transformers import BertTokenizer 4 | from model import BertForNDCG 5 | import numpy as np 6 | 7 | device = get_device() 8 | model_path = 'E:\\ptm\\roberta' 9 | 10 | tokenizer = BertTokenizer.from_pretrained(model_path) 11 | model = BertForNDCG.from_pretrained(model_path) 12 | model.load_state_dict(torch.load('best_model.bin', map_location=device)) 13 | model.to(device) 14 | model = model.eval() 15 | 16 | 17 | def inference(text1, text2): 18 | encoding = tokenizer([text1] * len(text2), 19 | text2, 20 | max_length=128, 21 | truncation=True, 22 | padding=True, 23 | return_tensors='pt') 24 | 25 | with torch.no_grad(): 26 | res = model(**encoding.to(device)) 27 | logits = res.cpu().numpy().flatten() 28 | return logits 29 | 30 | 31 | def rank(query, docunment): 32 | res = inference(query, docunment) 33 | index = np.argsort(-res)[:5] 34 | print(np.array(docunment)[index]) 35 | return index 36 | 37 | -------------------------------------------------------------------------------- /rank/train_ndcg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from transformers import BertTokenizer 3 | from model import BertForNDCG 4 | import pandas as pd 5 | import torch 6 | from utils import get_device 7 | from tqdm import tqdm 8 | from torch.utils.data import DataLoader, Dataset 9 | 10 | 11 | def dcg(score): 12 | index = list(range(1, len(score[0]) + 1)) 13 | return score[:, 0] + np.sum(score[:, 1:] / np.log2(index[1:]), axis=1) 14 | 15 | 16 | def ndcg(score): 17 | if not isinstance(score, np.ndarray): 18 | score = np.array(score) 19 | if score.ndim == 1: 20 | score = score[None, :] 21 | dcg_score = dcg(score) 22 | idcg_score = dcg(np.sort(score[0][None, :])[:, ::-1]) 23 | ndcg_socre = dcg_score / idcg_score 24 | return ndcg_socre 25 | 26 | 27 | class BaseDataset(Dataset): 28 | def __init__(self, encodings, labels=None): 29 | self.encodings = encodings 30 | self.labels = labels 31 | 32 | def __getitem__(self, idx): 33 | item = {key: val[idx].clone().detach() for key, val in self.encodings.items()} 34 | if self.labels is not None: 35 | item['labels'] = torch.tensor(self.labels[idx]) 36 | return item 37 | 38 | def __len__(self): 39 | return len(self.encodings['input_ids']) 40 | 41 | 42 | batch_size = 20 43 | device = get_device() 44 | path = 'E:\\ptm\\roberta' 45 | # path = ptm_path('roberta') 46 | tokenizer = BertTokenizer.from_pretrained(path) 47 | model = BertForNDCG.from_pretrained(path).to(device) 48 | 49 | data = pd.read_csv('../data/rank/sort_data.csv') 50 | text = data['text'].tolist() 51 | all_text = [] 52 | query = [] 53 | document = [] 54 
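# sort_data.csv is assumed to come in blocks of five rows: the first row of a block is the
# query and the five rows are its candidate documents ordered from most to least relevant.
# The loop below therefore reuses every fifth text as the query for its block, which is what
# makes the fixed labels 5..1 inside BertForNDCG and the num=5 passed during training and
# evaluation meaningful.
#
# Worked example for the ndcg() helper above (this DCG variant scores rel_i / log2(i), with
# no discount at rank 1):
#   ndcg([[5, 4, 3, 2, 1]]) -> [1.0]     # ideal ordering
#   ndcg([[1, 2, 3, 4, 5]]) -> [~0.73]   # reversed ordering scores lower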
| 55 | q = '' 56 | for i, t in enumerate(text): 57 | if i % 5 == 0: 58 | q = t 59 | query.append(q) 60 | document.append(t) 61 | encoding = tokenizer(query[:-1000], document[:-1000], truncation=True, padding=True, max_length=128, 62 | return_tensors='pt') 63 | encoding_dev = tokenizer(query[-1000:], document[-1000:], truncation=True, padding=True, max_length=64, 64 | return_tensors='pt') 65 | 66 | train_loader = DataLoader(BaseDataset(encoding), batch_size=batch_size) 67 | dev_loader = DataLoader(BaseDataset(encoding_dev), batch_size=batch_size) 68 | 69 | 70 | def dev_func(): 71 | model.eval() 72 | all_ndcg = [] 73 | with torch.no_grad(): 74 | for data in tqdm(dev_loader): 75 | outputs = model(input_ids=data['input_ids'].to(device), 76 | attention_mask=data['attention_mask'].to(device), 77 | token_type_ids=data['token_type_ids'].to(device), 78 | num=5) 79 | logits = outputs[1] 80 | score = torch.argsort(logits) + 1 81 | a = ndcg(score.cpu().numpy()) 82 | all_ndcg.extend(a) 83 | ndcg_score = np.mean(all_ndcg) 84 | print('ndcg:', ndcg_score) 85 | return ndcg_score 86 | 87 | 88 | opt = torch.optim.Adam(lr=5e-5, params=model.parameters()) 89 | best_ndcg = 0 90 | for epoch in range(10): 91 | model.train() 92 | pbar = tqdm(train_loader) 93 | for data in pbar: 94 | opt.zero_grad() 95 | outputs = model(input_ids=data['input_ids'].to(device), 96 | attention_mask=data['attention_mask'].to(device), 97 | token_type_ids=data['token_type_ids'].to(device), 98 | num=5) 99 | loss, score = outputs[0], outputs[1] 100 | loss.backward() 101 | opt.step() 102 | 103 | pbar.update() 104 | pbar.set_description(f'loss:{loss.item():.4f}') 105 | 106 | cur_ndcg = dev_func() 107 | if cur_ndcg > best_ndcg: 108 | best_ndcg = cur_ndcg 109 | torch.save(model.state_dict(), 'best_model.bin') 110 | -------------------------------------------------------------------------------- /text_classification/bert.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import accuracy_score 2 | from transformers import BertForSequenceClassification, BertTokenizer 3 | from torch.utils.data import DataLoader, Dataset 4 | from tqdm import tqdm 5 | from utils import fix_seed 6 | import torch 7 | 8 | path = 'E:\\ptm\\roberta' 9 | tokenizer = BertTokenizer.from_pretrained(path) 10 | 11 | 12 | class BaseDataset(Dataset): 13 | def __init__(self, encodings, labels=None): 14 | self.encodings = encodings 15 | self.labels = labels 16 | 17 | def __getitem__(self, idx): 18 | item = {key: val[idx].clone().detach() for key, val in self.encodings.items()} 19 | if self.labels is not None: 20 | item['labels'] = torch.tensor(self.labels[idx]) 21 | return item 22 | 23 | def __len__(self): 24 | return len(self.encodings['input_ids']) 25 | 26 | 27 | def load_data(batch_size=32): 28 | train_text = [] 29 | train_label = [] 30 | with open('../data/sentiment/sentiment.train.data', encoding='utf-8')as file: 31 | for line in file.readlines(): 32 | t, l = line.strip().split('\t') 33 | train_text.append(t) 34 | train_label.append(int(l)) 35 | 36 | train_text = tokenizer(text=train_text, 37 | return_tensors='pt', 38 | truncation=True, 39 | padding=True, 40 | max_length=10) 41 | 42 | train_loader = DataLoader(BaseDataset(train_text, train_label), 43 | batch_size, 44 | pin_memory=True if torch.cuda.is_available() else False, 45 | shuffle=False) 46 | 47 | dev_text = [] 48 | dev_label = [] 49 | with open('../data/sentiment/sentiment.valid.data', encoding='utf-8')as file: 50 | for line in file.readlines(): 51 | t, l = 
line.strip().split('\t') 52 | dev_text.append(t) 53 | dev_label.append(int(l)) 54 | 55 | dev_text = tokenizer(text=dev_text, 56 | return_tensors='pt', 57 | truncation=True, 58 | padding=True, 59 | max_length=10) 60 | 61 | dev_loader = DataLoader(BaseDataset(dev_text, dev_label), 62 | batch_size, 63 | pin_memory=True if torch.cuda.is_available() else False, 64 | shuffle=False) 65 | 66 | return train_loader, dev_loader 67 | 68 | 69 | # 训练模型 70 | def train(): 71 | fix_seed() 72 | 73 | train_data_loader, dev_data_loader = load_data(128) 74 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 75 | model = BertForSequenceClassification.from_pretrained(path, num_labels=2) 76 | model = model.to(device) 77 | optimizer = torch.optim.Adam(model.parameters(), lr=5e-5) 78 | 79 | for epoch in range(5): 80 | print('epoch:', epoch + 1) 81 | pred = [] 82 | label = [] 83 | pbar = tqdm(train_data_loader) 84 | for data in pbar: 85 | optimizer.zero_grad() 86 | 87 | input_ids = data['input_ids'].to(device) 88 | attention_mask = data['attention_mask'].to(device) 89 | labels = data['labels'].to(device).long() 90 | 91 | outputs = model(input_ids, attention_mask=attention_mask, labels=labels) 92 | output = outputs.logits.argmax(1).cpu().numpy() 93 | pred.extend(output) 94 | label.extend(labels.cpu().numpy()) 95 | loss = outputs.loss 96 | loss.backward() 97 | 98 | optimizer.step() 99 | 100 | pbar.update() 101 | pbar.set_description(f'loss:{loss.item():.4f}') 102 | 103 | acc = accuracy_score(pred, label) 104 | print('train acc:', acc) 105 | 106 | pred = [] 107 | label = [] 108 | for data in tqdm(dev_data_loader): 109 | input_ids = data['input_ids'].to(device) 110 | attention_mask = data['attention_mask'].to(device) 111 | labels = data['labels'].to(device).long() 112 | with torch.no_grad(): 113 | outputs = model(input_ids, attention_mask=attention_mask, labels=labels) 114 | output = outputs.logits.argmax(1).cpu().numpy() 115 | pred.extend(output) 116 | label.extend(labels.cpu().numpy()) 117 | acc = accuracy_score(pred, label) 118 | print('dev acc:', acc) 119 | print() 120 | 121 | 122 | if __name__ == '__main__': 123 | train() 124 | -------------------------------------------------------------------------------- /text_classification/text_classification.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import jieba 4 | import numpy as np 5 | from collections import defaultdict 6 | from torch.utils.data import DataLoader, TensorDataset 7 | from sklearn.metrics import accuracy_score 8 | from utils import fix_seed 9 | 10 | 11 | class TextCLS(torch.nn.Module): 12 | # 准备我们需要用到的参数和layer 13 | def __init__(self, 14 | vocab_size, 15 | embedding_size): 16 | super().__init__() 17 | self.embedding = nn.Embedding(vocab_size, embedding_size) 18 | # [batch_size, seq_len, hidden_size] 19 | self.lstm = nn.LSTM(input_size=embedding_size, 20 | hidden_size=256, 21 | num_layers=2, 22 | batch_first=True) 23 | self.dense1 = nn.Linear(256, 100) 24 | self.dense2 = nn.Linear(100, 2) 25 | 26 | # 前向传播,那我们准备好的layer拼接在一起 27 | def forward(self, x): 28 | embedding = self.embedding(x) 29 | # [batch_size, seq_len, hidden_size] 30 | out, _ = self.lstm(embedding) 31 | out = self.dense1(out[:, -1, :]) 32 | out = self.dense2(out) 33 | return out 34 | 35 | 36 | def tokenize(string): 37 | res = list(jieba.cut(string, cut_all=False)) 38 | return res 39 | 40 | 41 | # 把数据转换成index 42 | def seq2index(seq, vocab): 43 | seg = tokenize(seq) 44 | seg_index 
= [] 45 | for s in seg: 46 | seg_index.append(vocab.get(s, 1)) 47 | return seg_index 48 | 49 | 50 | # 统一长度 51 | def padding_seq(X, max_len=10): 52 | return np.array([ 53 | np.concatenate([x, [0] * (max_len - len(x))]) if len(x) < max_len else x[:max_len] for x in X 54 | ]) 55 | 56 | 57 | def load_data(batch_size=32): 58 | train_text = [] 59 | train_label = [] 60 | with open('../data/sentiment/sentiment.train.data', encoding='utf-8')as file: 61 | for line in file.readlines(): 62 | t, l = line.strip().split('\t') 63 | train_text.append(t) 64 | train_label.append(int(l)) 65 | 66 | dev_text = [] 67 | dev_label = [] 68 | with open('../data/sentiment/sentiment.valid.data', encoding='utf-8')as file: 69 | for line in file.readlines(): 70 | t, l = line.strip().split('\t') 71 | dev_text.append(t) 72 | dev_label.append(int(l)) 73 | 74 | # 生成词典 75 | segment = [tokenize(t) for t in train_text] 76 | 77 | word_frequency = defaultdict(int) 78 | for row in segment: 79 | for i in row: 80 | word_frequency[i] += 1 81 | 82 | word_sort = sorted(word_frequency.items(), key=lambda x: x[1], reverse=True) # 根据词频降序排序 83 | 84 | vocab = {'[PAD]': 0, '[UNK]': 1} 85 | for d in word_sort: 86 | vocab[d[0]] = len(vocab) 87 | 88 | train_x = padding_seq([seq2index(t, vocab) for t in train_text]) 89 | train_y = np.array(train_label) 90 | train_data_set = TensorDataset(torch.from_numpy(train_x), 91 | torch.from_numpy(train_y)) 92 | train_data_loader = DataLoader(dataset=train_data_set, batch_size=batch_size) 93 | 94 | dev_x = padding_seq([seq2index(t, vocab) for t in dev_text]) 95 | dev_y = np.array(dev_label) 96 | dev_data_set = TensorDataset(torch.from_numpy(dev_x), 97 | torch.from_numpy(dev_y)) 98 | dev_data_loader = DataLoader(dataset=dev_data_set, batch_size=batch_size) 99 | 100 | return train_data_loader, dev_data_loader, vocab 101 | 102 | 103 | # 训练模型 104 | def train(): 105 | fix_seed() 106 | 107 | train_data_loader, dev_data_loader, vocab = load_data(128) 108 | model = TextCLS(vocab_size=len(vocab), 109 | embedding_size=100) 110 | 111 | optimizer = torch.optim.Adam(model.parameters(), lr=0.01) 112 | loss_func = nn.CrossEntropyLoss() 113 | 114 | if torch.cuda.is_available(): 115 | model = model.cuda() 116 | 117 | for epoch in range(5): 118 | print('epoch:', epoch + 1) 119 | pred = [] 120 | label = [] 121 | for step, (b_x, b_y) in enumerate(train_data_loader): 122 | if torch.cuda.is_available(): 123 | b_x = b_x.cuda().long() 124 | b_y = b_y.cuda().long() 125 | output = model(b_x) 126 | pred.extend(torch.argmax(output, dim=1).cpu().numpy()) 127 | label.extend(b_y.cpu().numpy()) 128 | loss = loss_func(output, b_y) 129 | optimizer.zero_grad() 130 | # 求解梯度 131 | loss.backward() 132 | # 更新我们的权重 133 | optimizer.step() 134 | acc = accuracy_score(pred, label) 135 | print('train acc:', acc) 136 | 137 | pred = [] 138 | label = [] 139 | for step, (b_x, b_y) in enumerate(dev_data_loader): 140 | if torch.cuda.is_available(): 141 | b_x = b_x.cuda().long() 142 | b_y = b_y.cuda().long() 143 | with torch.no_grad(): 144 | output = model(b_x) 145 | pred.extend(torch.argmax(output, dim=1).cpu().numpy()) 146 | label.extend(b_y.cpu().numpy()) 147 | acc = accuracy_score(pred, label) 148 | print('dev acc:', acc) 149 | print() 150 | 151 | 152 | if __name__ == '__main__': 153 | train() 154 | -------------------------------------------------------------------------------- /text_representation/sentence_embedding.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from tqdm 
import tqdm 4 | from transformers import BertModel, BertTokenizer 5 | 6 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 7 | 8 | 9 | class SentenceEmbedding: 10 | def __init__(self): 11 | model_path = 'E:\\ptm\\simbert' 12 | self.model = BertModel.from_pretrained(model_path) 13 | self.model.to(device) 14 | self.model.eval() 15 | self.tokenizer = BertTokenizer.from_pretrained(model_path) 16 | 17 | def encode(self, content, batch_size=256, max_length=None, padding='max_length'): 18 | outputs = None 19 | if isinstance(content, list) and len(content) > batch_size: 20 | for epoch in tqdm(range(len(content) // batch_size + 1)): 21 | batch_content = content[epoch * batch_size:(epoch + 1) * batch_size] 22 | if batch_content: 23 | output = self._embedding(batch_content, max_length, padding) 24 | if outputs is None: 25 | outputs = output 26 | else: 27 | outputs = np.concatenate([outputs, output], axis=0) 28 | return outputs 29 | else: 30 | return self._embedding(content, max_length, padding) 31 | 32 | def _embedding(self, content, max_length, padding): 33 | 34 | if max_length is None: 35 | if isinstance(content, str): 36 | max_length = len(content) + 2 37 | else: 38 | max_length = max([len(c) for c in content]) + 2 39 | max_length = min(max_length, 512) 40 | inputs = self.tokenizer(content, 41 | return_tensors="pt", 42 | truncation=True, 43 | padding=padding, 44 | max_length=max_length) 45 | with torch.no_grad(): 46 | outputs = self.model(**inputs.to(device)) 47 | output = outputs[1].cpu().numpy() 48 | 49 | return output 50 | -------------------------------------------------------------------------------- /text_representation/synonym.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gensim 3 | 4 | model = gensim.models.Word2Vec.load('word2vec/wiki.model') 5 | embedding = model.wv 6 | 7 | 8 | def cosine(a, b): 9 | return np.matmul(a, b.T) / np.linalg.norm(a) / np.linalg.norm(b, axis=-1) 10 | 11 | 12 | def search(word, topk=3): 13 | we = embedding[word] 14 | similarity = cosine(we, embedding.vectors) 15 | index = np.argsort(-similarity) 16 | w = np.array(embedding.index2word)[index[0:topk]] 17 | print(w) 18 | 19 | 20 | if __name__ == '__main__': 21 | while 1: 22 | text = input('word:') 23 | search(text) 24 | -------------------------------------------------------------------------------- /text_representation/word2vec/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/terrifyzhao/nlp_tutorial/fa5cfdf732972469bfce2c452c07bec2077ba407/text_representation/word2vec/.gitkeep -------------------------------------------------------------------------------- /text_representation/word2vec_gensim.py: -------------------------------------------------------------------------------- 1 | from gensim.models import Word2Vec 2 | from gensim.models.word2vec import LineSentence 3 | import multiprocessing 4 | 5 | input_file = 'word2vec/wiki.txt' 6 | out_file = 'word2vec/wiki.model' 7 | 8 | model = Word2Vec(LineSentence(input_file), 9 | size=100, 10 | window=5, 11 | min_count=5, 12 | workers=multiprocessing.cpu_count(), 13 | sg=1, 14 | hs=0, 15 | negative=5) 16 | 17 | model.save(out_file) 18 | -------------------------------------------------------------------------------- /text_similarity/dssm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class
DSSM(nn.Module): 6 | def __init__(self, 7 | char_vocab_size, 8 | char_dim=100, 9 | hidden_size=128): 10 | super(DSSM, self).__init__() 11 | 12 | self.char_embedding = nn.Embedding(char_vocab_size, char_dim) 13 | 14 | self.fc1 = nn.Linear(100, hidden_size) 15 | self.fc2 = nn.Linear(hidden_size, hidden_size) 16 | 17 | self.dropout = nn.Dropout(0.2) 18 | 19 | def forward(self, char_p, char_q): 20 | p_embedding = self.char_embedding(char_p.long()) 21 | q_embedding = self.char_embedding(char_q.long()) 22 | 23 | p = torch.tanh(self.fc1(p_embedding)) 24 | q = torch.tanh(self.fc1(q_embedding)) 25 | p = self.dropout(p) 26 | q = self.dropout(q) 27 | p = self.fc2(p) 28 | q = self.fc2(q) 29 | 30 | p = torch.mean(p, dim=1) 31 | q = torch.mean(q, dim=1) 32 | 33 | cosine = torch.cosine_similarity(p, q) 34 | cosine[cosine < 0] = 0 35 | 36 | return cosine 37 | -------------------------------------------------------------------------------- /text_similarity/esim.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class ESIM(nn.Module): 6 | def __init__(self, 7 | vocab_size, 8 | embedding_size=100, 9 | hidden_size=128, 10 | max_len=10): 11 | super(ESIM, self).__init__() 12 | 13 | # Word Representation Layer 14 | self.char_embedding = nn.Embedding(vocab_size, embedding_size) 15 | # self.word_embedding = nn.Embedding(word_vocab_size, word_dim) 16 | 17 | self.char_LSTM = nn.LSTM( 18 | input_size=embedding_size, 19 | hidden_size=hidden_size, 20 | num_layers=1, 21 | bidirectional=True, 22 | batch_first=True) 23 | 24 | # Context Representation Layer 25 | self.context_LSTM = nn.LSTM( 26 | input_size=hidden_size * 8, 27 | hidden_size=hidden_size, 28 | num_layers=1, 29 | bidirectional=True, 30 | batch_first=True) 31 | 32 | # ----- Prediction Layer ----- 33 | self.max_pool1 = nn.MaxPool2d((max_len, 1)) 34 | self.max_pool2 = nn.MaxPool2d((max_len, 1)) 35 | 36 | self.fc1 = nn.Linear(hidden_size * 8, hidden_size) 37 | self.fc2 = nn.Linear(hidden_size, 2) 38 | 39 | self.dropout = nn.Dropout(0.2) 40 | 41 | def forward(self, char_p, char_q): 42 | p_embedding, _ = self.char_LSTM(self.char_embedding(char_p.long())) 43 | q_embedding, _ = self.char_LSTM(self.char_embedding(char_q.long())) 44 | 45 | p_embedding = self.dropout(p_embedding) 46 | q_embedding = self.dropout(q_embedding) 47 | 48 | # attention 49 | e = torch.matmul(p_embedding, torch.transpose(q_embedding, 1, 2)) 50 | p_hat = torch.matmul(torch.softmax(e, dim=2), q_embedding) 51 | q_hat = torch.matmul(torch.transpose(torch.softmax(e, dim=1), 1, 2), p_embedding) 52 | 53 | p_cat = torch.cat([p_embedding, p_hat, p_embedding - p_hat, p_embedding * p_hat], dim=2) 54 | q_cat = torch.cat([q_embedding, q_hat, q_embedding - q_hat, q_embedding * q_hat], dim=2) 55 | 56 | p, _ = self.context_LSTM(p_cat) 57 | q, _ = self.context_LSTM(q_cat) 58 | 59 | p_max = self.max_pool1(p).squeeze(dim=1) 60 | q_max = self.max_pool2(q).squeeze(dim=1) 61 | 62 | p_mean = torch.mean(p, dim=1) 63 | q_mean = torch.mean(q, dim=1) 64 | 65 | x = torch.cat([p_max, q_max, p_mean, q_mean], dim=1) 66 | x = self.dropout(x) 67 | 68 | # ----- Prediction Layer ----- 69 | x = torch.tanh(self.fc1(x)) 70 | x = self.dropout(x) 71 | x = self.fc2(x) 72 | return x 73 | -------------------------------------------------------------------------------- /text_similarity/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from collections import 
defaultdict 4 | from torch.utils.data import DataLoader, TensorDataset 5 | from sklearn.metrics import accuracy_score 6 | from text_similarity.esim import ESIM 7 | from text_similarity.dssm import DSSM 8 | from utils import * 9 | import pandas as pd 10 | from tqdm import tqdm 11 | 12 | 13 | def load_data(batch_size=32): 14 | train = pd.read_csv('../data/LCQMC/lcqmc_train.csv') 15 | dev = pd.read_csv('../data/LCQMC/lcqmc_dev.csv') 16 | 17 | text = train['sentence1'].tolist() 18 | text.extend(train['sentence2'].tolist()) 19 | text.extend(dev['sentence1'].tolist()) 20 | text.extend(dev['sentence2'].tolist()) 21 | 22 | # 生成词典 23 | segment = [tokenize(t) for t in text] 24 | 25 | word_frequency = defaultdict(int) 26 | for row in segment: 27 | for i in row: 28 | word_frequency[i] += 1 29 | 30 | word_sort = sorted(word_frequency.items(), key=lambda x: x[1], reverse=True) # 根据词频降序排序 31 | 32 | vocab = {'[PAD]': 0, '[UNK]': 1} 33 | for d in word_sort: 34 | vocab[d[0]] = len(vocab) 35 | 36 | train_x1 = padding_seq([seq2index(t, vocab) for t in train['sentence1'].tolist()]) 37 | train_x2 = padding_seq([seq2index(t, vocab) for t in train['sentence2'].tolist()]) 38 | train_data_set = TensorDataset(torch.from_numpy(train_x1), 39 | torch.from_numpy(train_x2), 40 | torch.from_numpy(train['label'].values)) 41 | train_data_loader = DataLoader(dataset=train_data_set, batch_size=batch_size) 42 | 43 | dev_x1 = padding_seq([seq2index(t, vocab) for t in dev['sentence1'].tolist()]) 44 | dev_x2 = padding_seq([seq2index(t, vocab) for t in dev['sentence2'].tolist()]) 45 | dev_data_set = TensorDataset(torch.from_numpy(dev_x1), 46 | torch.from_numpy(dev_x2), 47 | torch.from_numpy(dev['label'].values)) 48 | dev_data_loader = DataLoader(dataset=dev_data_set, batch_size=batch_size) 49 | 50 | return train_data_loader, dev_data_loader, vocab 51 | 52 | 53 | # 训练模型 54 | def train(): 55 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 56 | train_data_loader, dev_data_loader, vocab = load_data(32) 57 | model = ESIM(vocab_size=len(vocab), 58 | embedding_size=100, 59 | hidden_size=128, 60 | max_len=10) 61 | # model = DSSM(vocab_len=len(vocab), 62 | # embedding_size=100, 63 | # hidden_size=128) 64 | model = model.to(device) 65 | optimizer = torch.optim.Adam(model.parameters(), lr=0.01) 66 | # loss_func = nn.BCELoss() 67 | loss_func = nn.CrossEntropyLoss() 68 | 69 | for epoch in range(5): 70 | pred = [] 71 | label = [] 72 | for step, (x1, x2, y) in tqdm(enumerate(train_data_loader)): 73 | x1 = x1.to(device) 74 | x2 = x2.to(device) 75 | y = y.to(device) 76 | 77 | # 前向传播 78 | output = model(x1.long(), x2.long()) 79 | loss = loss_func(output, y) 80 | optimizer.zero_grad() 81 | 82 | pred.extend(torch.argmax(output.detach().cpu(), dim=1).numpy()) 83 | label.extend(y.cpu().numpy()) 84 | 85 | # 反向传播 86 | loss.backward() 87 | # 更新我们的权重 88 | optimizer.step() 89 | acc = accuracy_score(pred, label) 90 | print('train acc:', acc) 91 | 92 | pred = [] 93 | label = [] 94 | for step, (x1, x2, y) in tqdm(enumerate(dev_data_loader)): 95 | x1 = x1.to(device) 96 | x2 = x2.to(device) 97 | y = y.to(device) 98 | with torch.no_grad(): 99 | output = model(x1.long(), x2.long()) 100 | pred.extend(torch.argmax(output.detach().cpu(), dim=1).numpy()) 101 | label.extend(y.cpu().numpy()) 102 | acc = accuracy_score(pred, label) 103 | print('dev acc:', acc) 104 | 105 | 106 | if __name__ == '__main__': 107 | train() 108 | -------------------------------------------------------------------------------- /utils.py: 
-------------------------------------------------------------------------------- 1 | import jieba 2 | import numpy as np 3 | import os 4 | import random 5 | import torch 6 | 7 | 8 | def tokenize(string): 9 | res = list(jieba.cut(string, cut_all=False)) 10 | return res 11 | 12 | 13 | # 把数据转换成index 14 | def seq2index(seq, vocab): 15 | seg = tokenize(seq) 16 | seg_index = [] 17 | for s in seg: 18 | seg_index.append(vocab.get(s, 1)) 19 | return seg_index 20 | 21 | 22 | # 统一长度 23 | def padding_seq(X, max_len=10): 24 | return np.array([ 25 | np.concatenate([x, [0] * (max_len - len(x))]) if len(x) < max_len else x[:max_len] for x in X 26 | ]) 27 | 28 | 29 | def fix_seed(seed=3407): 30 | random.seed(seed) 31 | os.environ['PYTHONHASHSEED'] = str(seed) 32 | np.random.seed(seed) 33 | torch.manual_seed(seed) 34 | torch.cuda.manual_seed(seed) 35 | torch.backends.cudnn.deterministic = True 36 | 37 | 38 | def random_mask(input_ids, tokenizer): 39 | length = len(input_ids) 40 | # 移除pad cls sep 41 | input_ids = input_ids[1:-1] 42 | prob = np.random.random(len(input_ids)) 43 | source, target = [], [] 44 | # cls:[101] 45 | source.append(101) 46 | target.append(-100) 47 | # p->[0:1] 48 | for p, ids in zip(prob, input_ids): 49 | if p < 0.15 * 0.8: 50 | source.append(tokenizer.mask_token_id) 51 | target.append(ids) 52 | elif p < 0.15 * 0.9: 53 | source.append(ids) 54 | target.append(ids) 55 | elif p < 0.15: 56 | source.append(np.random.choice(tokenizer.vocab_size)) 57 | target.append(ids) 58 | else: 59 | source.append(ids) 60 | target.append(-100) 61 | # sep:[102] 62 | source.append(102) 63 | target.append(-100) 64 | while len(source) < length: 65 | source.append(0) 66 | target.append(-100) 67 | return source, target 68 | 69 | 70 | def punctuation(): 71 | import string 72 | en_punctuation = list(string.punctuation) 73 | zh_punctuation = [',', '。', ':', '!', '?', '《', '》', '"', ';', "'"] 74 | return en_punctuation + zh_punctuation 75 | 76 | 77 | def get_device(): 78 | return torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 79 | 80 | 81 | def one_hot(x, n_class): 82 | return torch.nn.functional.one_hot(x, num_classes=n_class) 83 | 84 | 85 | def cos_sim(a, b): 86 | a = np.array(a) 87 | b = np.array(b) 88 | if len(a.shape) == 1: 89 | a = a[None, :] 90 | if len(b.shape) == 1: 91 | b = b[None, :] 92 | 93 | a_norm = a / np.linalg.norm(a, axis=-1)[:, None] 94 | b_norm = b / np.linalg.norm(b, axis=-1)[:, None] 95 | cosine = np.matmul(a_norm, b_norm.T) 96 | return cosine 97 | --------------------------------------------------------------------------------
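# Illustrative example (not from the original files): how utils.random_mask drives the
# masked-LM post-training script in ptm/. A minimal sketch; it assumes the repository root
# is on PYTHONPATH (the scripts above already import `from utils import ...` that way) and
# that the local checkpoint path 'E:\\ptm\\roberta' used throughout the repo points at a
# Chinese BERT/RoBERTa model, so substitute any compatible checkpoint. Of the ~15% of
# tokens random_mask selects, 80% become [MASK], 10% stay unchanged, 10% are replaced by a
# random token; every unselected position gets the ignore label -100 expected by
# BertForMaskedLM.
from transformers import BertTokenizer
from utils import random_mask

tokenizer = BertTokenizer.from_pretrained('E:\\ptm\\roberta')
input_ids = tokenizer('今天天气真不错')['input_ids']   # [CLS] ... [SEP]
source, target = random_mask(input_ids, tokenizer)
print(tokenizer.convert_ids_to_tokens(source))        # tokens fed to the model
print(target)                                         # label ids, -100 means "ignore"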
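# Illustrative example (not from the original files): utils.cos_sim on toy vectors.
# cos_sim L2-normalises its inputs and returns the full pairwise cosine matrix, which is
# how rank/main.py compares a query embedding against the precomputed question embeddings
# before taking the top-10 with np.argsort(-sim).
import numpy as np
from utils import cos_sim

a = np.array([1.0, 0.0])
b = np.array([[1.0, 0.0], [0.0, 1.0], [-1.0, 0.0]])
print(cos_sim(a, b))   # [[ 1.  0. -1.]], one row per vector in a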
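# Illustrative example (not from the original files): the preprocessing helpers defined in
# utils.py above and mirrored in text_classification/text_classification.py. seq2index maps
# jieba tokens to ids (1 is [UNK] for out-of-vocabulary words) and padding_seq pads or
# truncates to the fixed max_len=10 expected by the LSTM/ESIM models. The toy vocabulary
# below is made up for the demo.
from utils import seq2index, padding_seq

vocab = {'[PAD]': 0, '[UNK]': 1, '我': 2, '喜欢': 3, '自然语言': 4, '处理': 5}
ids = [seq2index(t, vocab) for t in ['我喜欢自然语言处理', '今天天气真不错']]
print(ids)              # e.g. [[2, 3, 4, 5], [1, 1, ...]], depending on jieba's segmentation
print(padding_seq(ids)) # 2 x 10 array, zero-padded on the right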
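# Illustrative example (not from the original files): shape check for the two similarity
# models, assuming the repository root is on PYTHONPATH as text_similarity/train.py already
# requires. Note that DSSM's constructor takes char_vocab_size / char_dim / hidden_size, so
# the commented-out call in train.py (vocab_len=..., embedding_size=...) would need those
# keyword names if re-enabled.
import torch
from text_similarity.esim import ESIM
from text_similarity.dssm import DSSM

p = torch.randint(0, 100, (4, 10))   # batch of 4 sequences padded to max_len=10
q = torch.randint(0, 100, (4, 10))

esim = ESIM(vocab_size=100, embedding_size=100, hidden_size=128, max_len=10)
print(esim(p, q).shape)              # torch.Size([4, 2]), logits for CrossEntropyLoss

dssm = DSSM(char_vocab_size=100, char_dim=100, hidden_size=128)
print(dssm(p, q).shape)              # torch.Size([4]), cosine clipped to [0, 1]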