├── .gitignore ├── NLP └── word2vec │ └── word2vec.py ├── README.md ├── content_embedding └── bert_whitening │ └── all_utils.py ├── halite-banner.gif ├── inference └── onnxruntime_cpp │ ├── bert_onnx_cpp_test.cpp │ └── bert_onxx_test.ipynb ├── notes ├── CS224N-2019 │ ├── CS224N-01-Introduction-and-Word-Vectors.md │ └── img │ │ ├── 2020-06-12-12-42-12.png │ │ ├── 2020-06-12-13-22-31.png │ │ ├── 2020-06-12-13-24-36.png │ │ ├── 2020-06-12-15-11-11.png │ │ ├── 2020-06-12-15-37-09.png │ │ ├── 2020-06-12-15-42-22.png │ │ ├── 微信图片_20200612171355.jpg │ │ └── 微信截图_20200612183129.png └── Word2Vec学习笔记(CS224N笔记及相关论文学习).md ├── text_classification ├── examples │ └── test_demo.py ├── models │ ├── BaseModel.py │ ├── FastText.py │ ├── TextBiLSTM.py │ ├── TextCNN.py │ ├── TextRCNN.py │ ├── TextRNN.py │ └── __init__.py ├── online │ └── utils │ │ ├── ckpt2pb.py │ │ └── ckpt2save.py ├── text_data │ └── raw_data │ │ ├── test.txt │ │ └── train.csv └── utils │ ├── __init__.py │ ├── data_helper.py │ └── generate_w2v.py ├── text_matching ├── esim │ ├── ESIM.py │ └── SoftAttention.py └── sentence-bert │ └── SBERT.py └── tiny_transformer ├── configuration.py ├── dataset.py ├── model.py ├── test_model.py ├── tests └── dataset.py ├── train.py ├── train_data.py ├── trainer.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .ipynb_checkpoints 3 | *.pyc 4 | __pycache__/ 5 | .idea/ 6 | *.zip 7 | data/ 8 | checkpoints/ 9 | input_data/ 10 | w2v_model/ 11 | model_save/ 12 | pb_model/ 13 | save_model/ 14 | 15 | # Byte-compiled / optimized / DLL files 16 | __pycache__/ 17 | *.py[cod] 18 | *$py.class 19 | 20 | # C extensions 21 | *.so 22 | 23 | # Distribution / packaging 24 | .Python 25 | build/ 26 | develop-eggs/ 27 | dist/ 28 | downloads/ 29 | eggs/ 30 | .eggs/ 31 | lib/ 32 | lib64/ 33 | parts/ 34 | sdist/ 35 | var/ 36 | wheels/ 37 | pip-wheel-metadata/ 38 | share/python-wheels/ 39 | *.egg-info/ 40 | .installed.cfg 41 | *.egg 42 | MANIFEST 43 | 44 | # PyInstaller 45 | # Usually these files are written by a python script from a template 46 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 47 | *.manifest 48 | *.spec 49 | 50 | # Installer logs 51 | pip-log.txt 52 | pip-delete-this-directory.txt 53 | 54 | # Unit test / coverage reports 55 | htmlcov/ 56 | .tox/ 57 | .nox/ 58 | .coverage 59 | .coverage.* 60 | .cache 61 | nosetests.xml 62 | coverage.xml 63 | *.cover 64 | *.py,cover 65 | .hypothesis/ 66 | .pytest_cache/ 67 | 68 | # Translations 69 | *.mo 70 | *.pot 71 | 72 | # Django stuff: 73 | *.log 74 | local_settings.py 75 | db.sqlite3 76 | db.sqlite3-journal 77 | 78 | # Flask stuff: 79 | instance/ 80 | .webassets-cache 81 | 82 | # Scrapy stuff: 83 | .scrapy 84 | 85 | # Sphinx documentation 86 | docs/_build/ 87 | 88 | # PyBuilder 89 | target/ 90 | 91 | # Jupyter Notebook 92 | .ipynb_checkpoints 93 | 94 | # IPython 95 | profile_default/ 96 | ipython_config.py 97 | 98 | # pyenv 99 | .python-version 100 | 101 | # pipenv 102 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 103 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 104 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 105 | # install all needed dependencies. 106 | #Pipfile.lock 107 | 108 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 109 | __pypackages__/ 110 | 111 | # Celery stuff 112 | celerybeat-schedule 113 | celerybeat.pid 114 | 115 | # SageMath parsed files 116 | *.sage.py 117 | 118 | # Environments 119 | .env 120 | .venv 121 | env/ 122 | venv/ 123 | ENV/ 124 | env.bak/ 125 | venv.bak/ 126 | 127 | # Spyder project settings 128 | .spyderproject 129 | .spyproject 130 | 131 | # Rope project settings 132 | .ropeproject 133 | 134 | # mkdocs documentation 135 | /site 136 | 137 | # mypy 138 | .mypy_cache/ 139 | .dmypy.json 140 | dmypy.json 141 | 142 | # Pyre type checker 143 | .pyre/ -------------------------------------------------------------------------------- /NLP/word2vec/word2vec.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zspo/NLP-Space/1c9891be0f44cf618d5b7fc1be2b03e12ce7fc24/NLP/word2vec/word2vec.py -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | ![from kaggle](halite-banner.gif) 4 |

NLP-Space

5 |

Papers read, learning notes, and some code

6 | 7 |
8 | 9 | --- 10 | 11 | ### Papers 12 | 13 | | Model | Title | Resources | Remarks | 14 | |-------|----------|------------|------| 15 | |Word2Vec|Efficient Estimation of Word Representations in Vector Space|[[paper]](https://arxiv.org/pdf/1301.3781.pdf)|------| 16 | |negative sampling|Distributed Representations of Words and Phrases and their Compositionality|[[paper]](https://arxiv.org/abs/1310.4546)|------| 17 | |Transformer|Attention Is All You Need|[[paper]](https://arxiv.org/abs/1706.03762)|Google2017| 18 | |Bert|Pre-training of Deep Bidirectional Transformers for Language Understanding|[[paper]](https://arxiv.org/abs/1810.04805)|Google2018| 19 | 20 | 21 | ### Learning-Notes 22 | 23 | [【Stanford CS224N Study Notes】01 - Introduction and Word Vectors](https://zhuanlan.zhihu.com/p/147889351) 24 | [Word2Vec Study Notes (SVD and derivation of the objective)](https://zhuanlan.zhihu.com/p/148779268) 25 | 26 | 27 | ### Text Classification 28 | * [x] Utils 29 | * [x] [generate_w2v](./text_classification/utils/generate_w2v.py): train word embeddings using gensim (see the sketch after this checklist). 30 | * [x] [data_helper](./text_classification/utils/data_helper.py): load the datasets, clean the data, and split them into train and validation sets. 31 | * [x] [BaseModel](./text_classification/models/BaseModel.py): a base model, including parameter initialization, embedding initialization, the loss function and accuracy, and base APIs such as compile, fit, and predict. 32 | * [x] [FastText](./text_classification/models/FastText.py) 33 | * [x] [TextCNN](./text_classification/models/TextCNN.py) 34 | * [x] [TextRNN](./text_classification/models/TextRNN.py) 35 | * [x] [TextBiLSTM](./text_classification/models/TextBiLSTM.py) 36 | * [ ] [TextRCNN](./text_classification/models/TextRCNN.py) 37 | * [ ] HAN 38 | * [ ] BiLSTM+Attention 39 | * [ ] Transformer 40 | * [ ] ...
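A minimal sketch of the `generate_w2v` flow above, with a toy corpus and hypothetical hyperparameters (gensim 4.x API; older gensim versions use `size=` and `iter=` instead of `vector_size=` and `epochs=`):

```python
from gensim.models import Word2Vec

# Toy tokenized corpus; in the repo this would come from the cleaned training texts.
sentences = [["i", "enjoy", "flying"], ["i", "like", "nlp"], ["i", "like", "deep", "learning"]]

model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=1, epochs=10)
model.wv.save_word2vec_format("w2v.txt", binary=False)  # save embeddings in word2vec text format
vec = model.wv["nlp"]                                   # 100-dimensional vector for one token
```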
41 | 42 | ### NER 43 | 44 | * [ ] BiLSTM+CRF 45 | * [ ] Bert+CRF 46 | * [ ] Bert+BiLSTM+CRF 47 | 48 | ### Content Embedding 49 | 50 | * [x] Bert-Whitening 51 | * [x] Sentence-Bert 52 | * [x] SimCSE 53 | * [ ] ESimCSE 54 | 55 | ### Text Matching 56 | 57 | * [ ] Siamese LSTM 58 | * [ ] DSSM 59 | * [x] ESIM 60 | * [ ] DIIN 61 | 62 | ### Text Generation 63 | 64 | * [ ] 65 | 66 | ### Inference 67 | 68 | * [x] ONNX (OnnxRuntime by CPP) 69 | * [ ] TensorRT -------------------------------------------------------------------------------- /content_embedding/bert_whitening/all_utils.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import os 3 | import sys 4 | import torch 5 | import numpy as np 6 | from transformers import BertModel, BertTokenizer 7 | from tqdm import tqdm 8 | import scipy.stats 9 | import pickle 10 | import requests 11 | 12 | 13 | DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 14 | 15 | 16 | def build_model(name): 17 | tokenizer = BertTokenizer.from_pretrained(name) 18 | model = BertModel.from_pretrained(name) 19 | model = model.to(DEVICE) 20 | return tokenizer, model 21 | 22 | 23 | def sent_to_vec(sent, tokenizer, model, pooling, max_length): 24 | with torch.no_grad(): 25 | inputs = tokenizer(sent, return_tensors="pt", padding=True, truncation=True, max_length=max_length) 26 | inputs['input_ids'] = inputs['input_ids'].to(DEVICE) 27 | inputs['token_type_ids'] = inputs['token_type_ids'].to(DEVICE) 28 | inputs['attention_mask'] = inputs['attention_mask'].to(DEVICE) 29 | 30 | # hidden_states = model(**inputs, return_dict=True, output_hidden_states=True).hidden_states 31 | outputs = model(**inputs, output_hidden_states=True) 32 | hidden_states = outputs[2] 33 | 34 | if pooling == 'first_last_avg': 35 | output_hidden_state = (hidden_states[-1] + hidden_states[1]).mean(dim=1) 36 | elif pooling == 'last_avg': 37 | output_hidden_state = (hidden_states[-1]).mean(dim=1) 38 | elif pooling == 'last2avg': 39 | output_hidden_state = (hidden_states[-1] + hidden_states[-2]).mean(dim=1) 40 | elif pooling == 'cls': 41 | # output_hidden_state = (hidden_states[-1])[:, 0, :] 42 | output_hidden_state = outputs[1] 43 | else: 44 | raise Exception("unknown pooling {}".format(POOLING)) 45 | 46 | vec = output_hidden_state.cpu().numpy() 47 | return vec 48 | 49 | 50 | def sents_to_vecs(sents, tokenizer, model, pooling, max_length, batch_size=64): 51 | vecs = [] 52 | if batch_size: 53 | for i in tqdm(range(int(len(sents) / batch_size)+1)): 54 | m, n = i*batch_size, (i+1)*batch_size 55 | sent = sents[m:n] 56 | vec = sent_to_vec(sent, tokenizer, model, pooling, max_length) 57 | vecs.append(vec) 58 | vecs = np.concatenate(vecs) 59 | assert len(sents) == vecs.shape[0] 60 | else: 61 | for sent in tqdm(sents): 62 | vec = sent_to_vec(sent, tokenizer, model, pooling, max_length) 63 | vecs.append(vec[0]) 64 | assert len(sents) == len(vecs) 65 | vecs = np.array(vecs) 66 | return vecs 67 | 68 | 69 | def calc_spearmanr_corr(x, y): 70 | return scipy.stats.spearmanr(x, y).correlation 71 | 72 | 73 | # def compute_kernel_bias(vecs): 74 | # """计算kernel和bias 75 | # 最后的变换:y = (x + bias).dot(kernel) 76 | # """ 77 | # vecs = np.concatenate(vecs, axis=0) 78 | # mu = vecs.mean(axis=0, keepdims=True) 79 | # cov = np.cov(vecs.T) 80 | # u, s, vh = np.linalg.svd(cov) 81 | # W = np.dot(u, np.diag(1/np.sqrt(s))) 82 | # return W, -mu 83 | 84 | 85 | def compute_kernel_bias(vecs, n_components): 86 | """计算kernel和bias 87 | 最后的变换:y = (x + bias).dot(kernel) 88 | """ 
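    # The SVD below factorizes cov = U * diag(s) * U^T. The kernel computed here
    # equals U * diag(s ** -0.5) truncated to its first n_components columns, so
    # y = (x + bias).dot(kernel) = (x - mu) @ W has identity covariance on the
    # kept components (whitening) while also reducing the dimension.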
89 | vecs = np.concatenate(vecs, axis=0) 90 | mu = vecs.mean(axis=0, keepdims=True) 91 | cov = np.cov(vecs.T) 92 | u, s, vh = np.linalg.svd(cov) 93 | W = np.dot(u, np.diag(s**0.5)) 94 | W = np.linalg.inv(W.T) 95 | W = W[:, :n_components] 96 | return W, -mu 97 | 98 | 99 | def save_whiten(path, kernel, bias): 100 | whiten = { 101 | 'kernel': kernel, 102 | 'bias': bias 103 | } 104 | with open(path, 'wb') as f: 105 | pickle.dump(whiten, f) 106 | return path 107 | 108 | 109 | def load_whiten(path): 110 | with open(path, 'rb') as f: 111 | whiten = pickle.load(f) 112 | kernel = whiten['kernel'] 113 | bias = whiten['bias'] 114 | return kernel, bias 115 | 116 | 117 | def transform_and_normalize(vecs, kernel, bias): 118 | """应用变换,然后标准化 119 | """ 120 | if not (kernel is None or bias is None): 121 | vecs = (vecs + bias).dot(kernel) 122 | return normalize(vecs) 123 | 124 | 125 | def normalize(vecs): 126 | """标准化 127 | """ 128 | return vecs / (vecs**2).sum(axis=1, keepdims=True)**0.5 129 | -------------------------------------------------------------------------------- /halite-banner.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zspo/NLP-Space/1c9891be0f44cf618d5b7fc1be2b03e12ce7fc24/halite-banner.gif -------------------------------------------------------------------------------- /inference/onnxruntime_cpp/bert_onnx_cpp_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #define USE_CPU // Chnage USE_CPU to USE_CUDA 8 | 9 | #ifdef USE_CUDA 10 | #include "cuda_provider_factory.h" 11 | #endif // CUDA GPU Enabled 12 | 13 | // export LD_LIBRARY_PATH=${}/onnxruntime-linux-x64-1.9.0/lib:$LD_LIBRARY_PATH 14 | // export LD_LIBRARY_PATH=${}/onnxruntime-linux-x64-gpu-1.9.0/lib:$LD_LIBRARY_PATH 15 | // g++ a.cpp -o a ${}/onnxruntime-linux-x64-1.9.0/lib/libonnxruntime.so.1.9.0 -I ${}/onnxruntime-linux-x64-1.9.0/include/ -std=c++11 16 | // g++ a.cpp -o a ${}/onnxruntime-linux-x64-gpu-1.9.0/lib/libonnxruntime.so.1.9.0 -I ${}/onnxruntime-linux-x64-gpu-1.9.0/include/ -std=c++11 17 | 18 | 19 | int main() { 20 | int round = 1000; 21 | std::cout << round << std::endl; 22 | Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "test"); 23 | Ort::SessionOptions session_options; 24 | // session_options.SetIntraOpNumThreads(1); 25 | // session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED); 26 | 27 | #ifdef USE_CUDA 28 | Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); 29 | #endif // CUDA GPU Enabled 30 | 31 | const char* model_path = "./mybert.onnx"; 32 | Ort::Session session(env, model_path, session_options); 33 | 34 | //// print model input layer (node names, types, shape etc.) 
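    // One way to do what the comment above describes, assuming the ONNX Runtime 1.x
    // C++ API already used in this file (a sketch; names and shapes depend on the model):
    {
        Ort::AllocatorWithDefaultOptions meta_allocator;
        for (size_t i = 0; i < session.GetInputCount(); ++i) {
            char* in_name = session.GetInputName(i, meta_allocator);
            Ort::TypeInfo type_info = session.GetInputTypeInfo(i);
            auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
            std::vector<int64_t> in_shape = tensor_info.GetShape();  // -1 marks a dynamic axis
            std::cout << "input " << i << ": " << in_name << ", rank " << in_shape.size() << std::endl;
        }
    }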
35 | Ort::AllocatorWithDefaultOptions allocator; 36 | char* output_name = session.GetOutputName(0, allocator); 37 | std::cout << output_name << std::endl; 38 | 39 | std::vector input_node_names = {"input_ids", "token_type_ids", "attention_mask"}; 40 | std::vector output_node_names = {"logits"}; 41 | 42 | // input_ids 43 | std::vector input_ids_dims = {1, 82}; 44 | size_t input_ids_size = 1 * 82; 45 | auto memory_info_1 = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); 46 | // std::vector input_ids_value = {101, 1037, 3899, 2003, 2770, 2006, 3509, 102}; 47 | // std::vector input_ids_value = {101 ,1037 ,3899 ,2003 ,2770 ,2006 ,3509 ,102 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0}; 48 | std::vector input_ids_value = {101, 11724, 8762, 12126, 8168, 150, 8179, 10006, 10600, 10168, 10614, 9738, 9107, 8847, 9479, 11839, 8521, 8361, 10168, 11014, 8217, 9568, 9116, 8809, 9470, 12183, 8877, 9145, 11233, 9428,8134, 11104, 12729, 8913, 11057, 9202, 9374, 8139, 9392, 8154,8231, 8606, 12126, 8168, 150, 8179, 10006, 10600, 8346, 8998,9019, 11685, 8797, 9749, 8675, 10447, 8328, 11399, 9796, 11588,8180, 10091, 9786, 8165, 11399, 10537, 10367, 10242, 8178, 10484,12619, 12465, 10361, 8178, 8343, 9531, 8171, 12280, 8317, 9194,8736, 102}; 49 | Ort::Value input_ids = Ort::Value::CreateTensor(memory_info_1, input_ids_value.data(), input_ids_size, input_ids_dims.data(), 2); 50 | assert(input_ids.IsTensor()); 51 | // token_type_ids 52 | std::vector token_type_ids_dims = {1, 82}; 53 | size_t token_type_ids_size = 1 * 82; 54 | auto memory_info_2 = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); 55 | // std::vector token_type_ids_value = {0, 0, 0, 0, 0, 0, 0, 0}; 56 | std::vector token_type_ids_value; 57 | for (int i = 0; i < 82; ++ i) { 58 | token_type_ids_value.push_back(0); 59 | } 60 | Ort::Value token_type_ids = Ort::Value::CreateTensor(memory_info_2, token_type_ids_value.data(), token_type_ids_size, token_type_ids_dims.data(), 2); 61 | assert(token_type_ids.IsTensor()); 62 | // attention_mask 63 | std::vector attention_mask_dims = {1, 82}; 64 | size_t attention_mask_size = 1 * 82; 65 | auto memory_info_3 = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); 66 | // std::vector attention_mask_value = {1, 1, 1, 1, 1, 1, 1, 1}; 67 | std::vector attention_mask_value; 68 | for (int i = 0; i < 82; ++ i) { 69 | attention_mask_value.push_back(1); 70 | } 71 | Ort::Value attention_mask = Ort::Value::CreateTensor(memory_info_3, attention_mask_value.data(), attention_mask_size, attention_mask_dims.data(), 2); 72 | assert(attention_mask.IsTensor()); 73 | 74 | std::vector ort_inputs; 75 | ort_inputs.push_back(std::move(input_ids)); 76 | ort_inputs.push_back(std::move(token_type_ids)); 77 | ort_inputs.push_back(std::move(attention_mask)); 78 | 79 | // test time 80 | auto begin = 
std::chrono::high_resolution_clock::now(); 81 | for (int i = 0; i < round; ++ i) { 82 | session.Run(Ort::RunOptions{nullptr}, input_node_names.data(), ort_inputs.data(), ort_inputs.size(), output_node_names.data(), 1); 83 | } 84 | auto end = std::chrono::high_resolution_clock::now(); 85 | auto elapsed = std::chrono::duration_cast(end - begin); 86 | printf("time cost: %.3f seconds\n", elapsed.count() * 1e-9); 87 | // auto output_tensors = session.Run(Ort::RunOptions{nullptr}, input_node_names.data(), ort_inputs.data(), ort_inputs.size(), output_node_names.data(), 2); 88 | 89 | // Get pointer to output tensor float values 90 | // auto type_info = output_tensors[1].GetTensorTypeAndShapeInfo(); 91 | // for (auto x: type_info.GetShape()) 92 | // std::cout << "shape " << x << std::endl; 93 | // std::cout << "len " << type_info.GetElementCount() << std::endl; 94 | // float* sequence = output_tensors[0].GetTensorMutableData(); 95 | // float* pooled = output_tensors[1].GetTensorMutableData(); 96 | // for (size_t i = 0; i != type_info.GetElementCount(); ++ i) { 97 | // std::cout << pooled[i] << " "; 98 | // } 99 | // std::cout << pooled[0] << std::endl; 100 | 101 | 102 | return 0; 103 | } -------------------------------------------------------------------------------- /inference/onnxruntime_cpp/bert_onxx_test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import sys\n", 11 | "import pickle\n", 12 | "import time\n", 13 | "import numpy as np\n", 14 | "import pandas as pd\n", 15 | "import random\n", 16 | "\n", 17 | "import torch\n", 18 | "import torch.nn as nn\n", 19 | "import torch.nn.functional as F\n", 20 | "from torch.utils.data import Dataset,DataLoader\n", 21 | "import transformers\n", 22 | "from transformers import BertPreTrainedModel,BertModel,BertForSequenceClassification,BertTokenizer" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "model_path='/Users/zhangsongpo/Downloads/bert-base-chinese'\n", 32 | "max_length = 256" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 3, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "tokenizer = BertTokenizer.from_pretrained(model_path)" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 4, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "class MyBert(nn.Module):\n", 51 | " def __init__(self, num_labels):\n", 52 | " super().__init__()\n", 53 | " self.num_labels = num_labels\n", 54 | "\n", 55 | " self.bert = BertModel.from_pretrained(model_path)\n", 56 | " classifier_dropout = 0.2\n", 57 | " self.dropout = nn.Dropout(classifier_dropout)\n", 58 | " self.classifier = nn.Linear(768, num_labels)\n", 59 | "\n", 60 | " \n", 61 | " def forward(\n", 62 | " self,\n", 63 | " input_ids=None,\n", 64 | " attention_mask=None,\n", 65 | " token_type_ids=None,\n", 66 | " position_ids=None,\n", 67 | " head_mask=None,\n", 68 | " inputs_embeds=None,\n", 69 | " output_attentions=None,\n", 70 | " output_hidden_states=None,\n", 71 | " ):\n", 72 | " outputs = self.bert(\n", 73 | " input_ids,\n", 74 | " attention_mask=attention_mask,\n", 75 | " token_type_ids=token_type_ids,\n", 76 | " position_ids=position_ids,\n", 77 | " head_mask=head_mask,\n", 78 | " inputs_embeds=inputs_embeds,\n", 79 | " 
output_attentions=output_attentions,\n", 80 | " output_hidden_states=output_hidden_states,\n", 81 | " )\n", 82 | "\n", 83 | " pooled_output = outputs[1]\n", 84 | "\n", 85 | " pooled_output = self.dropout(pooled_output)\n", 86 | " logits = self.classifier(pooled_output)\n", 87 | "\n", 88 | " return logits" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 5, 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "name": "stderr", 98 | "output_type": "stream", 99 | "text": [ 100 | "Some weights of the model checkpoint at /Users/zhangsongpo/Downloads/bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']\n", 101 | "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", 102 | "- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" 103 | ] 104 | } 105 | ], 106 | "source": [ 107 | "my_bert = MyBert(num_labels=2)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 6, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "tmp_text = 'guideline hand hygiene health care setting recommendation healthcare infection control practice advisory committee hicpac shea ape idsa hand hygiene task force prepared john boyce md didier pittet md hospital saint raphael new haven connecticut university geneva geneva switzerland material report originate national center infectious disease james hughes md director division healthcare quality promotion steve solomon md acting director summary guideline hand hygiene health care setting health care worker hcw review data regarding handwash hand antisepsi health care setting addition specific recommendation promote improve hand hygiene practice reduce transmission pathogenic microorganism patient personnel health care setting report review study publish cdc guideline garner js favero cdc guideline handwash hospital environmental control infect control ape guideline larson el ape guidelines committee ape guideline handwash hand antisepsi health care setting infect control issue depth review hand hygiene practice hcw level adherence personnel recommend handwash practice factor adverse affecting adherence new study vivo efficacy alcohol base hand rub low incidence dermatitis associate use review recent study demonstrate value multidisciplinary hand hygiene promotion program potential role alcohol base hand rub improve hand hygiene practice summarize recommendation concerning related issue e use surgical hand antiseptic hand lotion cream wearing artificial fingernail part review scientific data regarding hand hygiene guideline hand hygiene health care setting recommendation healthcare infection control practice advisory committee hicpac shea ape idsa hand hygiene task force prepared john boyce md didier pittet md hospital saint raphael new haven connecticut university geneva geneva switzerland material report originate national center infectious disease james hughes md director 
division healthcare quality promotion steve'\n", 117 | "sample_text = []\n", 118 | "for _ in range(100):\n", 119 | " start_index = random.randint(0,200)\n", 120 | " text_len = random.randint(45, 500)\n", 121 | " sample_text.append(tmp_text[start_index:(start_index+text_len)])" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 30, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "inputs = tokenizer(text=tmp_text[:256],\n", 131 | " return_tensors=\"pt\",\n", 132 | " padding=True,\n", 133 | " truncation=True,\n", 134 | " max_length=max_length\n", 135 | " )" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 33, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "data": { 145 | "text/plain": [ 146 | "tensor([[ 101, 11724, 8762, 12126, 8168, 150, 8179, 10006, 10600, 10168,\n", 147 | " 10614, 9738, 9107, 8847, 9479, 11839, 8521, 8361, 10168, 11014,\n", 148 | " 8217, 9568, 9116, 8809, 9470, 12183, 8877, 9145, 11233, 9428,\n", 149 | " 8134, 11104, 12729, 8913, 11057, 9202, 9374, 8139, 9392, 8154,\n", 150 | " 8231, 8606, 12126, 8168, 150, 8179, 10006, 10600, 8346, 8998,\n", 151 | " 9019, 11685, 8797, 9749, 8675, 10447, 8328, 11399, 9796, 11588,\n", 152 | " 8180, 10091, 9786, 8165, 11399, 10537, 10367, 10242, 8178, 10484,\n", 153 | " 12619, 12465, 10361, 8178, 8343, 9531, 8171, 12280, 8317, 9194,\n", 154 | " 8736, 102]])" 155 | ] 156 | }, 157 | "execution_count": 33, 158 | "metadata": {}, 159 | "output_type": "execute_result" 160 | } 161 | ], 162 | "source": [ 163 | "inputs['input_ids']" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 26, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "name": "stdout", 173 | "output_type": "stream", 174 | "text": [ 175 | "0.29541802406311035\n", 176 | "CPU times: user 249 ms, sys: 53.6 ms, total: 303 ms\n", 177 | "Wall time: 296 ms\n" 178 | ] 179 | } 180 | ], 181 | "source": [ 182 | "%%time\n", 183 | "s0 = time.time()\n", 184 | "res = my_bert(**inputs)\n", 185 | "print(time.time() - s0)" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 27, 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "ename": "KeyboardInterrupt", 195 | "evalue": "", 196 | "output_type": "error", 197 | "traceback": [ 198 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 199 | "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 200 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n", 201 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 725\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 726\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 727\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 728\u001b[0m for hook in 
itertools.chain(\n\u001b[1;32m 729\u001b[0m \u001b[0m_global_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 202 | "\u001b[0;32m\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, output_attentions, output_hidden_states)\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0minputs_embeds\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minputs_embeds\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0moutput_attentions\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_attentions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 31\u001b[0;31m \u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 32\u001b[0m )\n\u001b[1;32m 33\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 203 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 725\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 726\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 727\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 728\u001b[0m for hook in itertools.chain(\n\u001b[1;32m 729\u001b[0m \u001b[0m_global_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 204 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/transformers/models/bert/modeling_bert.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 1007\u001b[0m \u001b[0moutput_attentions\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_attentions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1008\u001b[0m \u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1009\u001b[0;31m \u001b[0mreturn_dict\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreturn_dict\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1010\u001b[0m )\n\u001b[1;32m 1011\u001b[0m \u001b[0msequence_output\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mencoder_outputs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 205 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py\u001b[0m in 
\u001b[0;36m_call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 725\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 726\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 727\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 728\u001b[0m for hook in itertools.chain(\n\u001b[1;32m 729\u001b[0m \u001b[0m_global_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 206 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/transformers/models/bert/modeling_bert.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 590\u001b[0m \u001b[0mencoder_attention_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 591\u001b[0m \u001b[0mpast_key_value\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 592\u001b[0;31m \u001b[0moutput_attentions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 593\u001b[0m )\n\u001b[1;32m 594\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 207 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 725\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 726\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 727\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 728\u001b[0m for hook in itertools.chain(\n\u001b[1;32m 729\u001b[0m \u001b[0m_global_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 208 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/transformers/models/bert/modeling_bert.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)\u001b[0m\n\u001b[1;32m 512\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 513\u001b[0m layer_output = 
apply_chunking_to_forward(\n\u001b[0;32m--> 514\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfeed_forward_chunk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchunk_size_feed_forward\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseq_len_dim\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattention_output\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 515\u001b[0m )\n\u001b[1;32m 516\u001b[0m \u001b[0moutputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mlayer_output\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0moutputs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 209 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/transformers/modeling_utils.py\u001b[0m in \u001b[0;36mapply_chunking_to_forward\u001b[0;34m(forward_fn, chunk_size, chunk_dim, *input_tensors)\u001b[0m\n\u001b[1;32m 2359\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput_chunks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdim\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mchunk_dim\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2360\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2361\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mforward_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput_tensors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 210 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/transformers/models/bert/modeling_bert.py\u001b[0m in \u001b[0;36mfeed_forward_chunk\u001b[0;34m(self, attention_output)\u001b[0m\n\u001b[1;32m 524\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mfeed_forward_chunk\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattention_output\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 525\u001b[0m \u001b[0mintermediate_output\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mintermediate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mattention_output\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 526\u001b[0;31m \u001b[0mlayer_output\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutput\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mintermediate_output\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattention_output\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 527\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mlayer_output\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 528\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 211 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 725\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 726\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 727\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 728\u001b[0m for hook in itertools.chain(\n\u001b[1;32m 729\u001b[0m \u001b[0m_global_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 212 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/transformers/models/bert/modeling_bert.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, hidden_states, input_tensor)\u001b[0m\n\u001b[1;32m 437\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 438\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhidden_states\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput_tensor\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 439\u001b[0;31m \u001b[0mhidden_states\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdense\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhidden_states\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 440\u001b[0m \u001b[0mhidden_states\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdropout\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhidden_states\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 441\u001b[0m \u001b[0mhidden_states\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mLayerNorm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhidden_states\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0minput_tensor\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 213 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 725\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 726\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 727\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 728\u001b[0m for hook in itertools.chain(\n\u001b[1;32m 729\u001b[0m \u001b[0m_global_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 214 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/torch/nn/modules/linear.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 91\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 92\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0minput\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mTensor\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mTensor\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 93\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mF\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlinear\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mweight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbias\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 94\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 95\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mextra_repr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 215 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/torch/nn/functional.py\u001b[0m in \u001b[0;36mlinear\u001b[0;34m(input, weight, bias)\u001b[0m\n\u001b[1;32m 1690\u001b[0m \u001b[0mret\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maddmm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbias\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mweight\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mt\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1691\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1692\u001b[0;31m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmatmul\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mweight\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mt\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1693\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mbias\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1694\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mbias\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 216 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: " 217 | ] 218 | } 219 | ], 220 | "source": [ 221 | "%%time\n", 222 | "s0 = time.time()\n", 223 | "for _ in sample_text:\n", 224 | " res = my_bert(**inputs)\n", 225 | "all_time = time.time() - s0\n", 226 | "print(all_time, all_time / len(sample_text))" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 19, 232 | "metadata": {}, 233 | "outputs": [ 234 | { 235 | "data": { 236 | "text/plain": [ 237 | "tensor([[0.6906, 0.7076]], grad_fn=)" 238 | ] 239 | }, 240 | "execution_count": 19, 241 | "metadata": {}, 242 | "output_type": "execute_result" 243 | } 244 | ], 245 | "source": [] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 9, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "output_names = ['logits']\n", 254 | "dynamic_axes = {'input_ids': [0, 1],'attention_mask': [0, 1],'token_type_ids': [0, 1],}" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 10, 260 | "metadata": {}, 261 | "outputs": [ 262 | { 263 | "name": "stderr", 264 | "output_type": "stream", 265 | "text": [ 266 | "/Users/zhangsongpo/miniconda3/lib/python3.6/site-packages/torch/onnx/utils.py:1112: UserWarning: No names were found for specified dynamic axes of provided 
input.Automatically generated names will be applied to each dynamic axes of input input_ids\n", 267 | " 'Automatically generated names will be applied to each dynamic axes of input {}'.format(key))\n", 268 | "/Users/zhangsongpo/miniconda3/lib/python3.6/site-packages/torch/onnx/utils.py:1112: UserWarning: No names were found for specified dynamic axes of provided input.Automatically generated names will be applied to each dynamic axes of input attention_mask\n", 269 | " 'Automatically generated names will be applied to each dynamic axes of input {}'.format(key))\n", 270 | "/Users/zhangsongpo/miniconda3/lib/python3.6/site-packages/torch/onnx/utils.py:1112: UserWarning: No names were found for specified dynamic axes of provided input.Automatically generated names will be applied to each dynamic axes of input token_type_ids\n", 271 | " 'Automatically generated names will be applied to each dynamic axes of input {}'.format(key))\n", 272 | "/Users/zhangsongpo/miniconda3/lib/python3.6/site-packages/transformers/models/bert/modeling_bert.py:200: TracerWarning: Converting a tensor to a Python index might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", 273 | " position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]\n" 274 | ] 275 | } 276 | ], 277 | "source": [ 278 | "torch.onnx.export(my_bert,\n", 279 | " f='./mybert.onnx',\n", 280 | " args=tuple(inputs.values()),\n", 281 | " input_names=list(inputs),\n", 282 | " output_names=output_names,\n", 283 | " dynamic_axes=dynamic_axes,\n", 284 | " opset_version=10)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 11, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "import onnx\n", 294 | "\n", 295 | "onnx_model = onnx.load('./mybert.onnx')\n", 296 | "onnx.checker.check_model(onnx_model)" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 34, 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "import onnxruntime" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 35, 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "onxx_model_path = r'./mybert.onnx'\n", 336 | "options = onnxruntime.SessionOptions()\n", 337 | "session = onnxruntime.InferenceSession(onxx_model_path, options)" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 36, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "inputs = tokenizer(text=tmp_text[:256],\n", 347 | " return_tensors=\"pt\",\n", 348 | " padding=True,\n", 349 | " truncation=True,\n", 350 | " max_length=max_length\n", 351 | " )\n", 352 | "inputs_onnx = {k: v.cpu().detach().numpy() for k, v in inputs.items()}" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 43, 358 | "metadata": { 359 | "scrolled": true 360 | }, 361 | "outputs": [ 362 | { 363 | "name": "stdout", 364 | "output_type": "stream", 365 | "text": [ 366 | 
"0.06377792358398438\n", 367 | "CPU times: user 241 ms, sys: 2.97 ms, total: 244 ms\n", 368 | "Wall time: 63.9 ms\n" 369 | ] 370 | } 371 | ], 372 | "source": [ 373 | "%%time\n", 374 | "s0 = time.time()\n", 375 | "res = session.run(None, inputs_onnx)\n", 376 | "print(time.time() - s0)" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 42, 382 | "metadata": {}, 383 | "outputs": [ 384 | { 385 | "ename": "KeyboardInterrupt", 386 | "evalue": "", 387 | "output_type": "error", 388 | "traceback": [ 389 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 390 | "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 391 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n", 392 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/onnxruntime/capi/onnxruntime_inference_collection.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(self, output_names, input_feed, run_options)\u001b[0m\n\u001b[1;32m 186\u001b[0m \u001b[0moutput_names\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0moutput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_outputs_meta\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 187\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 188\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sess\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput_names\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput_feed\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrun_options\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 189\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mC\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mEPFail\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 190\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_enable_fallback\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 393 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: " 394 | ] 395 | } 396 | ], 397 | "source": [ 398 | "%%time\n", 399 | "s0 = time.time()\n", 400 | "for _ in sample_text:\n", 401 | " res = session.run(None, inputs_onnx)\n", 402 | "all_time = time.time() - s0\n", 403 | "print(all_time, all_time / len(sample_text))" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "metadata": {}, 410 | "outputs": [], 411 | "source": [] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": 24, 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [ 419 | "from os import environ\n", 420 | "from psutil import cpu_count\n", 421 | "\n", 422 | "# Constants from the performance optimization available in onnxruntime\n", 423 | "# It needs to be done before importing onnxruntime\n", 424 | "environ[\"OMP_NUM_THREADS\"] = str(cpu_count(logical=True)) # OMP 的线程数\n", 425 | "environ[\"OMP_WAIT_POLICY\"] = 'ACTIVE'\n", 426 | "\n", 427 | "from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions, get_all_providers\n", 428 | "\n", 429 | "\n", 430 | "def create_model_for_provider(model_path: str, provider: str) -> InferenceSession: \n", 431 | " \n", 432 | " assert provider in get_all_providers(), f\"provider {provider} not found, 
{get_all_providers()}\"\n", 433 | "\n", 434 | " # Few properties that might have an impact on performances (provided by MS)\n", 435 | " options = SessionOptions()\n", 436 | " options.intra_op_num_threads = 1\n", 437 | " options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL\n", 438 | "\n", 439 | " # Load the model as a graph and prepare the CPU backend \n", 440 | " session = InferenceSession(model_path, options, providers=[provider])\n", 441 | " session.disable_fallback()\n", 442 | " \n", 443 | " return session" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": 25, 449 | "metadata": {}, 450 | "outputs": [], 451 | "source": [ 452 | "session_cpu = create_model_for_provider(onxx_model_path, \"CPUExecutionProvider\") # 使用 优化过的 onnx" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": 27, 458 | "metadata": {}, 459 | "outputs": [ 460 | { 461 | "name": "stdout", 462 | "output_type": "stream", 463 | "text": [ 464 | "0.17477822303771973\n", 465 | "CPU times: user 171 ms, sys: 4.02 ms, total: 175 ms\n", 466 | "Wall time: 175 ms\n" 467 | ] 468 | } 469 | ], 470 | "source": [ 471 | "%%time\n", 472 | "s0 = time.time()\n", 473 | "res = session_cpu.run(None, inputs_onnx)\n", 474 | "print(time.time() - s0)" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": 28, 480 | "metadata": {}, 481 | "outputs": [ 482 | { 483 | "name": "stdout", 484 | "output_type": "stream", 485 | "text": [ 486 | "17.469280004501343 0.17469280004501342\n", 487 | "CPU times: user 16.9 s, sys: 209 ms, total: 17.1 s\n", 488 | "Wall time: 17.5 s\n" 489 | ] 490 | } 491 | ], 492 | "source": [ 493 | "%%time\n", 494 | "s0 = time.time()\n", 495 | "for _ in sample_text:\n", 496 | " res = session_cpu.run(None, inputs_onnx)\n", 497 | "all_time = time.time() - s0\n", 498 | "print(all_time, all_time / len(sample_text))" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": null, 504 | "metadata": {}, 505 | "outputs": [], 506 | "source": [] 507 | } 508 | ], 509 | "metadata": { 510 | "kernelspec": { 511 | "display_name": "Python 3", 512 | "language": "python", 513 | "name": "python3" 514 | }, 515 | "language_info": { 516 | "codemirror_mode": { 517 | "name": "ipython", 518 | "version": 3 519 | }, 520 | "file_extension": ".py", 521 | "mimetype": "text/x-python", 522 | "name": "python", 523 | "nbconvert_exporter": "python", 524 | "pygments_lexer": "ipython3", 525 | "version": "3.6.5" 526 | } 527 | }, 528 | "nbformat": 4, 529 | "nbformat_minor": 4 530 | } 531 | -------------------------------------------------------------------------------- /notes/CS224N-2019/CS224N-01-Introduction-and-Word-Vectors.md: -------------------------------------------------------------------------------- 1 | ## CS224N-01-Introduction and Word Vectors 2 | 3 | **[CS224N Home](https://web.stanford.edu/class/archive/cs/cs224n/cs224n.1194/)** 【Stanford NLP】 4 | **[Video](https://www.bilibili.com/video/BV1r4411f7td)** 【Bilibili】 5 | 6 | 第一课的内容主要包含两个方面,一个是介绍如何表达词的含义,从传统的词表示方法引入分布式语义表达,引出word vector,第二个方面是讲解了word2vec的原理,从word2vec的损失函数和计算推导方面剖析了word vector的原理。尤其是最后的损失函数推导计算,从公式方面解释了优化词向量的内涵。 7 | 8 | ### 1、Human language and word meaning 9 | 第一部分从传统的词表示引入到word vector,传统的表示方式是独热编码,由分布式语义产出词向量。 10 | * XKCD cartoon 11 | 12 | 13 | #### Definition: **meaning** (Webster dictionary) 14 | #### Common solution: **WordNet** 15 | #### Problems with resources like WordNet: 16 | * Great as a resource but missing nuance, 细微差别 17 | * missing new meaning of words, 单词含义 18 | * 
Subjective, 主观的 19 | * Requires human labor to create and adapt, 需要人工 20 | * Can't compute accurate word similarity, 无法计算相似度 21 | 22 | #### Representing words as discrete symbols 23 | traditional NLP,a localist representation 24 | **Means one 1, the rest 0s** 25 | 独热编码(ont-hot) 26 | ``` 27 | motel=[0 0 0 0 1 0] 28 | hotel=[0 1 0 0 0 0] 29 | ``` 30 | 但是独热编码的结果是,这些词向量都是正交的,并且不能表达语义相似度。orthogonal(正交)、no natural notion of similarity 31 | 解决方案就是`learn to encode similarity in the vectors themselves` 32 | 33 | #### Representing words by their context 34 | 35 | 36 | **Distributional semantics**: A word's meaning is given by the words that frequently appear close-by 37 | **Word vectors** (word embeddings): dense vector 38 | 39 | #### Word meaning as as neural word vector - visualization 40 | 41 | 42 | ### 2、Word2vec: Overview 43 | **Word2vec (Mikolov et al. 2013) is a framework for learning word vectors.** 44 | **Idea:** 45 | * a large corpus of text,首先有一个语料库 46 | * 每个词给一个初始化的vector 47 | * 遍历text中的每个位置,包含了center word [c]和context words [o] 48 | * 根据c和o的词向量的相似度来计算,给出c得出o的似然 49 | * 调整优化word vectors来最小化似然 50 | 51 | 图示: 52 | 53 | 计算$P(w_{t+j}|w_t)$ 54 | #### Word2vec: objective function 55 | 对于每一个text的位置$t=1,...,T$,给出中心词$w_j$,预测窗口为m内的上下文。 56 | 其似然值为: 57 | $$Likelihood=L(\theta)=\prod_{t=1}^T\prod_{-m\le{j}\le{m} \atop{j\ne0}}P(w_{t+j}|w_t)$$ $\theta$ is all variables to be optimized. 58 | 损失函数$J(\theta)$是(平均)负的对数似然,**negative log likelihood**: 59 | $$J(\theta)=-\frac{1}{T}logJ(\theta)=-\frac{1}{T}\sum_{t=1}^T\sum_{-m\le{j}\le{m} \atop{j\ne0}}P(w_{t+j}|w_t)$$**Minimizing objective function <==> Maxmizing predictive accuracy** 60 | 61 | 想要最小化损失函数,首先要考虑怎么计算$P(w_{t+j}|w_t)$ 62 | 对于每个词给定两个词向量 63 | * $v_w$,当w为中心词时 64 | * $u_w$,当w为上下文时 65 | 66 | 对于每个中心词c和上下文词o,有: 67 | $$P(o|c)=\frac{exp(u_o^Tv_c)}{\sum_{w\in{V}}exp(u_w^Tv_c)}$$分子上的向量点乘表达的是两个词的相似度,分母是中心词和所有词的相似度(**注意:这里是所有词,后续优化**) 68 | 69 | **softmax** function:为什么成为softmax 70 | 71 | 72 | #### To train the model: Compute all vector gradients 73 | $\theta$ represents all model parameters, in one long vector 74 | Remember: every word has two vectors 75 | 76 | 上述推导中,$P(x|c)$是给定中心词 $c$,模型所给出的为 $x$ 的概率。 77 | 78 | 这个推导结果很有趣!等号左边是给出中心词 $c$ 其上下文 $o$ 的对数概率的偏导,是我们要找的一个下降对快的一个方向,多维空间上的一个斜坡。等号右边的含义是,我们观察到的上下文的词 $o$ ,从中减去我们的模型认为的上下文的样子,后面一部分是模型的期望。实际的上下文与模型认为的上下文,这两者之间的差异决定了下降的方向。 -------------------------------------------------------------------------------- /notes/CS224N-2019/img/2020-06-12-12-42-12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zspo/NLP-Space/1c9891be0f44cf618d5b7fc1be2b03e12ce7fc24/notes/CS224N-2019/img/2020-06-12-12-42-12.png -------------------------------------------------------------------------------- /notes/CS224N-2019/img/2020-06-12-13-22-31.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zspo/NLP-Space/1c9891be0f44cf618d5b7fc1be2b03e12ce7fc24/notes/CS224N-2019/img/2020-06-12-13-22-31.png -------------------------------------------------------------------------------- /notes/CS224N-2019/img/2020-06-12-13-24-36.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zspo/NLP-Space/1c9891be0f44cf618d5b7fc1be2b03e12ce7fc24/notes/CS224N-2019/img/2020-06-12-13-24-36.png -------------------------------------------------------------------------------- /notes/CS224N-2019/img/2020-06-12-15-11-11.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zspo/NLP-Space/1c9891be0f44cf618d5b7fc1be2b03e12ce7fc24/notes/CS224N-2019/img/2020-06-12-15-11-11.png -------------------------------------------------------------------------------- /notes/CS224N-2019/img/2020-06-12-15-37-09.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zspo/NLP-Space/1c9891be0f44cf618d5b7fc1be2b03e12ce7fc24/notes/CS224N-2019/img/2020-06-12-15-37-09.png -------------------------------------------------------------------------------- /notes/CS224N-2019/img/2020-06-12-15-42-22.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zspo/NLP-Space/1c9891be0f44cf618d5b7fc1be2b03e12ce7fc24/notes/CS224N-2019/img/2020-06-12-15-42-22.png -------------------------------------------------------------------------------- /notes/CS224N-2019/img/微信图片_20200612171355.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zspo/NLP-Space/1c9891be0f44cf618d5b7fc1be2b03e12ce7fc24/notes/CS224N-2019/img/微信图片_20200612171355.jpg -------------------------------------------------------------------------------- /notes/CS224N-2019/img/微信截图_20200612183129.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zspo/NLP-Space/1c9891be0f44cf618d5b7fc1be2b03e12ce7fc24/notes/CS224N-2019/img/微信截图_20200612183129.png -------------------------------------------------------------------------------- /notes/Word2Vec学习笔记(CS224N笔记及相关论文学习).md: -------------------------------------------------------------------------------- 1 | ***[参考CS224N笔记](https://web.stanford.edu/class/archive/cs/cs224n/cs224n.1194/readings/cs224n-2019-notes01-wordvecs1.pdf) 2 | [The Skip-Gram Model](http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/) 3 | [word2vec paper](https://arxiv.org/pdf/1301.3781.pdf) 4 | [negative sampling paper](http://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf)*** 5 | 6 | @[toc] 7 | ### NLP 8 |   人类语言是独特的传达含义的系统,不同于计算机视觉及其他的机器学习任务。 9 |   NLP领域有着不同难度等级的任务,从语音处理到语义解释等。NLP的目标是设计出算法令计算机“理解”自然语言以解决实际的任务。 10 | - Easy的任务包括:拼写纠正、关键词搜索、同义词查找等; 11 | - Medium的任务包括:信息解析等; 12 | - Hard任务包括:机器翻译、情感分析、指代、问答系统等。 13 | 14 | ### 1、Word Vectors 15 |   英语中估计有13 million单词,他们相互之间并不全是无关的,Feline to cat (猫科动物->猫)、hotel to motel (旅馆->汽车旅馆)等。我们希望用一些向量来编码每个单词,在同一词空间内以点的形式进行表示。直接的方法是构建一个$N(N\le13 million)$维度的空间,这个空间足够将我们的单词进行编码,每个维度可以编码某些我们语言的含义。这些维度可能表示时态、计数、性别等。 16 |   独热编码是直接的编码方法,将每个词表示为$\mathbb{R}^{|V|\times1}$向量,该词在固定顺序下的索引处为1,其他位置都为0。如下 17 |
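这里原本是一张展示 one-hot 向量的截图。下面补一个最小的 numpy 草稿(词表与词序均为示例假设),顺带验证下一段提到的正交性:
```
import numpy as np

vocab = ["motel", "hotel", "cat"]                 # 示例词表(假设)
word2id = {w: i for i, w in enumerate(vocab)}

def one_hot(word):
    vec = np.zeros(len(vocab))
    vec[word2id[word]] = 1.0
    return vec

w_motel, w_hotel = one_hot("motel"), one_hot("hotel")
print(w_motel, w_hotel)     # [1. 0. 0.] [0. 1. 0.]
print(w_motel @ w_hotel)    # 0.0:不同词的 one-hot 向量点积恒为 0
```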
18 |   每个单词是一个完全独立的个体,如上图所示,结果就是这种词表示方式不能表示任何相似之处,他们都是正交的: 19 | 20 | $$(w^{hotel})^Tw^{motel}=(w^{hotel})^Tw^{cat}=0$$ 21 | 22 |   我们可以尝试将$V$维减小,对原来表示空间进行降维,来寻找一个子空间来进行词关系的表示。 23 | ### 2、SVD Based Methods 24 |   奇异值分解方法的做法是,我们首先遍历数据集,通过矩阵$X$存储单词出现的共现次数,然后对$X$进行奇异值分解得出$USV^t$分解。我们可以将$U$的行值可以作为词表中所有词的word embedding。 25 | #### 2.1 Word-Document Matrix 26 |   首先我们可以认为,相关的词总会出现在同一个文档中。譬如,"bank"、"bongs"、"stocks"、"money"等更有可能同时出现,但是"bank"、"octopus"、"banana"等词不可能总是同时出现。我们利用这种共现现象构建一个word-document matrix:X。遍历大量的文档数据集,每当单词$i$和单词$j$同时出现时,我们就在$X_{ij}$位置加1。很明显这将是一个非常大的矩阵($\mathbb{R}^{|V|\times{M}}$),其中$M$是文档的个数。 27 | #### 2.2 Window based Co-occurrence Matrix(基于窗口的共现矩阵) 28 |   矩阵$X$存储着单词的共现次数。这里我们将在一个合适大小的窗口内来统计单词的共现次数。通过下面的例子进行说明,数据集中有三个句子,窗口大小设定为1: 29 | ``` 30 | 1. I enjoy flying. 31 | 2. I like NLP. 32 | 3. I like deep learning. 33 | ``` 34 | 根据窗口为1的设定,统计结果矩阵如下: 35 |
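原矩阵截图在文中无法显示,下面用一个小的统计草稿按窗口为 1 的设定重新构建这个共现矩阵(为简单起见忽略句号):
```
import numpy as np

corpus = ["I enjoy flying".split(),
          "I like NLP".split(),
          "I like deep learning".split()]
vocab = sorted({w for sent in corpus for w in sent})
word2id = {w: i for i, w in enumerate(vocab)}

window = 1
X = np.zeros((len(vocab), len(vocab)), dtype=int)
for sent in corpus:
    for i, w in enumerate(sent):
        for j in range(max(0, i - window), min(len(sent), i + window + 1)):
            if j != i:
                X[word2id[w], word2id[sent[j]]] += 1

print(vocab)
print(X)    # X[i][j] 即词 i 与词 j 在窗口内共现的次数
```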
36 | 37 | #### 2.3 奇异值分解 38 | 通过SVD方法得到word embedding的过程如下: 39 | * 构建$|V|\times{|V|}$的共现矩阵$X$。 40 | * 使用SVD分解得到$X=USV^{T}$。 41 | * 选择$U$的前$k$个维度,得到$k$维的词向量。 42 | * $\frac{\sum_{i=1}^{k}\sigma_i}{\sum_{i=1}^{|V|}\sigma_i}$表示前$k$个维度所保留的方差比例。 43 | 44 | 我们现在对$X$进行SVD处理: 45 | $X=USV^{T}$ 46 |
47 | 选取前$k$个奇异值对应的维度进行降维: 48 |
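下面给一个 numpy 的最小降维草稿,其中 X 用一个很小的示例矩阵代替(实际应使用上文统计出的 $|V|\times{|V|}$ 共现矩阵),k 也只是示例取值:
```
import numpy as np

X = np.array([[0., 2., 1.],      # 示例共现矩阵(仅作演示)
              [2., 0., 1.],
              [1., 1., 0.]])

U, S, Vt = np.linalg.svd(X, full_matrices=False)

k = 2
word_vectors = U[:, :k]            # 每一行即对应单词的 k 维词向量
captured = S[:k].sum() / S.sum()   # 前 k 个奇异值所保留的方差比例
print(word_vectors)
print(captured)
```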
49 | 50 | #### 2.4 SVD方法小结 51 |   以上的两种方法(Word-Document Matrix 和 Window based Co-occurrence Matrix)都比传统的编码形式有着更多的语义信息,但是仍然存在着一些问题: 52 | * 矩阵的维度大小不固定,会随新词的添加而变化,语料库大小也随之变化; 53 | * 矩阵过于稀疏,大部分的单词不会同时出现; 54 | * 矩阵维度太高($\approx10^6\times{10^6}$); 55 | * 训练成本太高($O(mn^2)$); 56 | * 需要加入一些隐含词(不知道这么理解对不对)来解决词频不均衡的问题。 57 | 58 | 针对以上的问题有一些解决方法: 59 | * 忽略一些词,例如"the"、"he"、"has"等; 60 | * 窗口动态,即根据文档中单词之间的距离加权计算共现计数; 61 | * 使用皮尔逊相关系数,Use Pearson correlation and set negative counts to 0 instead of using just raw count. 62 | 63 | ### 3、Iteration Based Methods - Word2vec 64 |   我们尝试一种新的方法,通过构建模型能够迭代学习,最终可以根据给定的上下文来对单词的概率进行编码。这个方法设计出的模型的参数就是词向量。在每次的训练迭代过程中,计算误差,更新参数,最终学习出词向量。这个想法可以追溯到1986年,称之为“反向传播(backpropagating)”[[Rumelhart et al., 1988](#refer)],模型任务越简单,训练速度越快。有一些方法也被尝试过,[[Collobert et al., 2011](#refer)]构建了NLP模型,第一步是将每个词转为向量,对于每种任务(命名实体识别,词性标注等)不仅训练模型参数同时训练向量,在有不错的效果的同时也得到了好的词向量。 65 |   Word2vec是2013年Mikolov提出的简单有效的方法[[Mikolov et al., 2013](#refer)](这种方法依赖于语言学中一个非常重要的假设,即分布相似,即相似的词有相似的语境。)Word2vec是一个算法包: 66 | * 算法部分:continuous bag-of-words (CBOW) and skip-gram. CBOW是通过上下文预测中心词,Skip-gram相反,给定中心词预测上下文。 67 | * 模型训练: negative sampling and hierarchical softmax. 负采样是采样出一定比例的负例,层次softmax是通过一种有效的霍夫曼树结构来计算词的概率。 68 | 69 | #### 3.1 语言模型(unigrams,bigrams,trigrams等) 70 |
"The cat jumped over the puddle."
71 |   以上面的句子为例。 72 | 73 |   首先,我们需要构建一个模型来表示一个单词序列的概率。一个好的语言模型会给有效的好句子一个高的概率值,但是句子"stock boil fish is toy"的概率会很低,因为这不是一个正常有意义的句子。用数学来表达,当给定一个有$n$个单词的句子时,其概率为: 74 | 75 | $$P(w_1,w_2,...,w_n)$$ 76 | 77 | 我们采用unigrams(一元模型),即每个单词都是独立的,则: 78 | $$P(w_1,w_2,...,w_n)=\prod_{i=1}^{n}P(w_i)$$ 79 | 80 |   这个表达式有个明显的问题就是,如果有一组句子,虽然他们有着同样的单词,有的句子有意义,有的句子是乱序无意义的,但是他们的概率确实一样的。因为我们的句子都是有序的,一个单词的概率很大程度上和上一个单词有关系。我们需要基于相邻的两个单词的概率来决定句子的概率,即bigrams(二元模型): 81 | $$P(w_1,w_2,...,w_n)=\prod_{i=2}^{n}P(w_i|w_{i-1})$$ 82 | 83 | 即使这样,我们考虑的也是两两相邻的单词,而不是整个句子。 84 | #### 3.2 Continuous Bag of Words Model (CBOW) 85 |   对于上述的例子,我们通过上下文{"The"、"cat"、"over"、"the"、"puddle"}来预测或生成出中心词"jumped",这种方式我们成为Continuous Bag of Words Model (CBOW)。 86 |   对于CBOW模型,首先我们设定已知参数,即将输入句子表示为一个one-hot形式的词向量。输入的one-hot向量表示为$x^{(c)}$,输出表示为$y^{(c)}$,CBOW模型只有一个输出,这个$y$为已知的中心词的one-hot向量。对于每个词,我们通过CBOW都可以学习出两个向量, 87 | * $v$:input vector,当词为上下文时 88 | * $u$:output vector,当词为中心词时 89 | 90 | 首先介绍一些CBOW模型中涉及到的一些参数: 91 | * $w_i$:词表$V$中的第$i$个词 92 | * $\mathcal{V}\in{\mathbb{R}^{n\times{|V|}}}$:input word matrix 93 | * $v_i$:$\mathcal{V}$中的第$i$行,表示的是$w_i$的输入向量 94 | * $\mathcal{U}\in{\mathbb{R}^{|V|\times{n}}}$:output word matrix 95 | * $u_i$:$\mathcal{U}$中的第$i$行,表示的是$w_i$的输出向量 96 | 97 |   我们构建两个矩阵$\mathcal{V}\in{\mathbb{R}^{n\times{|V|}}}$和$\mathcal{U}\in{\mathbb{R}^{|V|\times{n}}}$,其中$n$是我们定义的embedding空间的大小。具体的模型构建步骤如下: 98 | 1. 首先我们根据窗口大小$m$确定我们的输入one-hot词向量:$(x^{(c-m)},...x^{(c-1)},x^{(c+1)},...,x^{(c+m)}\in{\mathbb{R}^{|V|}})$,中心词为$x^{(c)}$ 99 | 2. 得到对应的输入word embedding为$(v_{c-m}=\mathcal{Vx^{(c-m)}},v_{c-m+1}=\mathcal{Vx^{(c-m+1)}},...,v_{c+m}=\mathcal{Vx^{(c+m)}}\in{\mathbb{R}^{n}})$ 100 | 3. 将这些向量平均得到$\hat{v}=\frac{v_{c-m}+v_{c-m+1}+...+v_{c+m}}{2m}\in{\mathbb{R}^{n}}$ 101 | 4. 计算出分数向量$z=\mathcal{U}\hat{v}\in{\mathbb{R}^{|V|}}$,点乘计算的是两个向量的相似度,如果两个词比较接近,那么将会有一个较高的分数 102 | 5. 通过softmax将分数转为概率值,$\hat{y}=softmax(z)\in{\mathbb{R}^{|V|}}$ 103 | 6. 我们希望生成的概率$\hat{y}$来匹配真实的概率$y$,即输出的对应的one-hot向量对应真实的单词 104 | 105 | 如图展示了CBOW模型细节,我们需要学习出两个转换矩阵。 106 |
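原结构图在文中无法显示。按上面 1~6 步的定义,用 numpy 写一个 CBOW 前向计算的最小草稿(维度、窗口和各个下标均为示例假设):
```
import numpy as np

np.random.seed(0)
V_size, n = 10, 8                                  # |V| 与 embedding 维度(假设)
V_mat = np.random.randn(n, V_size) * 0.01          # 输入词矩阵,形状 (n, |V|)
U_mat = np.random.randn(V_size, n) * 0.01          # 输出词矩阵,形状 (|V|, n)

context_ids = [1, 2, 4, 5]                         # 2m 个上下文词下标(假设 m=2)
center_id = 3                                      # 中心词下标(假设)

v_hat = V_mat[:, context_ids].mean(axis=1)         # 步骤2-3:取上下文输入向量并平均
z = U_mat @ v_hat                                  # 步骤4:分数向量 z = U v_hat
y_hat = np.exp(z - z.max())
y_hat /= y_hat.sum()                               # 步骤5:softmax 得到预测分布
loss = -np.log(y_hat[center_id])                   # 步骤6:与 one-hot 真实分布的交叉熵
print(loss)
```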
107 | 108 |   我们需要学习出 $\mathcal{V}$ 和 $\mathcal{U}$ 这两个矩阵,首先确定目标函数。当我们试图让预测的概率分布逼近某个真实分布时,通常会用信息论的方法来度量两个分布之间的距离,我们这里选用交叉熵(cross entropy)$H(\hat{y},y)$来作为目标函数: 109 | $$H(\hat{y},y)=-\sum_{j=1}^{|V|}y_j\log(\hat{y}_j)$$ 110 | 111 | $y$是一个one-hot向量,只有正确中心词对应的位置为1,因此目标函数可简化为: 112 | $$H(\hat{y},y)=-y_j\log(\hat{y}_j)$$ 113 | 114 | 因此我们优化目标为: 115 |
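这里原本引用的是一张公式截图。按上文的记号($u_c$ 为中心词的输出向量,$\hat{v}$ 为上下文平均向量),优化目标的标准写法大致为:
$$minimize\ J=-\log P(u_c|\hat{v})=-u_c^{T}\hat{v}+\log\sum_{j=1}^{|V|}\exp(u_j^{T}\hat{v})$$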
116 | 117 | 我们使用随机梯度下降来更新所有相关的词向量 $u_c$ 和 $v_j$。 118 | #### 3.3 Skip-Gram Model 119 |   Skip-gram是给出中心词"jumped",来预测或生成上下文词 "The", "cat", "over", "the", "puddle"。Skip-gram model大体上和CBOW模型相似,不过我们需要将$x$与$y$互换,即这里输入的one-hot向量是一个,输出向量$y$是多个。我们同样定义两个矩阵 $\mathcal{V}$ 和 $\mathcal{U}$,模型构建步骤如下: 120 | 1. 首先生成中心词输入向量$x\in{\mathbb{R}^{|V|}}$ 121 | 2. 得到中心词的embedding词向量 $v_c=\mathcal{V}x\in{\mathbb{R}^n}$ 122 | 3. 生成分数向量$z=\mathcal{U}v_c$ 123 | 4. 转为概率值 $\hat{y}=softmax(z)$,$\hat{y}_{c-m},...,\hat{y}_{c-1},\hat{y}_{c+1},...,\hat{y}_{c+m}$是每个上下文词的概率值 124 | 5. 目标是让概率分布与真实的接近 125 |
126 | 和CBOW模型一样,我们需要确定目标函数,这里我们引入朴素贝叶斯(条件独立)假设来拆解联合概率,进而求解。 127 |
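缺失的公式按朴素贝叶斯(条件独立)假设展开,标准形式大致为:
$$minimize\ J=-\log\prod_{j=0,j\ne{m}}^{2m}P(w_{c-m+j}|w_c)=-\sum_{j=0,j\ne{m}}^{2m}u_{c-m+j}^{T}v_c+2m\log\sum_{k=1}^{|V|}\exp(u_k^{T}v_c)$$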
128 | 利用这个目标函数,我们可以计算出未知参数的梯度,并在每次迭代时通过随机梯度下降来更新它们。 129 | 注意到: 130 |
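此处的公式按上下文应为目标函数与交叉熵求和的等价关系:
$$J=-\sum_{j=0,j\ne{m}}^{2m}\log P(u_{c-m+j}|v_c)=\sum_{j=0,j\ne{m}}^{2m}H(\hat{y},y_{c-m+j})$$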
131 | 132 | 其中 $H(\hat{y},y_{c-m+j})$ 是概率分布向量 $\hat{y}$ 和one-hot向量 $y_{c-m+j}$ 的交叉熵。 133 | #### 3.4 Negative Sampling 134 |   我们注意到目标函数中的 $|V|$ 的值是非常大的,结果就是每次更新或评估目标函数的时候我们都要花费 $O(|V|)$(计算softmax归一化的时候),一个简单的做法就是对它进行近似估计。 135 |   在每次训练的时候,我们不需要遍历整个词表,只需要采样少数的负样本。我们基于噪声分布 $P_n(w)$ 采样,其采样概率和词频顺序相匹配。 136 |   Negative Sampling见[paper](http://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf)。负采样基于Skip-Gram模型,但实际上优化的是不同的目标。考虑中心词和上下文词对$(w,c)$,如果这个词对来自语料数据集,则其概率记为$P(D=1|w,c)$;相反,如果词对不是来自语料库,则记为$P(D=0|w,c)$。首先,利用sigmoid函数表示概率值: 137 | $$P(D=1|w,c,\theta)=\sigma(v_c^{T}v_w)=\frac{1}{1+e^{-v_c^{T}v_w}}$$ 138 | 139 | 我们现在构建一个新的目标函数,其目标是maximize $P(D=1|w,c,\theta)$ 和 $P(D=0|w,c,\theta)$ 这两个概率,我们利用最大化似然来估计这两个概率分布(我们将$\theta$作为模型的参数,在这里指的是 $\mathcal{V}$ 和 $\mathcal{U}$): 140 |
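缺失的公式沿用上面 $\sigma(v_c^{T}v_w)$ 的记号,大致可写为(其中 $D$ 为语料中观察到的词对集合):
$$\theta=\underset{\theta}{\operatorname{argmax}}\prod_{(w,c)\in{D}}P(D=1|w,c,\theta)\prod_{(w,c)\in{\tilde{D}}}P(D=0|w,c,\theta)=\underset{\theta}{\operatorname{argmax}}\sum_{(w,c)\in{D}}\log\sigma(v_c^{T}v_w)+\sum_{(w,c)\in{\tilde{D}}}\log\sigma(-v_c^{T}v_w)$$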
141 | 142 | 等同于最小化负的对数似然: 143 |
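缺失的公式即上式取负对数:
$$J=-\sum_{(w,c)\in{D}}\log\sigma(v_c^{T}v_w)-\sum_{(w,c)\in{\tilde{D}}}\log\sigma(-v_c^{T}v_w)$$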
144 | 145 | 公式中的$\tilde{D}$是负样本集。 146 | 对于skip-gram模型,给定中心词$c$和上下文词$c-m+j$,其目标函数表示为: 147 |
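缺失的公式的标准写法大致为(其中 $\tilde{u}_k$ 是按 $P_n(w)$ 采样出的 $K$ 个负样本的输出向量):
$$-\log\sigma(u_{c-m+j}^{T}v_c)-\sum_{k=1}^{K}\log\sigma(-\tilde{u}_k^{T}v_c)$$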
148 | 149 | 对于CBOW模型,中心词的输出向量为$u_c$,给定的上下文向量为$\hat{v}=\frac{v_{c-m}+v_{c-m+1}+...+v_{c+m}}{2m}$,目标函数为: 150 |
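对应的公式与上式形式相同,只是把 $v_c$ 换成 $\hat{v}$、把上下文词的输出向量换成 $u_c$:
$$-\log\sigma(u_c^{T}\hat{v})-\sum_{k=1}^{K}\log\sigma(-\tilde{u}_k^{T}\hat{v})$$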
151 | 152 |   现在讨论$P_n(w)$应该是什么。从大量相关讨论来看,对一元模型(unigram)分布取$3/4$次方的效果似乎最优。为什么是$3/4$?看下面的例子: 153 |
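缺失的示例按 CS224N 笔记中的数字大致为:
* is: $0.9^{3/4}=0.92$
* Constitution: $0.09^{3/4}=0.16$
* bombastic: $0.01^{3/4}=0.032$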
154 | 155 | "bombastic"的抽样率变成了3倍,但是"is"只是增大了一点点。"is"是不重要的一类词,其出现的概率本来就很大,不需要对其增加很多采样。 156 | 157 | #### 3.5 Hierarchical Softmax 158 |   Mikolov同样提出了层次softmax来解决归一化softmax的问题。**在实际中,层次softmax对低频词汇有更好的效果,负采样对高频词和低维向量有着更好的效果。** 159 |
160 | 161 |   层次softmax利用二叉树来表示词表中的所有词,树的每个叶子都是一个单词,从根到叶子节点只有唯一的一条路径。每个词没有输出表示,图的每个节点(除了根和叶)都是模型要学习的向量。 162 |   在层次softmax中,单词$w$的向量为$w_i$。$P(w|w_i)$是从根随机游走到叶子节点$w$的概率。最大的优点就是这种计算概率的方式其成本为$O(log(|V|))$,与路径长度相关。 163 |   令$L(w)$为从根到叶子$w$路径上的节点个数,令$n(w,i)$为路径上的第$i$个节点。因此,$n(w,1)$是根节点,$n(w,L(w))$表示的是节点$w$。对于每个节点$n$,我们可以选择其的一个孩子称为$ch(n)$(总是左节点)。我们计算$P(w|w_i)$为: 164 |
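缺失的公式按上面的记号,标准写法为:
$$P(w|w_i)=\prod_{j=1}^{L(w)-1}\sigma\Big([n(w,j+1)=ch(n(w,j))]\cdot v_{n(w,j)}^{T}v_{w_i}\Big)$$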
165 | 其中: 166 |
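$[x]$ 为指示函数:
$$[x]=\begin{cases}1, & x\ \text{为真}\\-1, & \text{否则}\end{cases}$$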
167 | 168 | $\sigma(\cdot)$是sigmoid函数。 169 |   分析上述的公式:首先,我们计算从根到叶子节点路径上各项的乘积。因为我们假设了$ch(n)$总是$n$的左节点,因此当路径游走到左节点时$[n(w,j+1)=ch(n(w,j))]$为1,游走到右节点时为-1。 170 |   此外,$[n(w,j+1)=ch(n(w,j))]$起到一种归一化的作用。对于任意节点$n$,游走到左边的概率与游走到右边的概率之和为1: 171 | $$\sigma(v_n^Tv_{w_i})+\sigma(-v_n^Tv_{w_i})=1$$ 172 | 173 | 这样确保了$\sum_{w=1}^{|V|}P(w|w_i)=1$,与原始的softmax一致。 174 |   最后,我们利用点乘来比较输入向量$v_{w_i}$和每个内部节点向量$v_{n(w,j)}$的相似度。对于二叉树图示中的例子来讲,要到达$w_2$,我们需要从根部向左走两次、向右走一次: 175 |
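缺失的示例公式按“左、左、右”的路径大致为:
$$P(w_2|w_i)=p(n(w_2,1),left)\cdot p(n(w_2,2),left)\cdot p(n(w_2,3),right)=\sigma(v_{n(w_2,1)}^{T}v_{w_i})\cdot\sigma(v_{n(w_2,2)}^{T}v_{w_i})\cdot\sigma(-v_{n(w_2,3)}^{T}v_{w_i})$$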
176 | 177 |   训练模型的时候,我们的目标依然是最小化负对数似然:$-\log P(w|w_i)$,但是这里我们不需要更新每个单词的向量,只需要更新该路径上经过的节点的向量即可。 178 |   这种方法的速度取决于二叉树的构造方式和单词分配给叶节点的方式。Mikolov利用二叉霍夫曼树,其特点是高频词在树中有着更短的路径。 179 | 180 |
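结合本仓库 text_classification/utils/generate_w2v.py 中用到的 gensim 接口,负采样与层次 softmax 在训练时只是参数上的区别。下面是一个最小示例(沿用该脚本的旧版 gensim 参数名 size/iter,语料为假设):
```
import gensim

sentences = [['I', 'enjoy', 'flying'],
             ['I', 'like', 'NLP'],
             ['I', 'like', 'deep', 'learning']]   # 示例语料(假设)

# 负采样:hs=0、negative=5,每个正样本采 5 个负例
model_ns = gensim.models.Word2Vec(sentences, size=200, window=5, min_count=0,
                                  sg=1, hs=0, negative=5, iter=10)

# 层次 softmax:hs=1、negative=0,内部按词频构建霍夫曼树
model_hs = gensim.models.Word2Vec(sentences, size=200, window=5, min_count=0,
                                  sg=1, hs=1, negative=0, iter=10)

print(model_ns.wv['NLP'][:5])
```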
181 | 182 | ### References 183 | [Rumelhart et al., 1988] Rumelhart, D. E., Hinton, G. E., and Williams, R. J. (1988).Neurocomputing: Foundations of research. chapter Learning Representations by Back-propagating Errors, pages 696-699. MIT Press, Cambridge, MA, USA. 184 | [Collobert et al., 2011] Collobert, R., Weston, J., Bottou, L., Karlen, M., Kavukcuoglu, K., and Kuksa, P. P. (2011). Natural language processing (almost) from scratch. CoRR, abs/ 1103. 0398. 185 | [Mikolov et al., 2013] Mikolov, T., Chen, K., Corrado, G., and Dean, J. (2013). Efficient estimation of word representations in vector space. CoRR, abs/ 1301. 3781. 186 | 187 | -------------------------------------------------------------------------------- /text_classification/examples/test_demo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import tensorflow as tf 6 | import pickle 7 | import time 8 | import os 9 | import sys 10 | sys.path.append('..') 11 | 12 | from utils import data_helper 13 | from models.FastText import FastText 14 | from models.TextCNN import TextCNN 15 | from models.TextRNN import TextRNN 16 | from models.TextBiLSTM import TextBiLSTM 17 | 18 | FLAGS = tf.app.flags.FLAGS 19 | # Data params 20 | tf.app.flags.DEFINE_string('data_path', '../text_data/input_data/', 'input data path') 21 | # Model params 22 | tf.app.flags.DEFINE_string("filter_sizes", "2,3,4", "textcnn model, convolution filter sizes") 23 | tf.app.flags.DEFINE_integer("num_filters", 2, "textcnn model, convolution filter nums") 24 | tf.app.flags.DEFINE_integer("num_classes", 2, "num_classes") 25 | tf.app.flags.DEFINE_float("keep_prob", 0.5, "Dropout keep probability (default: 0.5)") 26 | tf.app.flags.DEFINE_integer("hidden_num", 2, "Number of RNNCell num") 27 | tf.app.flags.DEFINE_integer("hidden_size", 2, "Number of RNN layers") 28 | # Training params 29 | tf.app.flags.DEFINE_float("learning_rate", 0.01, "learning_rate (default: 0.01)") 30 | tf.app.flags.DEFINE_integer("epochs", 10, "Number of training epochs (default: 10)") 31 | tf.app.flags.DEFINE_integer("batch_size", 512, "Batch Size (default: 64)") 32 | tf.app.flags.DEFINE_integer("checkpoint_every", 100, "Save model every steps (default: 100)") 33 | tf.app.flags.DEFINE_string("checkpoint_dir", './model_save/', "checkpoint_dir") 34 | 35 | train_x, train_y, valid_x, valid_y, embedding, word2index, index2word, vocab_size, maxlen = data_helper.load_data('../text_data/input_data/') 36 | print(train_x.shape) 37 | print(vocab_size) 38 | print(embedding.shape) 39 | print(embedding.dtype) 40 | print(maxlen) 41 | 42 | 43 | # model = FastText( 44 | # num_classes=FLAGS.num_classes, 45 | # sequence_length=maxlen, 46 | # w2v_model_embedding=embedding, 47 | # vocab_size=vocab_size, 48 | # embedding_size=200) 49 | 50 | # model = TextCNN(filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))), 51 | # num_filters=FLAGS.num_filters, 52 | # num_classes=FLAGS.num_classes, 53 | # sequence_length=maxlen, 54 | # w2v_model_embedding=embedding, 55 | # vocab_size=vocab_size, 56 | # embedding_size=200) 57 | 58 | # model =TextRNN(num_classes=FLAGS.num_classes, 59 | # sequence_length=maxlen, 60 | # w2v_model_embedding=embedding, 61 | # vocab_size=vocab_size, 62 | # embedding_size=200, 63 | # hidden_num=FLAGS.hidden_num, 64 | # hidden_size=FLAGS.hidden_size, 65 | # keep_prob=0.5) 66 | 67 | model =TextBiLSTM(num_classes=FLAGS.num_classes, 68 | sequence_length=maxlen, 69 | 
w2v_model_embedding=embedding, 70 | vocab_size=vocab_size, 71 | embedding_size=200, 72 | hidden_num=FLAGS.hidden_num, 73 | keep_prob=0.5) 74 | 75 | optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate) 76 | model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics='accuracy') 77 | model.fit(train_x, train_y, 78 | batch_size=128, 79 | epochs=2, 80 | verbose=1, 81 | valid_x=valid_x, 82 | valid_y=valid_y, 83 | ) 84 | predict_scores = model.predict(train_x) 85 | print(predict_scores[:5]) 86 | 87 | 88 | -------------------------------------------------------------------------------- /text_classification/models/BaseModel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import numpy as np 5 | import tensorflow as tf 6 | 7 | class TextClassifierBaseModel(object): 8 | def __init__(self, num_classes=None, sequence_length=None, 9 | w2v_model_embedding=None, vocab_size=None, embedding_size=200, 10 | initializer=tf.random_normal_initializer(stddev=0.1), 11 | l2_reg_lambda=0.001): 12 | self.num_classes = num_classes 13 | self.sequence_length = sequence_length 14 | if w2v_model_embedding is not None: 15 | self.w2v_model_embedding = tf.cast(w2v_model_embedding, tf.float32) 16 | else: 17 | self.w2v_model_embedding = None 18 | self.vocab_size = vocab_size 19 | self.embedding_size = embedding_size 20 | self.initializer = initializer 21 | self.l2_reg_lambda = l2_reg_lambda 22 | self.l2_loss = tf.constant(0.0) 23 | 24 | self.input_x = tf.placeholder(tf.int32, [None, self.sequence_length], name='input_x') 25 | self.input_y = tf.placeholder(tf.int32, [None, self.num_classes], name='label') 26 | 27 | self.logits = None 28 | 29 | def _initialize_embedding(self): 30 | with tf.name_scope('embedding'): 31 | if self.w2v_model_embedding is None: 32 | self.Embedding = tf.get_variable(name='embedding', 33 | shape=[self.vocab_size, self.embedding_size], 34 | initializer=self.initializer) # [vocab_size, embedding_size] 35 | else: 36 | self.Embedding = tf.get_variable(name='embedding', 37 | initializer=self.w2v_model_embedding, 38 | dtype=tf.float32) 39 | 40 | def _initialize_weights(self): 41 | with tf.name_scope('weights'): 42 | self.W = tf.get_variable(name='W', 43 | shape=[self.embedding_size, self.num_classes], 44 | initializer=self.initializer) 45 | self.b = tf.get_variable(name='b', shape=[self.num_classes]) 46 | 47 | def _inference(self): 48 | sentence_embedding = tf.nn.embedding_lookup(self.Embedding, self.input_x) 49 | 50 | self.sentence_embedding = tf.reduce_mean(sentence_embedding, axis=1) # [None, self.embedding_size] 51 | 52 | with tf.name_scope('output'): 53 | logits = tf.matmul(self.sentence_embedding, self.W) + self.b 54 | return logits 55 | 56 | def _loss(self): 57 | with tf.name_scope('loss'): 58 | self.l2_loss += tf.nn.l2_loss(self.b) 59 | losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y) 60 | loss = tf.reduce_mean(losses) + self.l2_reg_lambda * self.l2_loss 61 | return loss 62 | 63 | def _accuracy(self): 64 | with tf.name_scope('accuracy'): 65 | self.prediction = tf.argmax(self.logits, 1, name='prediction') 66 | correct_predictions = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.input_y, 1)) 67 | accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32), name='accuracy') 68 | return accuracy 69 | 70 | def compile(self, optimizer, loss, metrics=None): 71 | if loss == 'binary_crossentropy': 72 | self.loss= self._loss() 73 | if metrics == 
'accuracy': 74 | self.accuracy = self._accuracy() 75 | grads_and_vars = optimizer.compute_gradients(self.loss) 76 | self.train_op = optimizer.apply_gradients(grads_and_vars) 77 | 78 | def _next_batch(self, train_x, train_y=None, epochs=1, batch_size=None, shuffle=True): 79 | data_size = len(train_x) 80 | num_batches_per_epoch = int(data_size / batch_size) + 1 81 | 82 | for _ in range(epochs): 83 | if shuffle: 84 | shuffle_indices = np.random.permutation(np.arange(data_size)) 85 | shuffled_data = train_x[shuffle_indices] 86 | if train_y is not None: 87 | shuffled_data_y = train_y[shuffle_indices] 88 | else: 89 | shuffled_data = train_x 90 | if train_y is not None: 91 | shuffled_data_y = train_y 92 | 93 | for batch_num in range(num_batches_per_epoch): 94 | start_index = batch_num * batch_size 95 | end_index = min((batch_num + 1) * batch_size, data_size) 96 | 97 | if train_y is None: 98 | yield shuffled_data[start_index:end_index] 99 | else: 100 | yield shuffled_data[start_index:end_index], shuffled_data_y[start_index:end_index] 101 | 102 | def fit(self, x=None, y=None, batch_size=None, epochs=1, verbose=1, valid_x=None, valid_y=None, checkpoint_dir=None): 103 | config = tf.ConfigProto() 104 | config.gpu_options.allow_growth = True 105 | self.sess = tf.Session(config=config) 106 | if checkpoint_dir: 107 | saver = tf.train.Saver(tf.global_variables(), max_to_keep=5) 108 | checkpoint_dir = os.path.join(checkpoint_dir, "checkpoints") 109 | checkpoint_prefix = os.path.join(checkpoint_dir, "model") 110 | 111 | if os.path.exists(checkpoint_dir): 112 | print("Restoring Variables from Checkpoint.") 113 | saver.restore(self.sess, tf.train.latest_checkpoint(checkpoint_dir)) 114 | else: 115 | print('Initializing Variables') 116 | self.sess.run(tf.global_variables_initializer()) 117 | os.makedirs(checkpoint_dir) 118 | else: 119 | self.sess.run(tf.global_variables_initializer()) 120 | 121 | train_step = 0 122 | for epoch in range(epochs): 123 | step = 0 124 | for batch_x, batch_y in self._next_batch(x, y, batch_size=batch_size): 125 | feed_dict = {self.input_x: batch_x, 126 | self.input_y: batch_y, 127 | } 128 | self.sess.run([self.loss, self.accuracy, self.train_op], feed_dict) 129 | train_step += 1 130 | step += 1 131 | 132 | if step % verbose == 0: 133 | feed_dict = {self.input_x: batch_x, 134 | self.input_y: batch_y, 135 | } 136 | train_loss, train_acc = self.sess.run([self.loss, self.accuracy], feed_dict) 137 | 138 | if valid_x is not None: 139 | feed_dict = {self.input_x: valid_x, 140 | self.input_y: valid_y, 141 | } 142 | val_loss, val_acc = self.sess.run([self.loss, self.accuracy], feed_dict) 143 | print('Epoch {}\tBatch {}\tTrain Loss:{:.4f}\tTrain Acc:{:.4f}\tValid Loss:{:.4f}\tValid Acc:{:.4f}'.format( 144 | epoch, step, train_loss, train_acc, val_loss, val_acc)) 145 | else: 146 | print('Epoch {}\tBatch {}\tTrain Loss:{:.4f}\tTrain Acc:{:.4f}'.format(epoch, step, train_loss, train_acc)) 147 | 148 | if checkpoint_dir: 149 | if train_step % 50 == 0: 150 | print("Going to save model..") 151 | saver.save(self.sess, checkpoint_prefix, global_step=train_step) 152 | 153 | def predict(self, x, batch_size=None, verbose=0, checkpoint_dir=None): 154 | predict_scores = [] 155 | if not checkpoint_dir: 156 | sess = self.sess 157 | else: 158 | print('Restore model from checkpoint.') 159 | config = tf.ConfigProto() 160 | config.gpu_options.allow_growth = True 161 | sess = tf.Session(config=config) 162 | saver = tf.train.Saver() 163 | checkpoint_dir = os.path.join(checkpoint_dir, "checkpoints") 164 | 
saver.restore(sess, tf.train.latest_checkpoint(checkpoint_dir)) 165 | 166 | if batch_size is None: 167 | predict_scores = sess.run(self.logits, feed_dict={self.input_x: x}) 168 | else: 169 | for batch_x in self._next_batch(x, batch_size=batch_size): 170 | batch_result = sess.run(self.logits, feed_dict={self.input_x: batch_x}) 171 | predict_scores += batch_result.tolist() 172 | 173 | return np.array(predict_scores) 174 | 175 | -------------------------------------------------------------------------------- /text_classification/models/FastText.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import numpy as np 5 | import tensorflow as tf 6 | 7 | from .BaseModel import TextClassifierBaseModel 8 | 9 | class FastText(TextClassifierBaseModel): 10 | def __init__(self, num_classes, sequence_length, 11 | w2v_model_embedding, vocab_size, embedding_size, 12 | initializer=tf.random_normal_initializer(stddev=0.1), 13 | l2_reg_lambda=0.001): 14 | super(FastText, self).__init__(num_classes=num_classes, sequence_length=sequence_length, 15 | w2v_model_embedding=w2v_model_embedding, vocab_size=vocab_size, embedding_size=embedding_size, 16 | initializer=tf.random_normal_initializer(stddev=0.1), 17 | l2_reg_lambda=0.001) 18 | 19 | self._initialize_embedding() 20 | self._initialize_weights() 21 | self.logits = self._inference() 22 | print(self.logits) 23 | 24 | def _inference(self): 25 | sentence_embedding = tf.nn.embedding_lookup(self.Embedding, self.input_x) 26 | 27 | self.sentence_embedding = tf.reduce_mean(sentence_embedding, axis=1) # [None, self.embedding_size] 28 | 29 | with tf.name_scope('output'): 30 | logits = tf.matmul(self.sentence_embedding, self.W) + self.b 31 | return logits 32 | -------------------------------------------------------------------------------- /text_classification/models/TextBiLSTM.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | from .BaseModel import TextClassifierBaseModel 7 | 8 | class TextBiLSTM(TextClassifierBaseModel): 9 | def __init__(self, num_classes, sequence_length, 10 | w2v_model_embedding, vocab_size, embedding_size, 11 | hidden_num, keep_prob, 12 | initializer=tf.random_normal_initializer(stddev=0.1), 13 | l2_reg_lambda=0.001): 14 | super(TextBiLSTM, self).__init__(num_classes=num_classes, sequence_length=sequence_length, 15 | w2v_model_embedding=w2v_model_embedding, vocab_size=vocab_size, embedding_size=embedding_size, 16 | initializer=tf.random_normal_initializer(stddev=0.1), 17 | l2_reg_lambda=0.001) 18 | 19 | self.hidden_num = hidden_num 20 | self.keep_prob = keep_prob 21 | 22 | self._initialize_embedding() 23 | self._initialize_weights() 24 | self.logits = self._inference() 25 | 26 | def _initialize_weights(self): 27 | with tf.name_scope('weights'): 28 | self.W = tf.get_variable(name='W', 29 | shape=[self.hidden_num * 2, self.num_classes], 30 | initializer=self.initializer) 31 | self.b = tf.get_variable(name='b', shape=[self.num_classes]) 32 | 33 | def _inference(self): 34 | 35 | self.embedding_words = tf.nn.embedding_lookup(self.Embedding, self.input_x) 36 | 37 | rnn_drop = self._bilstm_layer() 38 | 39 | with tf.name_scope('output'): 40 | logits = tf.matmul(rnn_drop, self.W) + self.b 41 | 42 | return logits 43 | 44 | def _bilstm_layer(self): 45 | fw_cell = tf.contrib.rnn.BasicLSTMCell(self.hidden_num, state_is_tuple=True) 46 | bw_cell = 
tf.contrib.rnn.BasicLSTMCell(self.hidden_num, state_is_tuple=True) 47 | 48 | with tf.name_scope("dropout"): 49 | fw_cell = tf.contrib.rnn.DropoutWrapper(fw_cell, output_keep_prob=self.keep_prob) 50 | bw_cell = tf.contrib.rnn.DropoutWrapper(bw_cell, output_keep_prob=self.keep_prob) 51 | 52 | outputs, _ = tf.nn.bidirectional_dynamic_rnn(fw_cell, bw_cell, 53 | inputs=self.embedding_words, 54 | dtype=tf.float32) 55 | outputs = tf.concat(outputs, axis=2) 56 | output = tf.reduce_mean(outputs, axis=1) 57 | 58 | return output 59 | -------------------------------------------------------------------------------- /text_classification/models/TextCNN.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import numpy as np 5 | import tensorflow as tf 6 | 7 | from .BaseModel import TextClassifierBaseModel 8 | 9 | class TextCNN(TextClassifierBaseModel): 10 | def __init__(self, filter_sizes, num_filters, num_classes, sequence_length, 11 | w2v_model_embedding, vocab_size, embedding_size, 12 | keep_prob=0.5, 13 | initializer=tf.random_normal_initializer(stddev=0.1), 14 | l2_reg_lambda=0.001): 15 | super(TextCNN, self).__init__(num_classes=num_classes, sequence_length=sequence_length, 16 | w2v_model_embedding=w2v_model_embedding, vocab_size=vocab_size, embedding_size=embedding_size, 17 | initializer=tf.random_normal_initializer(stddev=0.1), 18 | l2_reg_lambda=0.001) 19 | 20 | self.filter_sizes = filter_sizes 21 | self.num_filters = num_filters 22 | self.num_filters_total = self.num_filters * len(self.filter_sizes) 23 | self.keep_prob = keep_prob 24 | 25 | self._initialize_embedding() 26 | self._initialize_weights() 27 | self.logits = self._inference() 28 | 29 | def _initialize_weights(self): 30 | with tf.name_scope('weights'): 31 | self.W = tf.get_variable(name='W', 32 | shape=[self.num_filters_total, self.num_classes], 33 | initializer=self.initializer) 34 | self.b = tf.get_variable(name='b', shape=[self.num_classes]) 35 | 36 | def _inference(self): 37 | self.embedding_words = tf.nn.embedding_lookup(self.Embedding, self.input_x) # [None, sequence_length, embedding_size] 38 | # [None, sequence_length, embedding_size, 1]. expand dimension so meet input requirement of 2d-conv 39 | self.sentence_embedding_expanded = tf.expand_dims(self.embedding_words, -1) 40 | 41 | conv_out = self._conv_layer() 42 | 43 | with tf.name_scope('output'): 44 | logits = tf.matmul(conv_out, self.W) + self.b 45 | return logits 46 | 47 | def _conv_layer(self): 48 | pooled_outputs = [] 49 | for i, filter_size in enumerate(self.filter_sizes): 50 | with tf.variable_scope('convolution-pooling-{}'.format(i)): 51 | filter = tf.get_variable(name='filter-{}'.format(filter_size), 52 | shape=[filter_size, self.embedding_size, 1, self.num_filters], 53 | initializer=self.initializer,) 54 | # Conv.Input: given an input tensor of shape `[batch, in_height, in_width, in_channels]` and a filter / kernel tensor of shape `[filter_height, filter_width, in_channels, out_channels]` 55 | # Conv.Returns: A `Tensor`. Has the same type as `input`. 
56 | conv = tf.nn.conv2d(self.sentence_embedding_expanded, 57 | filter, 58 | strides=[1, 1, 1, 1], 59 | padding='VALID', 60 | name='conv') 61 | 62 | b = tf.get_variable(name='b-{}'.format(filter_size), shape=[self.num_filters]) 63 | h = tf.nn.relu(tf.nn.bias_add(conv, b), 'relu') 64 | 65 | pooled = tf.nn.max_pool(h, 66 | ksize=[1, self.sequence_length - filter_size + 1, 1, 1], 67 | strides=[1, 1, 1, 1], 68 | padding='VALID', 69 | name='pool') 70 | pooled_outputs.append(pooled) 71 | 72 | h_pool = tf.concat(pooled_outputs, 3) 73 | h_pool_flatten = tf.reshape(h_pool, [-1, self.num_filters_total]) 74 | 75 | with tf.name_scope('dropout'): 76 | h_drop = tf.nn.dropout(h_pool_flatten, self.keep_prob) 77 | 78 | return h_drop 79 | -------------------------------------------------------------------------------- /text_classification/models/TextRCNN.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zspo/NLP-Space/1c9891be0f44cf618d5b7fc1be2b03e12ce7fc24/text_classification/models/TextRCNN.py -------------------------------------------------------------------------------- /text_classification/models/TextRNN.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | from .BaseModel import TextClassifierBaseModel 7 | 8 | class TextRNN(TextClassifierBaseModel): 9 | def __init__(self, num_classes, sequence_length, 10 | w2v_model_embedding, vocab_size, embedding_size, 11 | hidden_num, hidden_size, keep_prob, 12 | initializer=tf.random_normal_initializer(stddev=0.1), 13 | l2_reg_lambda=0.001): 14 | super(TextRNN, self).__init__(num_classes=num_classes, sequence_length=sequence_length, 15 | w2v_model_embedding=w2v_model_embedding, vocab_size=vocab_size, embedding_size=embedding_size, 16 | initializer=tf.random_normal_initializer(stddev=0.1), 17 | l2_reg_lambda=0.001) 18 | 19 | self.hidden_num = hidden_num 20 | self.hidden_size = hidden_size 21 | self.keep_prob = keep_prob 22 | 23 | self._initialize_embedding() 24 | self._initialize_weights() 25 | self.logits = self._inference() 26 | 27 | def _initialize_weights(self): 28 | with tf.name_scope('weights'): 29 | self.W = tf.get_variable(name='W', 30 | shape=[self.hidden_size, self.num_classes], 31 | initializer=self.initializer) 32 | self.b = tf.get_variable(name='b', shape=[self.num_classes]) 33 | 34 | def _inference(self): 35 | 36 | self.embedding_words = tf.nn.embedding_lookup(self.Embedding, self.input_x) 37 | 38 | rnn_drop = self._rnn_layer() 39 | 40 | with tf.name_scope('output'): 41 | logits = tf.matmul(rnn_drop, self.W) + self.b 42 | 43 | return logits 44 | 45 | def _rnn_layer(self): 46 | cells = [] 47 | for _ in range(self.hidden_size): 48 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(self.hidden_num, state_is_tuple=True) 49 | lstm_cell = tf.contrib.rnn.DropoutWrapper(lstm_cell, output_keep_prob=self.keep_prob) 50 | cells.append(lstm_cell) 51 | cell = tf.nn.rnn_cell.MultiRNNCell(cells, state_is_tuple=True) 52 | 53 | outputs, _ = tf.nn.dynamic_rnn(cell, 54 | inputs=self.embedding_words, 55 | dtype=tf.float32) 56 | outputs = tf.concat(outputs, axis=2) 57 | output = tf.reduce_mean(outputs, axis=1) 58 | 59 | return output 60 | -------------------------------------------------------------------------------- /text_classification/models/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zspo/NLP-Space/1c9891be0f44cf618d5b7fc1be2b03e12ce7fc24/text_classification/models/__init__.py -------------------------------------------------------------------------------- /text_classification/online/utils/ckpt2pb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import tensorflow as tf 5 | from tensorflow.python.framework import graph_util 6 | 7 | def freeze_graph(ckpt_model_dir, output_graph): 8 | ''' 9 | :param input_checkpoint: 10 | :param output_graph: PB模型保存路径 11 | :return: 12 | ''' 13 | checkpoint = tf.train.get_checkpoint_state(ckpt_model_dir)#检查目录下ckpt文件状态是否可用 14 | if not checkpoint: 15 | print('dir not') 16 | exit() 17 | input_checkpoint = checkpoint.model_checkpoint_path #得ckpt文件路径 18 | 19 | # 指定输出的节点名称,该节点名称必须是原模型中存在的节点 20 | # 直接用最后输出的节点,可以在tensorboard中查找到,tensorboard只能在linux中使用 21 | output_node_names = "output/add" 22 | saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=True) 23 | graph = tf.get_default_graph() # 获得默认的图 24 | 25 | input_graph_def = tf.get_default_graph().as_graph_def() 26 | 27 | node_names = [n.name for n in input_graph_def.node] 28 | for node in node_names: 29 | print(node) 30 | 31 | with tf.Session() as sess: 32 | saver.restore(sess, input_checkpoint) #恢复图并得到数据 33 | output_graph_def = graph_util.convert_variables_to_constants(sess=sess, # 模型持久化,将变量值固定 34 | input_graph_def=input_graph_def, # 等于:sess.graph_def 35 | output_node_names=output_node_names.split(",")) # 如果有多个输出节点,以逗号隔开 36 | 37 | with tf.gfile.GFile(output_graph, "wb") as f: #保存模型 38 | f.write(output_graph_def.SerializeToString()) #序列化输出 39 | print("%d ops in the final graph." % len(output_graph_def.node)) #得到当前图有几个操作节点 40 | 41 | input_checkpoint='text_classification/examples/model_save/checkpoints/' 42 | out_pb_path='text_classification/online/pb_model/' 43 | freeze_graph(input_checkpoint, out_pb_path) -------------------------------------------------------------------------------- /text_classification/online/utils/ckpt2save.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import sys 5 | import tensorflow as tf 6 | from tensorflow.python.framework import graph_util 7 | 8 | 9 | def freeze_graph(ckpt_model_dir, export_path_base, model_version): 10 | ''' 11 | :param input_checkpoint: 12 | :param output_graph: PB模型保存路径 13 | :return: 14 | ''' 15 | checkpoint = tf.train.get_checkpoint_state(ckpt_model_dir)#检查目录下ckpt文件状态是否可用 16 | if not checkpoint: 17 | print('dir not') 18 | exit() 19 | input_checkpoint = checkpoint.model_checkpoint_path #得ckpt文件路径 20 | 21 | export_path = os.path.join(tf.compat.as_bytes(export_path_base), 22 | tf.compat.as_bytes(str(model_version))) 23 | print('Exporting trained model to', export_path) 24 | builder = tf.saved_model.builder.SavedModelBuilder(export_path) 25 | 26 | saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=True) 27 | 28 | # input_graph_def = tf.get_default_graph().as_graph_def() 29 | # node_names = [n.name for n in input_graph_def.node] 30 | # for node in node_names: 31 | # print(node) 32 | 33 | with tf.Session() as sess: 34 | saver.restore(sess, input_checkpoint) #恢复图并得到数据 35 | input_x = sess.graph.get_tensor_by_name('input_x:0') 36 | output = sess.graph.get_tensor_by_name('output/add:0') 37 | 38 | tensor_info_x = tf.saved_model.utils.build_tensor_info(input_x) # 输入 39 | tensor_info_y = 
tf.saved_model.utils.build_tensor_info(output) # 输出 40 | 41 | prediction_signature = tf.saved_model.signature_def_utils.build_signature_def(inputs={'x': tensor_info_x}, 42 | outputs={'y': tensor_info_y}, 43 | method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME 44 | ) 45 | 46 | legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op') 47 | builder.add_meta_graph_and_variables(sess, 48 | [tf.saved_model.tag_constants.SERVING], 49 | signature_def_map={'predictions': prediction_signature}, 50 | legacy_init_op=legacy_init_op) 51 | 52 | builder.save() 53 | 54 | print('Done exporting!') 55 | 56 | input_checkpoint='text_classification/examples/model_save/checkpoints/' 57 | out_pb_path='text_classification/online/save_model/' 58 | freeze_graph(input_checkpoint, out_pb_path, 1) -------------------------------------------------------------------------------- /text_classification/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zspo/NLP-Space/1c9891be0f44cf618d5b7fc1be2b03e12ce7fc24/text_classification/utils/__init__.py -------------------------------------------------------------------------------- /text_classification/utils/data_helper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import sys 5 | import pickle 6 | import gensim 7 | import numpy as np 8 | import pandas as pd 9 | from sklearn.model_selection import train_test_split 10 | from keras.preprocessing.text import Tokenizer 11 | from keras.preprocessing.sequence import pad_sequences 12 | from keras.utils import to_categorical 13 | 14 | def read_and_process_data(data_path, w2v_model, save_path): 15 | train_data = [] 16 | train_label = [] 17 | label_map = {'Positive': 1, 'Negative': 0} 18 | with open(data_path, 'r', encoding='utf-8') as f: 19 | for index, line in enumerate(f.readlines()): 20 | if index == 0: 21 | continue 22 | line = line.strip().split(',') 23 | if len(line) != 3: 24 | continue 25 | if line[-1] not in label_map: 26 | continue 27 | s_id, content, label = line 28 | train_data.append(content) 29 | train_label.append(label_map[label]) 30 | train_label = to_categorical(train_label, num_classes=2) 31 | 32 | train_x, valid_x, train_y, valid_y = train_test_split(train_data, train_label, test_size=0.15, random_state=2020) 33 | 34 | maxlen = max([len(c.split(' ')) for c in train_data]) 35 | 36 | ## Tokenize the sentences 37 | tokenizer = Tokenizer() 38 | tokenizer.fit_on_texts(train_data) 39 | word2index = tokenizer.word_index 40 | print(len(word2index)) 41 | embedding = generate_embedding(word2index, w2v_model) 42 | 43 | train_x = tokenizer.texts_to_sequences(train_x) 44 | train_x = pad_sequences(train_x, maxlen=maxlen) 45 | 46 | valid_x = tokenizer.texts_to_sequences(valid_x) 47 | valid_x = pad_sequences(valid_x, maxlen=maxlen) 48 | 49 | np.save(save_path + 'train_x.npy', train_x) 50 | np.save(save_path + 'train_y.npy', train_y) 51 | np.save(save_path + 'valid_x.npy', valid_x) 52 | np.save(save_path + 'valid_y.npy', valid_y) 53 | np.save(save_path + 'embedding.npy', embedding) 54 | 55 | pickle.dump(word2index, open(save_path + 'word2index.pkl', 'wb')) 56 | 57 | print('vocab size: {}'.format(len(word2index) + 1)) 58 | 59 | # maxlen = train_x.shape[1] 60 | # vocab_size = len(word2index) + 1 61 | # index2word = {v: k for k, v in word2index.items()} 62 | 63 | def generate_embedding(word2index, w2v_model): 64 | embedding = 
np.zeros((len(word2index) + 1, 200)) 65 | for word, index in word2index.items(): 66 | try: 67 | embedding[index] = w2v_model[word] 68 | except: 69 | continue 70 | return embedding 71 | 72 | def load_w2v_model(w2v_model_path): 73 | return gensim.models.KeyedVectors.load_word2vec_format(w2v_model_path, binary=False) 74 | 75 | def data_process(content): 76 | pass 77 | 78 | def filter_stop_words(content, stop_words): 79 | pass 80 | 81 | def next_batch(train_x, train_y, batch_size, shuffle=True): 82 | data_size = len(train_x) 83 | num_batches_per_epoch = int(data_size / batch_size) + 1 84 | # while True: 85 | if shuffle: 86 | shuffle_indices = np.random.permutation(np.arange(data_size)) 87 | shuffled_data = train_x[shuffle_indices] 88 | shuffled_data_y = train_y[shuffle_indices] 89 | else: 90 | shuffled_data, shuffled_data_y = train_x, train_y 91 | 92 | for batch_num in range(num_batches_per_epoch): 93 | start_index = batch_num * batch_size 94 | end_index = min((batch_num + 1) * batch_size, data_size) 95 | 96 | yield shuffled_data[start_index:end_index], shuffled_data_y[start_index:end_index] 97 | 98 | def load_data(data_path): 99 | train_x = np.load(data_path + 'train_x.npy') 100 | train_y = np.load(data_path + 'train_y.npy') 101 | valid_x = np.load(data_path + 'valid_x.npy') 102 | valid_y = np.load(data_path + 'valid_y.npy') 103 | embedding = np.load(data_path + 'embedding.npy') 104 | 105 | word2index = pickle.load(open(data_path + 'word2index.pkl', 'rb')) 106 | index2word = {v: k for k, v in word2index.items()} 107 | vocab_size = len(word2index) + 1 108 | maxlen = len(train_x[0]) 109 | 110 | return train_x, train_y, valid_x, valid_y, embedding, word2index, index2word, vocab_size, maxlen 111 | 112 | if __name__ == "__main__": 113 | data_path = '../text_data/raw_data/train.csv' 114 | w2v_model_path = '../text_data/w2v_model/text_w2v_model.txt' 115 | w2v_model = load_w2v_model(w2v_model_path) 116 | read_and_process_data(data_path, w2v_model, '../text_data/input_data/') -------------------------------------------------------------------------------- /text_classification/utils/generate_w2v.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import sys 5 | import time 6 | import numpy as np 7 | import gensim 8 | 9 | def load_data(data_path): 10 | train_data = [] 11 | with open(data_path, 'r', encoding='utf-8') as f: 12 | for index, line in enumerate(f.readlines()): 13 | if index == 0: 14 | continue 15 | line = line.strip().split(',') 16 | if len(line) != 3: 17 | continue 18 | content = line[1] 19 | train_data.append(content.strip().split()) 20 | return train_data 21 | 22 | def train_w2v(train_data, model_path): 23 | start_time = time.time() 24 | model = gensim.models.Word2Vec(train_data, size=200, window=5, min_count=0, workers=3, iter=10) 25 | print('train done, time used {:.4f} min.'.format((time.time() - start_time) / 60)) 26 | print(len(model.wv.vocab)) 27 | model.wv.save_word2vec_format(model_path, binary=False) 28 | 29 | if __name__ == "__main__": 30 | train_data = load_data('../text_data/raw_data/train.csv') 31 | print(len(train_data)) 32 | print(train_data[:3]) 33 | 34 | train_w2v(train_data, '../text_data//w2v_model/text_w2v_model.txt') -------------------------------------------------------------------------------- /text_matching/esim/ESIM.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from transformers import 
AutoTokenizer,AutoModel 4 | from model.SoftAttention import SoftmaxAttention 5 | 6 | 7 | ### BERT_ESIM 8 | ### ESIM部分来自 https://github.com/coetaur0/ESIM 项目 9 | class ESIM(nn.Module): 10 | 11 | def __init__(self,bert_path='/data1/zsp/PreTrainModelStorage/self_pretrained_bert_11G/', 12 | in_size = 768, 13 | hidden_size = 300, 14 | out_size = 2, 15 | dropout = 0.5 16 | ): 17 | super().__init__() 18 | self.in_size = in_size 19 | self.hidden_size = hidden_size 20 | self.out_size = out_size 21 | self.dropout = dropout 22 | 23 | ## embedding 24 | self.embedding = AutoModel.from_pretrained(bert_path) 25 | for i,(name,para) in enumerate(self.embedding.named_parameters()): 26 | para.requires_grad = False ## 冻结作为词向量 27 | # print(i,name,para.requires_grad) 28 | 29 | ## ESIM 30 | self.encoding = nn.LSTM(self.in_size,self.hidden_size,bidirectional=True) 31 | self.attention = SoftmaxAttention() 32 | self.projection = nn.Sequential(nn.Linear(4*2*self.hidden_size,self.hidden_size), ## 4表示拼接的4个向量/2表示双向 33 | nn.ReLU()) 34 | self.composition = nn.LSTM(self.hidden_size,self.hidden_size,bidirectional=True) 35 | self.classifer = nn.Sequential(nn.Dropout(p=self.dropout), 36 | nn.Linear(2*4*self.hidden_size, 37 | self.hidden_size), 38 | nn.Tanh(), 39 | nn.Dropout(p=self.dropout), 40 | nn.Linear(self.hidden_size, 41 | self.out_size)) 42 | 43 | def forward(self, 44 | premises, 45 | premise_mask, 46 | premise_seg_ids, 47 | hypotheses, 48 | hypotheses_mask, 49 | hypo_seg_ids, 50 | ): 51 | embedded_premises = self.embedding(premises,premise_mask,premise_seg_ids) 52 | embedded_premises = embedded_premises[0] 53 | # print(embedded_premises) 54 | # print(type(embedded_premises)) 55 | # print(embedded_premises) 56 | embedded_hypotheses = self.embedding(hypotheses,hypotheses_mask,hypo_seg_ids) 57 | embedded_hypotheses = embedded_hypotheses[0] 58 | # print(type(embedded_premises)) 59 | encoded_premises,_ = self.encoding(embedded_premises) 60 | encoded_hypotheses,_ = self.encoding(embedded_hypotheses) 61 | # print(type(encoded_premises)) 62 | # print(len(encoded_premises[0])) 63 | 64 | 65 | attended_premises, attended_hypotheses =\ 66 | self.attention(encoded_premises, premise_mask, 67 | encoded_hypotheses, hypotheses_mask) 68 | 69 | enhanced_premises = torch.cat([encoded_premises, 70 | attended_premises, 71 | encoded_premises - attended_premises, 72 | encoded_premises * attended_premises], 73 | dim=-1) 74 | enhanced_hypotheses = torch.cat([encoded_hypotheses, 75 | attended_hypotheses, 76 | encoded_hypotheses - attended_hypotheses, 77 | encoded_hypotheses * attended_hypotheses], 78 | dim=-1) 79 | projected_premises = self.projection(enhanced_premises) 80 | projected_hypotheses = self.projection(enhanced_hypotheses) 81 | 82 | v_ai,_ = self.composition(projected_premises) 83 | v_bj,_ = self.composition(projected_hypotheses) 84 | 85 | # print(v_ai) 86 | # print('--------') 87 | # print(premise_mask) 88 | # print('--------') 89 | # print(v_ai.size()) 90 | # print(premise_mask.size()) 91 | # print(torch.sum(v_ai * premise_mask.unsqueeze(1).transpose(2, 1), dim=1)) 92 | v_a_avg = torch.sum(v_ai * premise_mask.unsqueeze(1) 93 | .transpose(2, 1), dim=1)\ 94 | / torch.sum(premise_mask, dim=1, keepdim=True) 95 | v_b_avg = torch.sum(v_bj * hypotheses_mask.unsqueeze(1) 96 | .transpose(2, 1), dim=1)\ 97 | / torch.sum(hypotheses_mask, dim=1, keepdim=True) 98 | 99 | # v_a_max, _ = replace_masked(v_ai, premise_mask, -1e7).max(dim=1) 100 | # v_b_max, _ = replace_masked(v_bj, hypotheses_mask, -1e7).max(dim=1) 101 | v_a_max,_ = v_ai.max(dim=1) 
102 | v_b_max,_ = v_bj.max(dim=1) 103 | 104 | v = torch.cat([v_a_avg, v_a_max, v_b_avg, v_b_max], dim=1) 105 | 106 | logits = self.classifer(v) 107 | probabilities = nn.functional.softmax(logits, dim=-1) 108 | 109 | return logits, probabilities 110 | 111 | 112 | if __name__=='__main__': 113 | tokenizer = AutoTokenizer.from_pretrained('/data1/zsp/PreTrainModelStorage/self_pretrained_bert_11G/') 114 | a = tokenizer(['我喜欢北京']) 115 | input_ids = a['input_ids'] 116 | seg_ids = a['token_type_ids'] 117 | atten_mask = a['attention_mask'] 118 | 119 | b = tokenizer(['我爱北京']) 120 | binput_ids = a['input_ids'] 121 | bseg_ids = a['token_type_ids'] 122 | batten_mask = a['attention_mask'] 123 | # seg_ids 124 | # print(type(a)) 125 | # print(a) 126 | emodel = ESIM() 127 | logits,p = emodel(torch.tensor(input_ids),torch.tensor(seg_ids),torch.tensor(atten_mask), 128 | torch.tensor(binput_ids),torch.tensor(bseg_ids),torch.tensor(batten_mask)) 129 | 130 | print(logits) 131 | print(p) 132 | -------------------------------------------------------------------------------- /text_matching/esim/SoftAttention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class SoftmaxAttention(nn.Module): 6 | """ 7 | Attention layer taking premises and hypotheses encoded by an RNN as input 8 | and computing the soft attention between their elements. 9 | 10 | The dot product of the encoded vectors in the premises and hypotheses is 11 | first computed. The softmax of the result is then used in a weighted sum 12 | of the vectors of the premises for each element of the hypotheses, and 13 | conversely for the elements of the premises. 14 | """ 15 | 16 | def forward(self, 17 | premise_batch, 18 | premise_mask, 19 | hypothesis_batch, 20 | hypothesis_mask): 21 | """ 22 | Args: 23 | premise_batch: A batch of sequences of vectors representing the 24 | premises in some NLI task. The batch is assumed to have the 25 | size (batch, sequences, vector_dim). 26 | premise_mask: A mask for the sequences in the premise batch, to 27 | ignore padding data in the sequences during the computation of 28 | the attention. 29 | hypothesis_batch: A batch of sequences of vectors representing the 30 | hypotheses in some NLI task. The batch is assumed to have the 31 | size (batch, sequences, vector_dim). 32 | hypothesis_mask: A mask for the sequences in the hypotheses batch, 33 | to ignore padding data in the sequences during the computation 34 | of the attention. 35 | 36 | Returns: 37 | attended_premises: The sequences of attention vectors for the 38 | premises in the input batch. 39 | attended_hypotheses: The sequences of attention vectors for the 40 | hypotheses in the input batch. 41 | """ 42 | # Dot product between premises and hypotheses in each sequence of 43 | # the batch. 44 | similarity_matrix = premise_batch.bmm(hypothesis_batch.transpose(2, 1) 45 | .contiguous()) 46 | 47 | # Softmax attention weights. 48 | prem_hyp_attn = masked_softmax(similarity_matrix, hypothesis_mask) 49 | hyp_prem_attn = masked_softmax(similarity_matrix.transpose(1, 2) 50 | .contiguous(), 51 | premise_mask) 52 | 53 | # Weighted sums of the hypotheses for the the premises attention, 54 | # and vice-versa for the attention of the hypotheses. 
55 | attended_premises = weighted_sum(hypothesis_batch, 56 | prem_hyp_attn, 57 | premise_mask) 58 | attended_hypotheses = weighted_sum(premise_batch, 59 | hyp_prem_attn, 60 | hypothesis_mask) 61 | 62 | return attended_premises, attended_hypotheses 63 | 64 | 65 | # Code widely inspired from: 66 | # https://github.com/allenai/allennlp/blob/master/allennlp/nn/util.py. 67 | def masked_softmax(tensor, mask): 68 | """ 69 | Apply a masked softmax on the last dimension of a tensor. 70 | The input tensor and mask should be of size (batch, *, sequence_length). ## 这与常规的算法不同啊 71 | 72 | Args: 73 | tensor: The tensor on which the softmax function must be applied along 74 | the last dimension. 75 | mask: A mask of the same size as the tensor with 0s in the positions of 76 | the values that must be masked and 1s everywhere else. 77 | 78 | Returns: 79 | A tensor of the same size as the inputs containing the result of the 80 | softmax. 81 | """ 82 | tensor_shape = tensor.size() 83 | reshaped_tensor = tensor.view(-1, tensor_shape[-1]) 84 | 85 | # Reshape the mask so it matches the size of the input tensor. 86 | while mask.dim() < tensor.dim(): 87 | mask = mask.unsqueeze(1) 88 | mask = mask.expand_as(tensor).contiguous().float() 89 | reshaped_mask = mask.view(-1, mask.size()[-1]) 90 | 91 | result = nn.functional.softmax(reshaped_tensor * reshaped_mask, dim=-1) 92 | result = result * reshaped_mask 93 | # 1e-13 is added to avoid divisions by zero. 94 | result = result / (result.sum(dim=-1, keepdim=True) + 1e-13) 95 | 96 | return result.view(*tensor_shape) 97 | 98 | ### copy from ESIM 99 | # Code widely inspired from: 100 | # https://github.com/allenai/allennlp/blob/master/allennlp/nn/util.py. 101 | def weighted_sum(tensor, weights, mask): 102 | """ 103 | Apply a weighted sum on the vectors along the last dimension of 'tensor', 104 | and mask the vectors in the result with 'mask'. 105 | 106 | Args: 107 | tensor: A tensor of vectors on which a weighted sum must be applied. 108 | weights: The weights to use in the weighted sum. 109 | mask: A mask to apply on the result of the weighted sum. 110 | 111 | Returns: 112 | A new tensor containing the result of the weighted sum after the mask 113 | has been applied on it. 
114 | """ 115 | weighted_sum = weights.bmm(tensor) 116 | 117 | while mask.dim() < weighted_sum.dim(): 118 | mask = mask.unsqueeze(1) 119 | mask = mask.transpose(-1, -2) 120 | mask = mask.expand_as(weighted_sum).contiguous().float() 121 | 122 | return weighted_sum * mask 123 | 124 | 125 | # a = torch.rand((2,2,3)) 126 | # print(a) 127 | # b = torch.ones((2,2)) 128 | # # b[0][0][2] = 0 129 | # b[0][1] = 0 130 | # print(b) 131 | 132 | # atten = SoftmaxAttention() 133 | # ## batch,*,seq_len 134 | # # a = a.transpose(1,2) 135 | # # b = b.transpose() 136 | # t1,t2 = atten(a,b,a,b) 137 | 138 | # print(t1.size()) 139 | # print(t1) 140 | # print(t2.size()) 141 | # print(t2) -------------------------------------------------------------------------------- /text_matching/sentence-bert/SBERT.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from transformers import AutoTokenizer,AutoModel 5 | 6 | class SBERT(nn.Module): 7 | 8 | def __init__(self, 9 | bert_path='/data1/zsp/PreTrainModelStorage/self_pretrained_bert_11G/' 10 | ): 11 | super().__init__() 12 | 13 | ## embedding 14 | self.embedding = AutoModel.from_pretrained(bert_path) 15 | # for i,(name,para) in enumerate(self.embedding.named_parameters()): 16 | # para.requires_grad = False ## 冻结作为词向量 17 | # print(i,name,para.requires_grad) 18 | self.metric = nn.CosineSimilarity(dim=1, eps=1e-6) 19 | self.hidden_size = 768 20 | 21 | self.fc = nn.Linear(self.hidden_size * 3, 2) 22 | 23 | def forward(self, 24 | premises, 25 | premise_mask, 26 | premise_seg_ids, 27 | hypotheses, 28 | hypotheses_mask, 29 | hypo_seg_ids, 30 | inference=False 31 | ): 32 | embedded_premises = self.embedding(premises,premise_mask,premise_seg_ids) 33 | embedded_premises = embedded_premises[0] 34 | 35 | embedded_hypotheses = self.embedding(hypotheses,hypotheses_mask,hypo_seg_ids) 36 | embedded_hypotheses = embedded_hypotheses[0] 37 | 38 | sen_a_len, sen_b_len = (premise_mask != 0).sum(dim=1, keepdim=True), (hypotheses_mask != 0).sum(dim=1, keepdim=True) 39 | sen_a_pooling, sen_b_pooling = embedded_premises.sum(dim=1) / sen_a_len, embedded_hypotheses.sum(dim=1) / sen_b_len 40 | 41 | if inference: 42 | # sen_a_norm = torch.norm(sen_a_pooling, dim=1) 43 | # sen_b_norm = torch.norm(sen_b_pooling, dim=1) 44 | # similarity = (sen_a_pooling * sen_b_pooling).sum(dim=1) / (sen_a_norm * sen_b_norm) 45 | similarity = F.cosine_similarity(sen_a_pooling, sen_b_pooling, dim=1) 46 | return similarity 47 | 48 | hidden = torch.cat([sen_a_pooling, sen_b_pooling, torch.abs(sen_a_pooling - sen_b_pooling)], dim=1) 49 | 50 | return self.fc(hidden) 51 | 52 | 53 | if __name__=='__main__': 54 | tokenizer = AutoTokenizer.from_pretrained('/data1/zsp/PreTrainModelStorage/self_pretrained_bert_11G/') 55 | a = tokenizer(['我喜欢北京']) 56 | input_ids = a['input_ids'] 57 | seg_ids = a['token_type_ids'] 58 | atten_mask = a['attention_mask'] 59 | 60 | b = tokenizer(['另外一个不相关的句子']) 61 | binput_ids = a['input_ids'] 62 | bseg_ids = a['token_type_ids'] 63 | batten_mask = a['attention_mask'] 64 | # seg_ids 65 | # print(type(a)) 66 | # print(a) 67 | emodel = SBERT() 68 | logits = emodel(torch.tensor(input_ids),torch.tensor(atten_mask),torch.tensor(seg_ids), 69 | torch.tensor(binput_ids),torch.tensor(batten_mask),torch.tensor(bseg_ids) 70 | ) 71 | 72 | print(logits) 73 | -------------------------------------------------------------------------------- /tiny_transformer/configuration.py: 
-------------------------------------------------------------------------------- 1 | class ModelConfig(): 2 | 3 | model_type = "transformer" 4 | 5 | def __init__( 6 | self, 7 | vocab_size=20, 8 | max_position_embeddings=20, 9 | encoder_layer_nums=6, 10 | decoder_layer_nums=6, 11 | num_attention_heads=8, 12 | hidden_size=512, 13 | intermediate_size=1024, 14 | hidden_dropout_prob=0.1, 15 | attention_probs_dropout_prob=0.1, 16 | type_vocab_size=2, 17 | initializer_range=0.02, 18 | layer_norm_eps=1e-6, 19 | pad_token_id=0, 20 | **kwargs, 21 | ): 22 | super().__init__(**kwargs) 23 | 24 | self.vocab_size = vocab_size 25 | self.max_position_embeddings = max_position_embeddings 26 | self.encoder_layer_nums = encoder_layer_nums 27 | self.decoder_layer_nums = decoder_layer_nums 28 | self.num_attention_heads = num_attention_heads 29 | self.hidden_size = hidden_size 30 | self.intermediate_size = intermediate_size 31 | self.hidden_dropout_prob = hidden_dropout_prob 32 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 33 | self.type_vocab_size = type_vocab_size 34 | self.initializer_range = initializer_range 35 | self.layer_norm_eps = layer_norm_eps 36 | self.pad_token_id = pad_token_id 37 | -------------------------------------------------------------------------------- /tiny_transformer/dataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import random 4 | 5 | import numpy as np 6 | import torch 7 | from torch.utils.data import Dataset, DataLoader 8 | 9 | from train_data import sentence_pairs, vocab2id, Tokenizer 10 | from utils import subsequent_mask 11 | 12 | tok = Tokenizer(vocab2id=vocab2id) 13 | 14 | class TrainDatasets(Dataset): 15 | def __init__(self) -> None: 16 | self.sentence_pairs = sentence_pairs 17 | self.train_data = [] 18 | for src, tgt in self.sentence_pairs: 19 | self.train_data.append([tok.encode(src), tok.encode(tgt)]) 20 | 21 | def __len__(self): 22 | return len(self.sentence_pairs) 23 | 24 | def __getitem__(self, i): 25 | return self.train_data[i] 26 | 27 | 28 | def make_std_mask(tgt, pad): 29 | "Create a mask to hide padding and future words." 
30 |     tgt_mask = (tgt != pad).unsqueeze(-2)
31 |     tgt_mask = tgt_mask & subsequent_mask(tgt.size(-1)).type_as(
32 |         tgt_mask.data
33 |     )
34 |     return tgt_mask
35 | 
36 | def data_collator(batch, max_length=20):
37 |     src = []
38 |     tgt = []
39 |     s_max_len = max([len(i[0]) for i in batch])
40 |     t_max_len = max([len(i[1]) for i in batch])
41 |     for s, t in batch:
42 |         s = s + [tok.padding_token_id] * (s_max_len - len(s))
43 |         src.append(s)
44 | 
45 |         t = t + [tok.padding_token_id] * (t_max_len - len(t))
46 |         tgt.append(t)
47 | 
48 |     src = torch.LongTensor(src)
49 |     src_mask = (src != tok.padding_token_id).unsqueeze(-2)
50 |     tgt = torch.LongTensor(tgt)
51 |     label = tgt[:, 1:]   # labels must be sliced before the decoder input is trimmed
52 |     tgt = tgt[:, :-1]    # decoder input drops the last token (teacher forcing)
53 |     tgt_mask = make_std_mask(tgt, tok.padding_token_id)
54 | 
55 |     return {
56 |         "src": src,
57 |         "tgt": tgt,
58 |         "src_mask": src_mask,
59 |         "tgt_mask": tgt_mask,
60 |         "label": label
61 |     }
62 | 
63 | 
64 | if __name__ == "__main__":
65 |     train_dataset = TrainDatasets()
66 |     train_dataloader = DataLoader(
67 |         dataset=train_dataset,
68 |         collate_fn=data_collator,
69 |         batch_size=2
70 |     )
71 | 
72 |     for batch in train_dataloader:
73 |         print(batch)
74 |         break
--------------------------------------------------------------------------------
/tiny_transformer/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | import torch.nn.functional as F
4 | import copy
5 | import math
6 | import numpy as np
7 | import pandas as pd
8 | 
9 | from configuration import ModelConfig
10 | 
11 | 
12 | def clones(module, N):
13 |     "Produce N identical layers."
14 |     return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
15 | 
16 | 
17 | """
18 | Six basic building blocks:
19 |     token embedding, positional encoding, multi-head attention, feed-forward network, layer norm, residual connection
20 | Three composite modules:
21 |     encoder, decoder, generator
22 | Transformer = encoder + decoder + generator
23 | """
24 | 
25 | 
26 | class Embedding(nn.Module):
27 |     def __init__(self, vocab_size, d_model) -> None:
28 |         super(Embedding, self).__init__()
29 |         self.embedding = nn.Embedding(vocab_size, d_model)
30 |         self.d_model = d_model
31 | 
32 |     def forward(self, x):
33 |         return self.embedding(x) * math.sqrt(self.d_model)
34 | 
35 | 
36 | class PositionalEncoding(nn.Module):
37 |     def __init__(self, d_model, dropout=0.1, max_len=5000) -> None:
38 |         super(PositionalEncoding, self).__init__()
39 |         self.dropout = nn.Dropout(dropout)
40 | 
41 |         pe = torch.zeros(max_len, d_model)
42 |         position = torch.arange(0, max_len).unsqueeze(1)
43 |         div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
44 | 
45 |         pe[:, 0::2] = torch.sin(position * div_term)
46 |         pe[:, 1::2] = torch.cos(position * div_term)
47 |         pe = pe.unsqueeze(0)
48 |         self.register_buffer('pe', pe)
49 | 
50 |     def forward(self, x):
51 |         x = x + self.pe[:, :x.size(1)].requires_grad_(False)
52 |         return self.dropout(x)
53 | 
54 | 
55 | class LayerNorm(nn.Module):
56 |     def __init__(self, hidden_size, eps=1e-6) -> None:
57 |         super(LayerNorm, self).__init__()
58 |         self.a_2 = nn.Parameter(torch.ones(hidden_size))
59 |         self.b_2 = nn.Parameter(torch.zeros(hidden_size))
60 |         self.eps = eps
61 | 
62 |     def forward(self, x):
63 |         mean = x.mean(-1, keepdim=True)
64 |         std = x.std(-1, keepdim=True)
65 |         return self.a_2 * (x - mean) / (std + self.eps) + self.b_2
66 | 
67 | 
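# Scaled dot-product attention ("Attention Is All You Need", Eq. 1):
#   Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V
# Dividing the logits by sqrt(d_k) keeps their variance roughly independent of the head
# dimension so the softmax does not saturate; masked positions are filled with -1e9 so
# they receive effectively zero attention weight.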
68 | class ScaleDotProductAttention(nn.Module):
69 |     def __init__(self) -> None:
70 |         super(ScaleDotProductAttention, self).__init__()
71 | 
72 |     def forward(self, query, key, value, mask=None, dropout=None):
73 |         d_k = query.size(-1)
74 |         scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
75 |         if mask is not None:
76 |             scores = scores.masked_fill(mask == 0, -1e9)
77 |         p_attn = F.softmax(scores, dim=-1)
78 |         if dropout is not None:
79 |             p_attn = dropout(p_attn)
80 |         return torch.matmul(p_attn, value), p_attn
81 | 
82 | 
83 | class MultiHeadAttention(nn.Module):
84 |     def __init__(self, config: ModelConfig) -> None:
85 |         super(MultiHeadAttention, self).__init__()
86 |         self.d_model = config.hidden_size
87 |         self.h = config.num_attention_heads
88 | 
89 |         assert self.d_model % self.h == 0
90 |         self.d_k = self.d_model // self.h
91 |         self.linears = clones(nn.Linear(self.d_model, self.d_model), 4)  # q, k, v and output projections (must live in __init__ to be trained)
92 |         self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
93 |         self.attention = ScaleDotProductAttention()
94 | 
95 |     def forward(self, query, key, value, mask=None):
96 |         if mask is not None:
97 |             mask = mask.unsqueeze(1)
98 |         nbatches = query.size(0)
99 | 
100 |         # 1) Do all the linear projections in batch from d_model => h x d_k
101 |         query = self.linears[0](query).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
102 |         key = self.linears[1](key).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
103 |         value = self.linears[2](value).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
104 | 
105 |         x, self.attn = self.attention(query, key, value, mask=mask, dropout=self.dropout)
106 | 
107 |         x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k)
108 | 
109 |         del query, key, value
110 | 
111 |         return self.linears[3](x)
112 | 
113 | 
114 | class PositionwiseFeedForward(nn.Module):
115 |     def __init__(self, config: ModelConfig) -> None:
116 |         super(PositionwiseFeedForward, self).__init__()
117 |         self.w_1 = nn.Linear(config.hidden_size, config.intermediate_size)
118 |         self.w_2 = nn.Linear(config.intermediate_size, config.hidden_size)
119 |         self.dropout = nn.Dropout(config.hidden_dropout_prob)
120 | 
121 |     def forward(self, x):
122 |         return self.w_2(self.dropout(F.relu(self.w_1(x))))
123 | 
124 | 
125 | class ResidualConnection(nn.Module):
126 |     def __init__(self, config: ModelConfig) -> None:
127 |         super(ResidualConnection, self).__init__()
128 |         self.norm = LayerNorm(config.hidden_size)
129 |         self.dropout = nn.Dropout(config.hidden_dropout_prob)
130 | 
131 |     def forward(self, x, sublayer):
132 |         return x + self.dropout(sublayer(self.norm(x)))
133 | 
134 | 
135 | class EncoderLayer(nn.Module):
136 |     def __init__(self, config: ModelConfig) -> None:
137 |         super(EncoderLayer, self).__init__()
138 |         self.self_attn = MultiHeadAttention(config)
139 |         self.feed_forward = PositionwiseFeedForward(config)
140 |         self.sublayer = clones(ResidualConnection(config), 2)
141 | 
142 |     def forward(self, x, mask):
143 |         x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
144 |         return self.sublayer[1](x, self.feed_forward)
145 | 
146 | 
147 | class Encoder(nn.Module):
148 |     def __init__(self, config: ModelConfig) -> None:
149 |         super(Encoder, self).__init__()
150 |         self.encoder_layer = EncoderLayer(config)
151 |         self.layers = clones(self.encoder_layer, config.encoder_layer_nums)
152 |         self.norm = LayerNorm(config.hidden_size)
153 | 
154 |     def forward(self, x, mask):
155 |         for layer in self.layers:
156 |             x = layer(x, mask)
157 |         return self.norm(x)
158 | 
159 | 
160 | class DecoderLayer(nn.Module):
161 |     def __init__(self, config: ModelConfig) -> None:
162 |         super(DecoderLayer, self).__init__()
163 |         self.self_attn = MultiHeadAttention(config)
164 |         self.src_attn = MultiHeadAttention(config)
165 |         self.feed_forward = 
PositionwiseFeedForward(config) 166 | self.sublayer = clones(ResidualConnection(config), 3) 167 | 168 | def forward(self, x, memory, src_mask, tgt_mask): 169 | m = memory 170 | x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask)) 171 | x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask)) 172 | return self.sublayer[2](x, self.feed_forward) 173 | 174 | 175 | class Decoder(nn.Module): 176 | def __init__(self, config: ModelConfig) -> None: 177 | super(Decoder, self).__init__() 178 | self.decoder_layer = DecoderLayer(config) 179 | self.layers = clones(self.decoder_layer, config.decoder_layer_nums) 180 | self.norm = LayerNorm(config.hidden_size) 181 | 182 | def forward(self, x, memory, src_mask, tgt_mask): 183 | for layer in self.layers: 184 | x = layer(x, memory, src_mask, tgt_mask) 185 | return self.norm(x) 186 | 187 | 188 | class Generator(nn.Module): 189 | def __init__(self, config: ModelConfig) -> None: 190 | super(Generator, self).__init__() 191 | self.proj = nn.Linear(config.hidden_size, config.vocab_size) 192 | 193 | def forward(self, x): 194 | return F.log_softmax(self.proj(x), dim=-1) 195 | 196 | 197 | class Transformer(nn.Module): 198 | def __init__(self, config) -> None: 199 | super(Transformer, self).__init__() 200 | self.encoder = Encoder(config) 201 | self.decoder = Decoder(config) 202 | self.src_embed = nn.Sequential( 203 | Embedding(config.vocab_size, config.hidden_size), 204 | PositionalEncoding(config.hidden_size, max_len=config.max_position_embeddings) 205 | ) 206 | self.tgt_embed = nn.Sequential( 207 | Embedding(config.vocab_size, config.hidden_size), 208 | PositionalEncoding(config.hidden_size, max_len=config.max_position_embeddings) 209 | ) 210 | self.generator = Generator(config) 211 | 212 | for p in self.parameters(): 213 | if p.dim() > 1: 214 | torch.nn.init.xavier_uniform_(p) 215 | 216 | def forward(self, src, tgt, src_mask, tgt_mask, label=None): 217 | return self.generator(self.decode(self.encode(src, src_mask), src_mask, tgt, tgt_mask)) 218 | 219 | def encode(self, src, src_mask): 220 | return self.encoder(self.src_embed(src), src_mask) 221 | 222 | def decode(self, memory, src_mask, tgt, tgt_mask): 223 | return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask) 224 | -------------------------------------------------------------------------------- /tiny_transformer/test_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from configuration import ModelConfig 4 | from model import Transformer 5 | 6 | from utils import subsequent_mask 7 | 8 | config = ModelConfig(encoder_layer_nums=2, decoder_layer_nums=2) 9 | print(config.__dict__) 10 | 11 | 12 | def build_model(): 13 | model = Transformer(config) 14 | # This was important from their code. 15 | # Initialize parameters with Glorot / fan_avg. 
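# Note: Transformer.__init__ already applies xavier_uniform_ to every parameter with
# dim > 1, so the loop below re-initializes the same weights a second time. This is
# harmless (it mirrors the reference code this file follows) but not strictly required.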
16 |     for p in model.parameters():
17 |         if p.dim() > 1:
18 |             torch.nn.init.xavier_uniform_(p)
19 | 
20 |     return model
21 | 
22 | 
23 | def inference_test():
24 |     test_model = build_model()
25 |     test_model.eval()
26 |     src = torch.LongTensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
27 |     src_mask = torch.ones(1, 1, 10)
28 |     memory = test_model.encode(src, src_mask)
29 |     ys = torch.zeros(1, 1).type_as(src)
30 |     for i in range(9):
31 |         out = test_model.decode(
32 |             memory, src_mask, ys, subsequent_mask(ys.size(1)).type_as(src.data)
33 |         )
34 |         prob = test_model.generator(out[:, -1])
35 |         _, next_word = torch.max(prob, dim=1)
36 |         next_word = next_word.data[0]
37 |         ys = torch.cat(
38 |             [ys, torch.empty(1, 1).type_as(src.data).fill_(next_word)], dim=1
39 |         )
40 |     print("Example Untrained Model Prediction:", ys)
41 | 
42 | 
43 | def run_tests():
44 |     for _ in range(10):
45 |         inference_test()
46 | 
47 | 
48 | run_tests()
49 | 
--------------------------------------------------------------------------------
/tiny_transformer/tests/dataset.py:
--------------------------------------------------------------------------------
1 | import random
2 | 
3 | import numpy as np
4 | import torch
5 | from torch.utils.data import Dataset, DataLoader
6 | 
7 | # Define the vocabularies
8 | words_x = '<PAD>,1,2,3,4,5,6,7,8,9,0,<SOS>,<EOS>,+'
9 | vocab_x = {word: i for i, word in enumerate(words_x.split(','))}
10 | vocab_xr = [k for k, v in vocab_x.items()]  # reverse lookup (id -> token)
11 | 
12 | words_y = '<PAD>,1,2,3,4,5,6,7,8,9,0,<SOS>,<EOS>'
13 | vocab_y = {word: i for i, word in enumerate(words_y.split(','))}
14 | vocab_yr = [k for k, v in vocab_y.items()]  # reverse lookup (id -> token)
15 | 
16 | # Two-number addition dataset
17 | def get_data():
18 |     # digit vocabulary
19 |     words = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
20 | 
21 |     # sampling probability of each digit
22 |     p = np.array([7, 5, 5, 7, 6, 5, 7, 6, 5, 7])
23 |     p = p / p.sum()
24 | 
25 |     # randomly sample n1 digits as s1
26 |     n1 = random.randint(10, 20)
27 |     s1 = np.random.choice(words, size=n1, replace=True, p=p)
28 |     s1 = s1.tolist()
29 | 
30 |     # randomly sample n2 digits as s2
31 |     n2 = random.randint(10, 20)
32 |     s2 = np.random.choice(words, size=n2, replace=True, p=p)
33 |     s2 = s2.tolist()
34 | 
35 |     # x is the character-level "sum expression" of s1 and s2
36 |     x = s1 + ['+'] + s2
37 | 
38 |     # y is the numerical sum of s1 and s2
39 |     y = int(''.join(s1)) + int(''.join(s2))
40 |     y = list(str(y))
41 | 
42 |     # add start / end tokens
43 |     x = ['<SOS>'] + x + ['<EOS>']
44 |     y = ['<SOS>'] + y + ['<EOS>']
45 | 
46 |     # pad to a fixed length
47 |     x = x + ['<PAD>'] * 50
48 |     y = y + ['<PAD>'] * 51
49 |     x = x[:50]
50 |     y = y[:51]
51 | 
52 |     # encode to token ids
53 |     token_x = [vocab_x[i] for i in x]
54 |     token_y = [vocab_y[i] for i in y]
55 | 
56 |     # convert to tensors
57 |     tensor_x = torch.LongTensor(token_x)
58 |     tensor_y = torch.LongTensor(token_y)
59 |     return tensor_x, tensor_y
60 | 
61 | 
62 | def show_data(tensor_x, tensor_y):
63 |     words_x = "".join([vocab_xr[i] for i in tensor_x.tolist()])
64 |     words_y = "".join([vocab_yr[i] for i in tensor_y.tolist()])
65 |     return words_x, words_y
66 | 
67 | 
68 | # Dataset definition
69 | class TwoSumDataset(torch.utils.data.Dataset):
70 |     def __init__(self, size=100000):
71 |         super().__init__()
72 |         self.size = size
73 | 
74 |     def __len__(self):
75 |         return self.size
76 | 
77 |     def __getitem__(self, i):
78 |         return get_data()
79 | 
80 | ds_train = TwoSumDataset(size=100000)
81 | ds_val = TwoSumDataset(size=10000)
82 | 
83 | 
84 | if __name__ == "__main__":
85 | 
86 |     x, y = get_data()
87 |     print(x, y, "\n")
88 |     print(show_data(x, y))
89 | 
90 |     # Data loaders
91 |     dl_train = DataLoader(dataset=ds_train,
92 |                           batch_size=200,
93 |                           drop_last=True,
94 |                           shuffle=True)
95 | 
96 |     dl_val = DataLoader(dataset=ds_val,
97 |                         batch_size=200,
98 |                         drop_last=True,
99 |                         shuffle=False)
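# Note: x is padded/truncated to length 50 and y to 51, so a seq2seq training loop can
# use y[:, :-1] as the decoder input and y[:, 1:] as the labels; both then have length
# 50 and line up with the 50-token source.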
100 | 
101 |     for src, tgt in dl_train:
102 |         print(src.shape)
103 |         print(tgt.shape)
104 |         break
105 | 
--------------------------------------------------------------------------------
/tiny_transformer/train.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import torch
4 | from torch.optim.lr_scheduler import LambdaLR
5 | 
6 | from configuration import ModelConfig
7 | from model import Transformer
8 | 
9 | from trainer import Trainer, TrainerArgs
10 | 
11 | from dataset import tok, TrainDatasets, data_collator
12 | 
13 | config = ModelConfig()
14 | config.vocab_size = tok.vocab_size
15 | print(config.vocab_size)
16 | model = Transformer(config=config)
17 | optimizer = torch.optim.Adam(
18 |     model.parameters(), lr=0.5, betas=(0.9, 0.98), eps=1e-9
19 | )
20 | def rate(step, model_size, factor, warmup):
21 |     """
22 |     We default the step to 1 for the LambdaLR function
23 |     to avoid zero being raised to a negative power.
24 |     """
25 |     if step == 0:
26 |         step = 1
27 |     return factor * (
28 |         model_size ** (-0.5) * min(step ** (-0.5), step * warmup ** (-1.5))
29 |     )
30 | lr_scheduler = LambdaLR(
31 |     optimizer=optimizer,
32 |     lr_lambda=lambda step: rate(
33 |         step, model_size=model.src_embed[0].d_model, factor=1.0, warmup=400
34 |     ),
35 | )
36 | 
37 | args = TrainerArgs()
38 | trainer = Trainer(
39 |     model=model,
40 |     tokenizer=tok,
41 |     args=args,
42 |     data_collator=data_collator,
43 |     train_dataset=TrainDatasets(),
44 |     optimizers=(optimizer, lr_scheduler))
45 | trainer.train()
--------------------------------------------------------------------------------
/tiny_transformer/train_data.py:
--------------------------------------------------------------------------------
1 | # Translation sentence pairs (replaced below by a synthetic digit copy task)
2 | sentence_pairs = [
3 |     ['je pars en vacances pour quelques jours .', 'i m taking a couple of days off .'],
4 |     ['je ne me panique pas .', 'i m not panicking .'],
5 |     ['je recherche un assistant .', 'i am looking for an assistant .'],
6 |     ['je suis loin de chez moi .', 'i m a long way from home .'],
7 |     ['vous etes en retard .', 'you re very late .'],
8 |     ['j ai soif .', 'i am thirsty .'],
9 |     ['je suis fou de vous .', 'i m crazy about you .'],
10 |     ['vous etes vilain .', 'you are naughty .'],
11 |     ['il est vieux et laid .', 'he s old and ugly .'],
12 |     ['je suis terrifiee .', 'i m terrified .'],
13 | ]
14 | 
15 | import numpy as np
16 | test_data_list = [
17 |     " ".join([str(i) for i in list(np.random.randint(10, size=10))])
18 |     for _ in range(20)
19 | ]
20 | sentence_pairs = [
21 |     [i, i] for i in test_data_list
22 | ]
23 | 
24 | all_words = []
25 | for x, y in sentence_pairs:
26 |     all_words.extend(x.split())
27 |     all_words.extend(y.split())
28 | all_words = sorted(list(set(all_words)))
29 | vocab2id = {word: i for i, word in enumerate(all_words)}
30 | id2vocab = {i: word for word, i in vocab2id.items()}
31 | print(vocab2id)
32 | 
33 | class Tokenizer():
34 |     def __init__(self, vocab2id) -> None:
35 |         self.vocab_size = len(vocab2id) + 1
36 |         self.padding_token_id = len(vocab2id)
37 |         self.vocab2id = vocab2id
38 |         self.id2vocab = {i: word for word, i in self.vocab2id.items()}
39 | 
40 |     def encode(self, sentences):
41 |         ids = [self.vocab2id[w] for w in sentences.split()]
42 |         return ids
43 | 
44 |     def decode(self, ids):
45 |         return " ".join([self.id2vocab[i] for i in ids])
46 | 
47 | if __name__ == "__main__":
48 |     tok = Tokenizer(vocab2id=vocab2id)
49 |     s = sentence_pairs[0][0]
50 |     print(s)
51 |     ids = tok.encode(s)
52 |     print(ids)
53 | 
print(tok.decode(ids)) -------------------------------------------------------------------------------- /tiny_transformer/trainer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import copy 4 | import functools 5 | import os 6 | import sys 7 | import random 8 | import time 9 | from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union 10 | 11 | import numpy as np 12 | import torch 13 | import torch.nn as nn 14 | from torch.utils.data import DataLoader, Dataset, IterableDataset, RandomSampler, SequentialSampler 15 | 16 | 17 | 18 | class LabelSmoothingLoss(nn.Module): 19 | "Implement label smoothing." 20 | def __init__(self, size, padding_idx, smoothing=0.0): 21 | super(LabelSmoothingLoss, self).__init__() 22 | self.criterion = nn.KLDivLoss(reduction="sum") 23 | self.padding_idx = padding_idx 24 | self.confidence = 1.0 - smoothing 25 | self.smoothing = smoothing 26 | self.size = size 27 | self.true_dist = None 28 | 29 | def forward(self, x, target): 30 | assert x.size(1) == self.size 31 | true_dist = x.data.clone() 32 | true_dist.fill_(self.smoothing / (self.size - 2)) 33 | true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence) 34 | true_dist[:, self.padding_idx] = 0 35 | mask = torch.nonzero((target.data == self.padding_idx).int()) 36 | if mask.dim() > 0: 37 | true_dist.index_fill_(0, mask.squeeze(), 0.0) 38 | self.true_dist = true_dist 39 | return self.criterion(x, true_dist) 40 | 41 | 42 | class TrainerArgs: 43 | epochs: int = 10 44 | learning_rate: float = 1e-4 45 | train_batch_size: int = 2 46 | eval_batch_size: int = 2 47 | gradient_accumulation_steps: int = 1 48 | evaluation_steps: int = 1000 49 | logging_steps: int = 1 50 | save_steps: int = 1000 51 | device: str = "cuda" if torch.cuda.is_available() else "cpu" 52 | 53 | 54 | class Trainer: 55 | 56 | def __init__( 57 | self, 58 | model, 59 | tokenizer, 60 | args: TrainerArgs, 61 | data_collator = None, 62 | train_dataset: Optional[Union[Dataset, IterableDataset]] = None, 63 | eval_dataset: Optional[Union[Dataset, IterableDataset]] = None, 64 | loss_fn: nn.Module = None, 65 | optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None) 66 | ): 67 | self.model = model 68 | self.args = args 69 | self.data_collator = data_collator 70 | self.train_dataset = train_dataset 71 | self.eval_dataset = eval_dataset 72 | if loss_fn is None: 73 | self.loss_fn = LabelSmoothingLoss(tokenizer.vocab_size, tokenizer.padding_token_id) 74 | else: 75 | self.loss_fn = loss_fn 76 | self.optimizer, self.lr_scheduler = optimizers 77 | 78 | def get_data_loader(self, dataset, batch_size, shuffle=True) -> DataLoader: 79 | dataloader_params = { 80 | "batch_size": batch_size, 81 | "collate_fn": self.data_collator, 82 | "sampler": RandomSampler(dataset) if shuffle else SequentialSampler(dataset), 83 | } 84 | return DataLoader(dataset, **dataloader_params) 85 | 86 | def compute_loss(self, output, tgt): 87 | if self.loss_fn is not None: 88 | return self.loss_fn(output, tgt) 89 | else: 90 | return nn.CrossEntropyLoss()(output, tgt) 91 | 92 | def training_step(self, model, batch): 93 | model.train() 94 | output = model.forward(**batch) 95 | output = output.reshape(-1, output.size(-1)) 96 | labels = batch["label"].reshape(-1) 97 | loss = self.compute_loss(output, labels) 98 | print(loss) 99 | 100 | del batch 101 | torch.cuda.empty_cache() 102 | 103 | loss.backward() 104 | 105 | return loss.detach() / 
self.args.gradient_accumulation_steps
106 | 
107 |     def evaluation_loop(self, model, dataloader):
108 |         model.eval()
109 | 
110 |         total_loss = 0
111 |         for step, batch in enumerate(dataloader):
112 |             with torch.no_grad():
113 |                 output = model.forward(**batch)
114 |                 loss = self.compute_loss(output.reshape(-1, output.size(-1)), batch["label"].reshape(-1))
115 |             total_loss += loss.item()
116 | 
117 |             del batch
118 |             torch.cuda.empty_cache()
119 | 
120 |         return total_loss / len(dataloader)
121 | 
122 |     def train(self):
123 | 
124 |         start_time = time.time()
125 |         tr_loss = torch.tensor(0.0)  # keep on CPU: the model and batches are never moved to args.device
126 |         total_steps = 0
127 |         self.model.zero_grad()
128 | 
129 |         self.train_dataloader = self.get_data_loader(self.train_dataset, self.args.train_batch_size)
130 |         if self.eval_dataset is not None:
131 |             self.eval_dataloader = self.get_data_loader(self.eval_dataset, self.args.eval_batch_size, shuffle=False)
132 | 
133 |         for epoch in range(self.args.epochs):
134 |             for batch in self.train_dataloader:
135 |                 tr_loss_step = self.training_step(self.model, batch)
136 |                 tr_loss += tr_loss_step
137 | 
138 |                 if (total_steps + 1) % self.args.gradient_accumulation_steps == 0:
139 |                     self.optimizer.step()
140 |                     self.optimizer.zero_grad()
141 |                     self.lr_scheduler.step()
142 |                     self.model.zero_grad()
143 | 
144 |                 total_steps += 1
145 | 
146 |                 if total_steps % self.args.logging_steps == 0:
147 |                     train_loss = tr_loss.item() / total_steps
148 |                     print(f"Epoch {epoch} | Steps {total_steps} | Loss: {train_loss} | Time: {time.time() - start_time}")
149 | 
150 |                 if self.eval_dataset and total_steps % self.args.evaluation_steps == 0:
151 |                     eval_loss = self.evaluation_loop(self.model, self.eval_dataloader)
152 |                     print(f"Validation Loss: {eval_loss} | Time: {time.time() - start_time}")
153 | 
--------------------------------------------------------------------------------
/tiny_transformer/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | 
3 | def subsequent_mask(size):
4 |     "Mask out subsequent positions."
5 |     attn_shape = (1, size, size)
6 |     subsequent_mask = torch.triu(torch.ones(attn_shape), diagonal=1).type(
7 |         torch.uint8
8 |     )
9 |     return subsequent_mask == 0
--------------------------------------------------------------------------------
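The masking helpers above are easiest to understand with a concrete tensor in hand. Below is a minimal sketch (not part of the repo) that mirrors make_std_mask from tiny_transformer/dataset.py; it assumes it is run from the tiny_transformer/ directory and that 0 stands in for the padding id (in the real pipeline the id comes from Tokenizer.padding_token_id).

import torch

from utils import subsequent_mask  # tiny_transformer/utils.py

# One 5-token target sequence whose last two positions are padding (id 0 assumed).
tgt = torch.LongTensor([[4, 7, 2, 0, 0]])

pad_mask = (tgt != 0).unsqueeze(-2)              # (1, 1, 5): True where a real token sits
causal = subsequent_mask(tgt.size(-1))           # (1, 5, 5): lower-triangular True matrix
tgt_mask = pad_mask & causal.type_as(pad_mask)   # (1, 5, 5): row i attends to non-pad j <= i

print(tgt_mask.int())
# row 0 -> [1, 0, 0, 0, 0]
# row 2 -> [1, 1, 1, 0, 0]   (sees itself and earlier real tokens, never the padding)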