├── .gitignore ├── NLP └── word2vec │ └── word2vec.py ├── README.md ├── content_embedding └── bert_whitening │ └── all_utils.py ├── halite-banner.gif ├── inference └── onnxruntime_cpp │ ├── bert_onnx_cpp_test.cpp │ └── bert_onxx_test.ipynb ├── notes ├── CS224N-2019 │ ├── CS224N-01-Introduction-and-Word-Vectors.md │ └── img │ │ ├── 2020-06-12-12-42-12.png │ │ ├── 2020-06-12-13-22-31.png │ │ ├── 2020-06-12-13-24-36.png │ │ ├── 2020-06-12-15-11-11.png │ │ ├── 2020-06-12-15-37-09.png │ │ ├── 2020-06-12-15-42-22.png │ │ ├── 微信图片_20200612171355.jpg │ │ └── 微信截图_20200612183129.png └── Word2Vec学习笔记(CS224N笔记及相关论文学习).md ├── text_classification ├── examples │ └── test_demo.py ├── models │ ├── BaseModel.py │ ├── FastText.py │ ├── TextBiLSTM.py │ ├── TextCNN.py │ ├── TextRCNN.py │ ├── TextRNN.py │ └── __init__.py ├── online │ └── utils │ │ ├── ckpt2pb.py │ │ └── ckpt2save.py ├── text_data │ └── raw_data │ │ ├── test.txt │ │ └── train.csv └── utils │ ├── __init__.py │ ├── data_helper.py │ └── generate_w2v.py ├── text_matching ├── esim │ ├── ESIM.py │ └── SoftAttention.py └── sentence-bert │ └── SBERT.py └── tiny_transformer ├── configuration.py ├── dataset.py ├── model.py ├── test_model.py ├── tests └── dataset.py ├── train.py ├── train_data.py ├── trainer.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .ipynb_checkpoints 3 | *.pyc 4 | __pycache__/ 5 | .idea/ 6 | *.zip 7 | data/ 8 | checkpoints/ 9 | input_data/ 10 | w2v_model/ 11 | model_save/ 12 | pb_model/ 13 | save_model/ 14 | 15 | # Byte-compiled / optimized / DLL files 16 | __pycache__/ 17 | *.py[cod] 18 | *$py.class 19 | 20 | # C extensions 21 | *.so 22 | 23 | # Distribution / packaging 24 | .Python 25 | build/ 26 | develop-eggs/ 27 | dist/ 28 | downloads/ 29 | eggs/ 30 | .eggs/ 31 | lib/ 32 | lib64/ 33 | parts/ 34 | sdist/ 35 | var/ 36 | wheels/ 37 | pip-wheel-metadata/ 38 | share/python-wheels/ 39 | *.egg-info/ 40 | .installed.cfg 41 | *.egg 42 | MANIFEST 43 | 44 | # PyInstaller 45 | # Usually these files are written by a python script from a template 46 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 47 | *.manifest 48 | *.spec 49 | 50 | # Installer logs 51 | pip-log.txt 52 | pip-delete-this-directory.txt 53 | 54 | # Unit test / coverage reports 55 | htmlcov/ 56 | .tox/ 57 | .nox/ 58 | .coverage 59 | .coverage.* 60 | .cache 61 | nosetests.xml 62 | coverage.xml 63 | *.cover 64 | *.py,cover 65 | .hypothesis/ 66 | .pytest_cache/ 67 | 68 | # Translations 69 | *.mo 70 | *.pot 71 | 72 | # Django stuff: 73 | *.log 74 | local_settings.py 75 | db.sqlite3 76 | db.sqlite3-journal 77 | 78 | # Flask stuff: 79 | instance/ 80 | .webassets-cache 81 | 82 | # Scrapy stuff: 83 | .scrapy 84 | 85 | # Sphinx documentation 86 | docs/_build/ 87 | 88 | # PyBuilder 89 | target/ 90 | 91 | # Jupyter Notebook 92 | .ipynb_checkpoints 93 | 94 | # IPython 95 | profile_default/ 96 | ipython_config.py 97 | 98 | # pyenv 99 | .python-version 100 | 101 | # pipenv 102 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 103 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 104 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 105 | # install all needed dependencies. 106 | #Pipfile.lock 107 | 108 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 109 | __pypackages__/ 110 | 111 | # Celery stuff 112 | celerybeat-schedule 113 | celerybeat.pid 114 | 115 | # SageMath parsed files 116 | *.sage.py 117 | 118 | # Environments 119 | .env 120 | .venv 121 | env/ 122 | venv/ 123 | ENV/ 124 | env.bak/ 125 | venv.bak/ 126 | 127 | # Spyder project settings 128 | .spyderproject 129 | .spyproject 130 | 131 | # Rope project settings 132 | .ropeproject 133 | 134 | # mkdocs documentation 135 | /site 136 | 137 | # mypy 138 | .mypy_cache/ 139 | .dmypy.json 140 | dmypy.json 141 | 142 | # Pyre type checker 143 | .pyre/ -------------------------------------------------------------------------------- /NLP/word2vec/word2vec.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zspo/NLP-Space/1c9891be0f44cf618d5b7fc1be2b03e12ce7fc24/NLP/word2vec/word2vec.py -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | ![from kaggle](halite-banner.gif) 4 |

NLP-Space

5 |

Papers read, learning notes, and some code

6 | 7 |
8 | 9 | --- 10 | 11 | ### Papers 12 | 13 | | Model | Title | Resources | Remarks | 14 | |-------|----------|------------|------| 15 | |Word2Vec|Efficient Estimation of Word Representations in Vector Space|[[paper]](https://arxiv.org/pdf/1301.3781.pdf)|------| 16 | |negative sampling|Distributed Representations of Words and Phrases and their Compositionality|[[paper]](https://arxiv.org/abs/1310.4546)|------| 17 | |Transformer|Attention Is All You Need|[[paper]](https://arxiv.org/abs/1706.03762)|Google2017| 18 | |Bert|Pre-training of Deep Bidirectional Transformers for Language Understanding|[[paper]](https://arxiv.org/abs/1810.04805)|Google2018| 19 | 20 | 21 | ### Learning-Notes 22 | 23 | [【Stanford CS224N Study Notes】01 - Introduction and Word Vectors](https://zhuanlan.zhihu.com/p/147889351) 24 | [Word2Vec Study Notes (SVD and derivation of the objective)](https://zhuanlan.zhihu.com/p/148779268) 25 | 26 | 27 | ### Text Classification 28 | * [x] Utils 29 | * [x] [generate_w2v](./text_classification/utils/generate_w2v.py): train word embeddings using gensim (see the sketch after this checklist). 30 | * [x] [data_helper](./text_classification/utils/data_helper.py): load the datasets, clean the data, and split them into train and validation sets. 31 | * [x] [BaseModel](./text_classification/models/BaseModel.py): a base model, including parameter initialization, embedding initialization, the loss function and accuracy, and base APIs such as compile, fit, and predict. 32 | * [x] [FastText](./text_classification/models/FastText.py) 33 | * [x] [TextCNN](./text_classification/models/TextCNN.py) 34 | * [x] [TextRNN](./text_classification/models/TextRNN.py) 35 | * [x] [TextBiLSTM](./text_classification/models/TextBiLSTM.py) 36 | * [ ] [TextRCNN](./text_classification/models/TextRCNN.py) 37 | * [ ] HAN 38 | * [ ] BiLSTM+Attention 39 | * [ ] Transformer 40 | * [ ] ...
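A minimal sketch of the `generate_w2v` flow above, with a toy corpus and hypothetical hyperparameters (gensim 4.x API; older gensim versions use `size=` and `iter=` instead of `vector_size=` and `epochs=`):

```python
from gensim.models import Word2Vec

# Toy tokenized corpus; in the repo this would come from the cleaned training texts.
sentences = [["i", "enjoy", "flying"], ["i", "like", "nlp"], ["i", "like", "deep", "learning"]]

model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=1, epochs=10)
model.wv.save_word2vec_format("w2v.txt", binary=False)  # save embeddings in word2vec text format
vec = model.wv["nlp"]                                   # 100-dimensional vector for one token
```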
41 | 42 | ### NER 43 | 44 | * [ ] BiLSTM+CRF 45 | * [ ] Bert+CRF 46 | * [ ] Bert+BiLSTM+CRF 47 | 48 | ### Content Embedding 49 | 50 | * [x] Bert-Whitening 51 | * [x] Sentence-Bert 52 | * [x] SimCSE 53 | * [ ] ESimCSE 54 | 55 | ### Text Matching 56 | 57 | * [ ] Siamese LSTM 58 | * [ ] DSSM 59 | * [x] ESIM 60 | * [ ] DIIN 61 | 62 | ### Text Generation 63 | 64 | * [ ] 65 | 66 | ### Inference 67 | 68 | * [x] ONNX (OnnxRuntime by CPP) 69 | * [ ] TensorRT -------------------------------------------------------------------------------- /content_embedding/bert_whitening/all_utils.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import os 3 | import sys 4 | import torch 5 | import numpy as np 6 | from transformers import BertModel, BertTokenizer 7 | from tqdm import tqdm 8 | import scipy.stats 9 | import pickle 10 | import requests 11 | 12 | 13 | DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 14 | 15 | 16 | def build_model(name): 17 | tokenizer = BertTokenizer.from_pretrained(name) 18 | model = BertModel.from_pretrained(name) 19 | model = model.to(DEVICE) 20 | return tokenizer, model 21 | 22 | 23 | def sent_to_vec(sent, tokenizer, model, pooling, max_length): 24 | with torch.no_grad(): 25 | inputs = tokenizer(sent, return_tensors="pt", padding=True, truncation=True, max_length=max_length) 26 | inputs['input_ids'] = inputs['input_ids'].to(DEVICE) 27 | inputs['token_type_ids'] = inputs['token_type_ids'].to(DEVICE) 28 | inputs['attention_mask'] = inputs['attention_mask'].to(DEVICE) 29 | 30 | # hidden_states = model(**inputs, return_dict=True, output_hidden_states=True).hidden_states 31 | outputs = model(**inputs, output_hidden_states=True) 32 | hidden_states = outputs[2] 33 | 34 | if pooling == 'first_last_avg': 35 | output_hidden_state = (hidden_states[-1] + hidden_states[1]).mean(dim=1) 36 | elif pooling == 'last_avg': 37 | output_hidden_state = (hidden_states[-1]).mean(dim=1) 38 | elif pooling == 'last2avg': 39 | output_hidden_state = (hidden_states[-1] + hidden_states[-2]).mean(dim=1) 40 | elif pooling == 'cls': 41 | # output_hidden_state = (hidden_states[-1])[:, 0, :] 42 | output_hidden_state = outputs[1] 43 | else: 44 | raise Exception("unknown pooling {}".format(POOLING)) 45 | 46 | vec = output_hidden_state.cpu().numpy() 47 | return vec 48 | 49 | 50 | def sents_to_vecs(sents, tokenizer, model, pooling, max_length, batch_size=64): 51 | vecs = [] 52 | if batch_size: 53 | for i in tqdm(range(int(len(sents) / batch_size)+1)): 54 | m, n = i*batch_size, (i+1)*batch_size 55 | sent = sents[m:n] 56 | vec = sent_to_vec(sent, tokenizer, model, pooling, max_length) 57 | vecs.append(vec) 58 | vecs = np.concatenate(vecs) 59 | assert len(sents) == vecs.shape[0] 60 | else: 61 | for sent in tqdm(sents): 62 | vec = sent_to_vec(sent, tokenizer, model, pooling, max_length) 63 | vecs.append(vec[0]) 64 | assert len(sents) == len(vecs) 65 | vecs = np.array(vecs) 66 | return vecs 67 | 68 | 69 | def calc_spearmanr_corr(x, y): 70 | return scipy.stats.spearmanr(x, y).correlation 71 | 72 | 73 | # def compute_kernel_bias(vecs): 74 | # """计算kernel和bias 75 | # 最后的变换:y = (x + bias).dot(kernel) 76 | # """ 77 | # vecs = np.concatenate(vecs, axis=0) 78 | # mu = vecs.mean(axis=0, keepdims=True) 79 | # cov = np.cov(vecs.T) 80 | # u, s, vh = np.linalg.svd(cov) 81 | # W = np.dot(u, np.diag(1/np.sqrt(s))) 82 | # return W, -mu 83 | 84 | 85 | def compute_kernel_bias(vecs, n_components): 86 | """计算kernel和bias 87 | 最后的变换:y = (x + bias).dot(kernel) 88 | """ 
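    # The SVD below factorizes cov = U * diag(s) * U^T. The kernel computed here
    # equals U * diag(s ** -0.5) truncated to its first n_components columns, so
    # y = (x + bias).dot(kernel) = (x - mu) @ W has identity covariance on the
    # kept components (whitening) while also reducing the dimension.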
89 | vecs = np.concatenate(vecs, axis=0) 90 | mu = vecs.mean(axis=0, keepdims=True) 91 | cov = np.cov(vecs.T) 92 | u, s, vh = np.linalg.svd(cov) 93 | W = np.dot(u, np.diag(s**0.5)) 94 | W = np.linalg.inv(W.T) 95 | W = W[:, :n_components] 96 | return W, -mu 97 | 98 | 99 | def save_whiten(path, kernel, bias): 100 | whiten = { 101 | 'kernel': kernel, 102 | 'bias': bias 103 | } 104 | with open(path, 'wb') as f: 105 | pickle.dump(whiten, f) 106 | return path 107 | 108 | 109 | def load_whiten(path): 110 | with open(path, 'rb') as f: 111 | whiten = pickle.load(f) 112 | kernel = whiten['kernel'] 113 | bias = whiten['bias'] 114 | return kernel, bias 115 | 116 | 117 | def transform_and_normalize(vecs, kernel, bias): 118 | """应用变换,然后标准化 119 | """ 120 | if not (kernel is None or bias is None): 121 | vecs = (vecs + bias).dot(kernel) 122 | return normalize(vecs) 123 | 124 | 125 | def normalize(vecs): 126 | """标准化 127 | """ 128 | return vecs / (vecs**2).sum(axis=1, keepdims=True)**0.5 129 | -------------------------------------------------------------------------------- /halite-banner.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zspo/NLP-Space/1c9891be0f44cf618d5b7fc1be2b03e12ce7fc24/halite-banner.gif -------------------------------------------------------------------------------- /inference/onnxruntime_cpp/bert_onnx_cpp_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #define USE_CPU // Chnage USE_CPU to USE_CUDA 8 | 9 | #ifdef USE_CUDA 10 | #include "cuda_provider_factory.h" 11 | #endif // CUDA GPU Enabled 12 | 13 | // export LD_LIBRARY_PATH=${}/onnxruntime-linux-x64-1.9.0/lib:$LD_LIBRARY_PATH 14 | // export LD_LIBRARY_PATH=${}/onnxruntime-linux-x64-gpu-1.9.0/lib:$LD_LIBRARY_PATH 15 | // g++ a.cpp -o a ${}/onnxruntime-linux-x64-1.9.0/lib/libonnxruntime.so.1.9.0 -I ${}/onnxruntime-linux-x64-1.9.0/include/ -std=c++11 16 | // g++ a.cpp -o a ${}/onnxruntime-linux-x64-gpu-1.9.0/lib/libonnxruntime.so.1.9.0 -I ${}/onnxruntime-linux-x64-gpu-1.9.0/include/ -std=c++11 17 | 18 | 19 | int main() { 20 | int round = 1000; 21 | std::cout << round << std::endl; 22 | Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "test"); 23 | Ort::SessionOptions session_options; 24 | // session_options.SetIntraOpNumThreads(1); 25 | // session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED); 26 | 27 | #ifdef USE_CUDA 28 | Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); 29 | #endif // CUDA GPU Enabled 30 | 31 | const char* model_path = "./mybert.onnx"; 32 | Ort::Session session(env, model_path, session_options); 33 | 34 | //// print model input layer (node names, types, shape etc.) 
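    // One way to do what the comment above describes, assuming the ONNX Runtime 1.x
    // C++ API already used in this file (a sketch; names and shapes depend on the model):
    {
        Ort::AllocatorWithDefaultOptions meta_allocator;
        for (size_t i = 0; i < session.GetInputCount(); ++i) {
            char* in_name = session.GetInputName(i, meta_allocator);
            Ort::TypeInfo type_info = session.GetInputTypeInfo(i);
            auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
            std::vector<int64_t> in_shape = tensor_info.GetShape();  // -1 marks a dynamic axis
            std::cout << "input " << i << ": " << in_name << ", rank " << in_shape.size() << std::endl;
        }
    }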
35 | Ort::AllocatorWithDefaultOptions allocator; 36 | char* output_name = session.GetOutputName(0, allocator); 37 | std::cout << output_name << std::endl; 38 | 39 | std::vector input_node_names = {"input_ids", "token_type_ids", "attention_mask"}; 40 | std::vector output_node_names = {"logits"}; 41 | 42 | // input_ids 43 | std::vector input_ids_dims = {1, 82}; 44 | size_t input_ids_size = 1 * 82; 45 | auto memory_info_1 = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); 46 | // std::vector input_ids_value = {101, 1037, 3899, 2003, 2770, 2006, 3509, 102}; 47 | // std::vector input_ids_value = {101 ,1037 ,3899 ,2003 ,2770 ,2006 ,3509 ,102 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0}; 48 | std::vector input_ids_value = {101, 11724, 8762, 12126, 8168, 150, 8179, 10006, 10600, 10168, 10614, 9738, 9107, 8847, 9479, 11839, 8521, 8361, 10168, 11014, 8217, 9568, 9116, 8809, 9470, 12183, 8877, 9145, 11233, 9428,8134, 11104, 12729, 8913, 11057, 9202, 9374, 8139, 9392, 8154,8231, 8606, 12126, 8168, 150, 8179, 10006, 10600, 8346, 8998,9019, 11685, 8797, 9749, 8675, 10447, 8328, 11399, 9796, 11588,8180, 10091, 9786, 8165, 11399, 10537, 10367, 10242, 8178, 10484,12619, 12465, 10361, 8178, 8343, 9531, 8171, 12280, 8317, 9194,8736, 102}; 49 | Ort::Value input_ids = Ort::Value::CreateTensor(memory_info_1, input_ids_value.data(), input_ids_size, input_ids_dims.data(), 2); 50 | assert(input_ids.IsTensor()); 51 | // token_type_ids 52 | std::vector token_type_ids_dims = {1, 82}; 53 | size_t token_type_ids_size = 1 * 82; 54 | auto memory_info_2 = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); 55 | // std::vector token_type_ids_value = {0, 0, 0, 0, 0, 0, 0, 0}; 56 | std::vector token_type_ids_value; 57 | for (int i = 0; i < 82; ++ i) { 58 | token_type_ids_value.push_back(0); 59 | } 60 | Ort::Value token_type_ids = Ort::Value::CreateTensor(memory_info_2, token_type_ids_value.data(), token_type_ids_size, token_type_ids_dims.data(), 2); 61 | assert(token_type_ids.IsTensor()); 62 | // attention_mask 63 | std::vector attention_mask_dims = {1, 82}; 64 | size_t attention_mask_size = 1 * 82; 65 | auto memory_info_3 = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); 66 | // std::vector attention_mask_value = {1, 1, 1, 1, 1, 1, 1, 1}; 67 | std::vector attention_mask_value; 68 | for (int i = 0; i < 82; ++ i) { 69 | attention_mask_value.push_back(1); 70 | } 71 | Ort::Value attention_mask = Ort::Value::CreateTensor(memory_info_3, attention_mask_value.data(), attention_mask_size, attention_mask_dims.data(), 2); 72 | assert(attention_mask.IsTensor()); 73 | 74 | std::vector ort_inputs; 75 | ort_inputs.push_back(std::move(input_ids)); 76 | ort_inputs.push_back(std::move(token_type_ids)); 77 | ort_inputs.push_back(std::move(attention_mask)); 78 | 79 | // test time 80 | auto begin = 
std::chrono::high_resolution_clock::now(); 81 | for (int i = 0; i < round; ++ i) { 82 | session.Run(Ort::RunOptions{nullptr}, input_node_names.data(), ort_inputs.data(), ort_inputs.size(), output_node_names.data(), 1); 83 | } 84 | auto end = std::chrono::high_resolution_clock::now(); 85 | auto elapsed = std::chrono::duration_cast(end - begin); 86 | printf("time cost: %.3f seconds\n", elapsed.count() * 1e-9); 87 | // auto output_tensors = session.Run(Ort::RunOptions{nullptr}, input_node_names.data(), ort_inputs.data(), ort_inputs.size(), output_node_names.data(), 2); 88 | 89 | // Get pointer to output tensor float values 90 | // auto type_info = output_tensors[1].GetTensorTypeAndShapeInfo(); 91 | // for (auto x: type_info.GetShape()) 92 | // std::cout << "shape " << x << std::endl; 93 | // std::cout << "len " << type_info.GetElementCount() << std::endl; 94 | // float* sequence = output_tensors[0].GetTensorMutableData(); 95 | // float* pooled = output_tensors[1].GetTensorMutableData(); 96 | // for (size_t i = 0; i != type_info.GetElementCount(); ++ i) { 97 | // std::cout << pooled[i] << " "; 98 | // } 99 | // std::cout << pooled[0] << std::endl; 100 | 101 | 102 | return 0; 103 | } -------------------------------------------------------------------------------- /inference/onnxruntime_cpp/bert_onxx_test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import sys\n", 11 | "import pickle\n", 12 | "import time\n", 13 | "import numpy as np\n", 14 | "import pandas as pd\n", 15 | "import random\n", 16 | "\n", 17 | "import torch\n", 18 | "import torch.nn as nn\n", 19 | "import torch.nn.functional as F\n", 20 | "from torch.utils.data import Dataset,DataLoader\n", 21 | "import transformers\n", 22 | "from transformers import BertPreTrainedModel,BertModel,BertForSequenceClassification,BertTokenizer" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "model_path='/Users/zhangsongpo/Downloads/bert-base-chinese'\n", 32 | "max_length = 256" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 3, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "tokenizer = BertTokenizer.from_pretrained(model_path)" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 4, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "class MyBert(nn.Module):\n", 51 | " def __init__(self, num_labels):\n", 52 | " super().__init__()\n", 53 | " self.num_labels = num_labels\n", 54 | "\n", 55 | " self.bert = BertModel.from_pretrained(model_path)\n", 56 | " classifier_dropout = 0.2\n", 57 | " self.dropout = nn.Dropout(classifier_dropout)\n", 58 | " self.classifier = nn.Linear(768, num_labels)\n", 59 | "\n", 60 | " \n", 61 | " def forward(\n", 62 | " self,\n", 63 | " input_ids=None,\n", 64 | " attention_mask=None,\n", 65 | " token_type_ids=None,\n", 66 | " position_ids=None,\n", 67 | " head_mask=None,\n", 68 | " inputs_embeds=None,\n", 69 | " output_attentions=None,\n", 70 | " output_hidden_states=None,\n", 71 | " ):\n", 72 | " outputs = self.bert(\n", 73 | " input_ids,\n", 74 | " attention_mask=attention_mask,\n", 75 | " token_type_ids=token_type_ids,\n", 76 | " position_ids=position_ids,\n", 77 | " head_mask=head_mask,\n", 78 | " inputs_embeds=inputs_embeds,\n", 79 | " 
output_attentions=output_attentions,\n", 80 | " output_hidden_states=output_hidden_states,\n", 81 | " )\n", 82 | "\n", 83 | " pooled_output = outputs[1]\n", 84 | "\n", 85 | " pooled_output = self.dropout(pooled_output)\n", 86 | " logits = self.classifier(pooled_output)\n", 87 | "\n", 88 | " return logits" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 5, 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "name": "stderr", 98 | "output_type": "stream", 99 | "text": [ 100 | "Some weights of the model checkpoint at /Users/zhangsongpo/Downloads/bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']\n", 101 | "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", 102 | "- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" 103 | ] 104 | } 105 | ], 106 | "source": [ 107 | "my_bert = MyBert(num_labels=2)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 6, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "tmp_text = 'guideline hand hygiene health care setting recommendation healthcare infection control practice advisory committee hicpac shea ape idsa hand hygiene task force prepared john boyce md didier pittet md hospital saint raphael new haven connecticut university geneva geneva switzerland material report originate national center infectious disease james hughes md director division healthcare quality promotion steve solomon md acting director summary guideline hand hygiene health care setting health care worker hcw review data regarding handwash hand antisepsi health care setting addition specific recommendation promote improve hand hygiene practice reduce transmission pathogenic microorganism patient personnel health care setting report review study publish cdc guideline garner js favero cdc guideline handwash hospital environmental control infect control ape guideline larson el ape guidelines committee ape guideline handwash hand antisepsi health care setting infect control issue depth review hand hygiene practice hcw level adherence personnel recommend handwash practice factor adverse affecting adherence new study vivo efficacy alcohol base hand rub low incidence dermatitis associate use review recent study demonstrate value multidisciplinary hand hygiene promotion program potential role alcohol base hand rub improve hand hygiene practice summarize recommendation concerning related issue e use surgical hand antiseptic hand lotion cream wearing artificial fingernail part review scientific data regarding hand hygiene guideline hand hygiene health care setting recommendation healthcare infection control practice advisory committee hicpac shea ape idsa hand hygiene task force prepared john boyce md didier pittet md hospital saint raphael new haven connecticut university geneva geneva switzerland material report originate national center infectious disease james hughes md director 
division healthcare quality promotion steve'\n", 117 | "sample_text = []\n", 118 | "for _ in range(100):\n", 119 | " start_index = random.randint(0,200)\n", 120 | " text_len = random.randint(45, 500)\n", 121 | " sample_text.append(tmp_text[start_index:(start_index+text_len)])" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 30, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "inputs = tokenizer(text=tmp_text[:256],\n", 131 | " return_tensors=\"pt\",\n", 132 | " padding=True,\n", 133 | " truncation=True,\n", 134 | " max_length=max_length\n", 135 | " )" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 33, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "data": { 145 | "text/plain": [ 146 | "tensor([[ 101, 11724, 8762, 12126, 8168, 150, 8179, 10006, 10600, 10168,\n", 147 | " 10614, 9738, 9107, 8847, 9479, 11839, 8521, 8361, 10168, 11014,\n", 148 | " 8217, 9568, 9116, 8809, 9470, 12183, 8877, 9145, 11233, 9428,\n", 149 | " 8134, 11104, 12729, 8913, 11057, 9202, 9374, 8139, 9392, 8154,\n", 150 | " 8231, 8606, 12126, 8168, 150, 8179, 10006, 10600, 8346, 8998,\n", 151 | " 9019, 11685, 8797, 9749, 8675, 10447, 8328, 11399, 9796, 11588,\n", 152 | " 8180, 10091, 9786, 8165, 11399, 10537, 10367, 10242, 8178, 10484,\n", 153 | " 12619, 12465, 10361, 8178, 8343, 9531, 8171, 12280, 8317, 9194,\n", 154 | " 8736, 102]])" 155 | ] 156 | }, 157 | "execution_count": 33, 158 | "metadata": {}, 159 | "output_type": "execute_result" 160 | } 161 | ], 162 | "source": [ 163 | "inputs['input_ids']" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 26, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "name": "stdout", 173 | "output_type": "stream", 174 | "text": [ 175 | "0.29541802406311035\n", 176 | "CPU times: user 249 ms, sys: 53.6 ms, total: 303 ms\n", 177 | "Wall time: 296 ms\n" 178 | ] 179 | } 180 | ], 181 | "source": [ 182 | "%%time\n", 183 | "s0 = time.time()\n", 184 | "res = my_bert(**inputs)\n", 185 | "print(time.time() - s0)" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 27, 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "ename": "KeyboardInterrupt", 195 | "evalue": "", 196 | "output_type": "error", 197 | "traceback": [ 198 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 199 | "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 200 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n", 201 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 725\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 726\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 727\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 728\u001b[0m for hook in 
itertools.chain(\n\u001b[1;32m 729\u001b[0m \u001b[0m_global_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 202 | "\u001b[0;32m\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, output_attentions, output_hidden_states)\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0minputs_embeds\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minputs_embeds\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0moutput_attentions\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_attentions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 31\u001b[0;31m \u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 32\u001b[0m )\n\u001b[1;32m 33\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 203 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 725\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 726\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 727\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 728\u001b[0m for hook in itertools.chain(\n\u001b[1;32m 729\u001b[0m \u001b[0m_global_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 204 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/transformers/models/bert/modeling_bert.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 1007\u001b[0m \u001b[0moutput_attentions\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_attentions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1008\u001b[0m \u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1009\u001b[0;31m \u001b[0mreturn_dict\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreturn_dict\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1010\u001b[0m )\n\u001b[1;32m 1011\u001b[0m \u001b[0msequence_output\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mencoder_outputs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 205 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py\u001b[0m in 
\u001b[0;36m_call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 725\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 726\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 727\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 728\u001b[0m for hook in itertools.chain(\n\u001b[1;32m 729\u001b[0m \u001b[0m_global_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 206 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/transformers/models/bert/modeling_bert.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 590\u001b[0m \u001b[0mencoder_attention_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 591\u001b[0m \u001b[0mpast_key_value\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 592\u001b[0;31m \u001b[0moutput_attentions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 593\u001b[0m )\n\u001b[1;32m 594\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 207 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 725\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 726\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 727\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 728\u001b[0m for hook in itertools.chain(\n\u001b[1;32m 729\u001b[0m \u001b[0m_global_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 208 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/transformers/models/bert/modeling_bert.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)\u001b[0m\n\u001b[1;32m 512\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 513\u001b[0m layer_output = 
apply_chunking_to_forward(\n\u001b[0;32m--> 514\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfeed_forward_chunk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchunk_size_feed_forward\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseq_len_dim\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattention_output\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 515\u001b[0m )\n\u001b[1;32m 516\u001b[0m \u001b[0moutputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mlayer_output\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0moutputs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 209 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/transformers/modeling_utils.py\u001b[0m in \u001b[0;36mapply_chunking_to_forward\u001b[0;34m(forward_fn, chunk_size, chunk_dim, *input_tensors)\u001b[0m\n\u001b[1;32m 2359\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput_chunks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdim\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mchunk_dim\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2360\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2361\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mforward_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput_tensors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 210 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/transformers/models/bert/modeling_bert.py\u001b[0m in \u001b[0;36mfeed_forward_chunk\u001b[0;34m(self, attention_output)\u001b[0m\n\u001b[1;32m 524\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mfeed_forward_chunk\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattention_output\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 525\u001b[0m \u001b[0mintermediate_output\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mintermediate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mattention_output\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 526\u001b[0;31m \u001b[0mlayer_output\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutput\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mintermediate_output\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattention_output\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 527\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mlayer_output\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 528\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 211 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 725\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 726\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 727\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 728\u001b[0m for hook in itertools.chain(\n\u001b[1;32m 729\u001b[0m \u001b[0m_global_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 212 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/transformers/models/bert/modeling_bert.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, hidden_states, input_tensor)\u001b[0m\n\u001b[1;32m 437\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 438\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhidden_states\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput_tensor\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 439\u001b[0;31m \u001b[0mhidden_states\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdense\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhidden_states\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 440\u001b[0m \u001b[0mhidden_states\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdropout\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhidden_states\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 441\u001b[0m \u001b[0mhidden_states\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mLayerNorm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhidden_states\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0minput_tensor\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 213 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 725\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 726\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 727\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 728\u001b[0m for hook in itertools.chain(\n\u001b[1;32m 729\u001b[0m \u001b[0m_global_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 214 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/torch/nn/modules/linear.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 91\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 92\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0minput\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mTensor\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mTensor\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 93\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mF\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlinear\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mweight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbias\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 94\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 95\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mextra_repr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 215 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/torch/nn/functional.py\u001b[0m in \u001b[0;36mlinear\u001b[0;34m(input, weight, bias)\u001b[0m\n\u001b[1;32m 1690\u001b[0m \u001b[0mret\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maddmm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbias\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mweight\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mt\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1691\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1692\u001b[0;31m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmatmul\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mweight\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mt\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1693\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mbias\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1694\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mbias\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 216 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: " 217 | ] 218 | } 219 | ], 220 | "source": [ 221 | "%%time\n", 222 | "s0 = time.time()\n", 223 | "for _ in sample_text:\n", 224 | " res = my_bert(**inputs)\n", 225 | "all_time = time.time() - s0\n", 226 | "print(all_time, all_time / len(sample_text))" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 19, 232 | "metadata": {}, 233 | "outputs": [ 234 | { 235 | "data": { 236 | "text/plain": [ 237 | "tensor([[0.6906, 0.7076]], grad_fn=)" 238 | ] 239 | }, 240 | "execution_count": 19, 241 | "metadata": {}, 242 | "output_type": "execute_result" 243 | } 244 | ], 245 | "source": [] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 9, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "output_names = ['logits']\n", 254 | "dynamic_axes = {'input_ids': [0, 1],'attention_mask': [0, 1],'token_type_ids': [0, 1],}" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 10, 260 | "metadata": {}, 261 | "outputs": [ 262 | { 263 | "name": "stderr", 264 | "output_type": "stream", 265 | "text": [ 266 | "/Users/zhangsongpo/miniconda3/lib/python3.6/site-packages/torch/onnx/utils.py:1112: UserWarning: No names were found for specified dynamic axes of provided 
input.Automatically generated names will be applied to each dynamic axes of input input_ids\n", 267 | " 'Automatically generated names will be applied to each dynamic axes of input {}'.format(key))\n", 268 | "/Users/zhangsongpo/miniconda3/lib/python3.6/site-packages/torch/onnx/utils.py:1112: UserWarning: No names were found for specified dynamic axes of provided input.Automatically generated names will be applied to each dynamic axes of input attention_mask\n", 269 | " 'Automatically generated names will be applied to each dynamic axes of input {}'.format(key))\n", 270 | "/Users/zhangsongpo/miniconda3/lib/python3.6/site-packages/torch/onnx/utils.py:1112: UserWarning: No names were found for specified dynamic axes of provided input.Automatically generated names will be applied to each dynamic axes of input token_type_ids\n", 271 | " 'Automatically generated names will be applied to each dynamic axes of input {}'.format(key))\n", 272 | "/Users/zhangsongpo/miniconda3/lib/python3.6/site-packages/transformers/models/bert/modeling_bert.py:200: TracerWarning: Converting a tensor to a Python index might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", 273 | " position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]\n" 274 | ] 275 | } 276 | ], 277 | "source": [ 278 | "torch.onnx.export(my_bert,\n", 279 | " f='./mybert.onnx',\n", 280 | " args=tuple(inputs.values()),\n", 281 | " input_names=list(inputs),\n", 282 | " output_names=output_names,\n", 283 | " dynamic_axes=dynamic_axes,\n", 284 | " opset_version=10)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 11, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "import onnx\n", 294 | "\n", 295 | "onnx_model = onnx.load('./mybert.onnx')\n", 296 | "onnx.checker.check_model(onnx_model)" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 34, 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "import onnxruntime" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 35, 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "onxx_model_path = r'./mybert.onnx'\n", 336 | "options = onnxruntime.SessionOptions()\n", 337 | "session = onnxruntime.InferenceSession(onxx_model_path, options)" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 36, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "inputs = tokenizer(text=tmp_text[:256],\n", 347 | " return_tensors=\"pt\",\n", 348 | " padding=True,\n", 349 | " truncation=True,\n", 350 | " max_length=max_length\n", 351 | " )\n", 352 | "inputs_onnx = {k: v.cpu().detach().numpy() for k, v in inputs.items()}" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 43, 358 | "metadata": { 359 | "scrolled": true 360 | }, 361 | "outputs": [ 362 | { 363 | "name": "stdout", 364 | "output_type": "stream", 365 | "text": [ 366 | 
"0.06377792358398438\n", 367 | "CPU times: user 241 ms, sys: 2.97 ms, total: 244 ms\n", 368 | "Wall time: 63.9 ms\n" 369 | ] 370 | } 371 | ], 372 | "source": [ 373 | "%%time\n", 374 | "s0 = time.time()\n", 375 | "res = session.run(None, inputs_onnx)\n", 376 | "print(time.time() - s0)" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 42, 382 | "metadata": {}, 383 | "outputs": [ 384 | { 385 | "ename": "KeyboardInterrupt", 386 | "evalue": "", 387 | "output_type": "error", 388 | "traceback": [ 389 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 390 | "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 391 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n", 392 | "\u001b[0;32m~/miniconda3/lib/python3.6/site-packages/onnxruntime/capi/onnxruntime_inference_collection.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(self, output_names, input_feed, run_options)\u001b[0m\n\u001b[1;32m 186\u001b[0m \u001b[0moutput_names\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0moutput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_outputs_meta\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 187\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 188\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sess\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput_names\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput_feed\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrun_options\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 189\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mC\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mEPFail\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 190\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_enable_fallback\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 393 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: " 394 | ] 395 | } 396 | ], 397 | "source": [ 398 | "%%time\n", 399 | "s0 = time.time()\n", 400 | "for _ in sample_text:\n", 401 | " res = session.run(None, inputs_onnx)\n", 402 | "all_time = time.time() - s0\n", 403 | "print(all_time, all_time / len(sample_text))" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "metadata": {}, 410 | "outputs": [], 411 | "source": [] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": 24, 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [ 419 | "from os import environ\n", 420 | "from psutil import cpu_count\n", 421 | "\n", 422 | "# Constants from the performance optimization available in onnxruntime\n", 423 | "# It needs to be done before importing onnxruntime\n", 424 | "environ[\"OMP_NUM_THREADS\"] = str(cpu_count(logical=True)) # OMP 的线程数\n", 425 | "environ[\"OMP_WAIT_POLICY\"] = 'ACTIVE'\n", 426 | "\n", 427 | "from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions, get_all_providers\n", 428 | "\n", 429 | "\n", 430 | "def create_model_for_provider(model_path: str, provider: str) -> InferenceSession: \n", 431 | " \n", 432 | " assert provider in get_all_providers(), f\"provider {provider} not found, 
{get_all_providers()}\"\n", 433 | "\n", 434 | " # Few properties that might have an impact on performances (provided by MS)\n", 435 | " options = SessionOptions()\n", 436 | " options.intra_op_num_threads = 1\n", 437 | " options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL\n", 438 | "\n", 439 | " # Load the model as a graph and prepare the CPU backend \n", 440 | " session = InferenceSession(model_path, options, providers=[provider])\n", 441 | " session.disable_fallback()\n", 442 | " \n", 443 | " return session" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": 25, 449 | "metadata": {}, 450 | "outputs": [], 451 | "source": [ 452 | "session_cpu = create_model_for_provider(onxx_model_path, \"CPUExecutionProvider\") # 使用 优化过的 onnx" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": 27, 458 | "metadata": {}, 459 | "outputs": [ 460 | { 461 | "name": "stdout", 462 | "output_type": "stream", 463 | "text": [ 464 | "0.17477822303771973\n", 465 | "CPU times: user 171 ms, sys: 4.02 ms, total: 175 ms\n", 466 | "Wall time: 175 ms\n" 467 | ] 468 | } 469 | ], 470 | "source": [ 471 | "%%time\n", 472 | "s0 = time.time()\n", 473 | "res = session_cpu.run(None, inputs_onnx)\n", 474 | "print(time.time() - s0)" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": 28, 480 | "metadata": {}, 481 | "outputs": [ 482 | { 483 | "name": "stdout", 484 | "output_type": "stream", 485 | "text": [ 486 | "17.469280004501343 0.17469280004501342\n", 487 | "CPU times: user 16.9 s, sys: 209 ms, total: 17.1 s\n", 488 | "Wall time: 17.5 s\n" 489 | ] 490 | } 491 | ], 492 | "source": [ 493 | "%%time\n", 494 | "s0 = time.time()\n", 495 | "for _ in sample_text:\n", 496 | " res = session_cpu.run(None, inputs_onnx)\n", 497 | "all_time = time.time() - s0\n", 498 | "print(all_time, all_time / len(sample_text))" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": null, 504 | "metadata": {}, 505 | "outputs": [], 506 | "source": [] 507 | } 508 | ], 509 | "metadata": { 510 | "kernelspec": { 511 | "display_name": "Python 3", 512 | "language": "python", 513 | "name": "python3" 514 | }, 515 | "language_info": { 516 | "codemirror_mode": { 517 | "name": "ipython", 518 | "version": 3 519 | }, 520 | "file_extension": ".py", 521 | "mimetype": "text/x-python", 522 | "name": "python", 523 | "nbconvert_exporter": "python", 524 | "pygments_lexer": "ipython3", 525 | "version": "3.6.5" 526 | } 527 | }, 528 | "nbformat": 4, 529 | "nbformat_minor": 4 530 | } 531 | -------------------------------------------------------------------------------- /notes/CS224N-2019/CS224N-01-Introduction-and-Word-Vectors.md: -------------------------------------------------------------------------------- 1 | ## CS224N-01-Introduction and Word Vectors 2 | 3 | **[CS224N Home](https://web.stanford.edu/class/archive/cs/cs224n/cs224n.1194/)** 【Stanford NLP】 4 | **[Video](https://www.bilibili.com/video/BV1r4411f7td)** 【Bilibili】 5 | 6 | 第一课的内容主要包含两个方面,一个是介绍如何表达词的含义,从传统的词表示方法引入分布式语义表达,引出word vector,第二个方面是讲解了word2vec的原理,从word2vec的损失函数和计算推导方面剖析了word vector的原理。尤其是最后的损失函数推导计算,从公式方面解释了优化词向量的内涵。 7 | 8 | ### 1、Human language and word meaning 9 | 第一部分从传统的词表示引入到word vector,传统的表示方式是独热编码,由分布式语义产出词向量。 10 | * XKCD cartoon 11 | 12 | 13 | #### Definition: **meaning** (Webster dictionary) 14 | #### Common solution: **WordNet** 15 | #### Problems with resources like WordNet: 16 | * Great as a resource but missing nuance, 细微差别 17 | * missing new meaning of words, 单词含义 18 | * 
Subjective, 主观的 19 | * Requires human labor to create and adapt, 需要人工 20 | * Can't compute accurate word similarity, 无法计算相似度 21 | 22 | #### Representing words as discrete symbols 23 | traditional NLP,a localist representation 24 | **Means one 1, the rest 0s** 25 | 独热编码(ont-hot) 26 | ``` 27 | motel=[0 0 0 0 1 0] 28 | hotel=[0 1 0 0 0 0] 29 | ``` 30 | 但是独热编码的结果是,这些词向量都是正交的,并且不能表达语义相似度。orthogonal(正交)、no natural notion of similarity 31 | 解决方案就是`learn to encode similarity in the vectors themselves` 32 | 33 | #### Representing words by their context 34 | 35 | 36 | **Distributional semantics**: A word's meaning is given by the words that frequently appear close-by 37 | **Word vectors** (word embeddings): dense vector 38 | 39 | #### Word meaning as as neural word vector - visualization 40 | 41 | 42 | ### 2、Word2vec: Overview 43 | **Word2vec (Mikolov et al. 2013) is a framework for learning word vectors.** 44 | **Idea:** 45 | * a large corpus of text,首先有一个语料库 46 | * 每个词给一个初始化的vector 47 | * 遍历text中的每个位置,包含了center word [c]和context words [o] 48 | * 根据c和o的词向量的相似度来计算,给出c得出o的似然 49 | * 调整优化word vectors来最小化似然 50 | 51 | 图示: 52 | 53 | 计算$P(w_{t+j}|w_t)$ 54 | #### Word2vec: objective function 55 | 对于每一个text的位置$t=1,...,T$,给出中心词$w_j$,预测窗口为m内的上下文。 56 | 其似然值为: 57 | $$Likelihood=L(\theta)=\prod_{t=1}^T\prod_{-m\le{j}\le{m} \atop{j\ne0}}P(w_{t+j}|w_t)$$ $\theta$ is all variables to be optimized. 58 | 损失函数$J(\theta)$是(平均)负的对数似然,**negative log likelihood**: 59 | $$J(\theta)=-\frac{1}{T}logJ(\theta)=-\frac{1}{T}\sum_{t=1}^T\sum_{-m\le{j}\le{m} \atop{j\ne0}}P(w_{t+j}|w_t)$$**Minimizing objective function <==> Maxmizing predictive accuracy** 60 | 61 | 想要最小化损失函数,首先要考虑怎么计算$P(w_{t+j}|w_t)$ 62 | 对于每个词给定两个词向量 63 | * $v_w$,当w为中心词时 64 | * $u_w$,当w为上下文时 65 | 66 | 对于每个中心词c和上下文词o,有: 67 | $$P(o|c)=\frac{exp(u_o^Tv_c)}{\sum_{w\in{V}}exp(u_w^Tv_c)}$$分子上的向量点乘表达的是两个词的相似度,分母是中心词和所有词的相似度(**注意:这里是所有词,后续优化**) 68 | 69 | **softmax** function:为什么成为softmax 70 | 71 | 72 | #### To train the model: Compute all vector gradients 73 | $\theta$ represents all model parameters, in one long vector 74 | Remember: every word has two vectors 75 | 76 | 上述推导中,$P(x|c)$是给定中心词 $c$,模型所给出的为 $x$ 的概率。 77 | 78 | 这个推导结果很有趣!等号左边是给出中心词 $c$ 其上下文 $o$ 的对数概率的偏导,是我们要找的一个下降对快的一个方向,多维空间上的一个斜坡。等号右边的含义是,我们观察到的上下文的词 $o$ ,从中减去我们的模型认为的上下文的样子,后面一部分是模型的期望。实际的上下文与模型认为的上下文,这两者之间的差异决定了下降的方向。 -------------------------------------------------------------------------------- /notes/CS224N-2019/img/2020-06-12-12-42-12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zspo/NLP-Space/1c9891be0f44cf618d5b7fc1be2b03e12ce7fc24/notes/CS224N-2019/img/2020-06-12-12-42-12.png -------------------------------------------------------------------------------- /notes/CS224N-2019/img/2020-06-12-13-22-31.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zspo/NLP-Space/1c9891be0f44cf618d5b7fc1be2b03e12ce7fc24/notes/CS224N-2019/img/2020-06-12-13-22-31.png -------------------------------------------------------------------------------- /notes/CS224N-2019/img/2020-06-12-13-24-36.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zspo/NLP-Space/1c9891be0f44cf618d5b7fc1be2b03e12ce7fc24/notes/CS224N-2019/img/2020-06-12-13-24-36.png -------------------------------------------------------------------------------- /notes/CS224N-2019/img/2020-06-12-15-11-11.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zspo/NLP-Space/1c9891be0f44cf618d5b7fc1be2b03e12ce7fc24/notes/CS224N-2019/img/2020-06-12-15-11-11.png -------------------------------------------------------------------------------- /notes/CS224N-2019/img/2020-06-12-15-37-09.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zspo/NLP-Space/1c9891be0f44cf618d5b7fc1be2b03e12ce7fc24/notes/CS224N-2019/img/2020-06-12-15-37-09.png -------------------------------------------------------------------------------- /notes/CS224N-2019/img/2020-06-12-15-42-22.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zspo/NLP-Space/1c9891be0f44cf618d5b7fc1be2b03e12ce7fc24/notes/CS224N-2019/img/2020-06-12-15-42-22.png -------------------------------------------------------------------------------- /notes/CS224N-2019/img/微信图片_20200612171355.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zspo/NLP-Space/1c9891be0f44cf618d5b7fc1be2b03e12ce7fc24/notes/CS224N-2019/img/微信图片_20200612171355.jpg -------------------------------------------------------------------------------- /notes/CS224N-2019/img/微信截图_20200612183129.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zspo/NLP-Space/1c9891be0f44cf618d5b7fc1be2b03e12ce7fc24/notes/CS224N-2019/img/微信截图_20200612183129.png -------------------------------------------------------------------------------- /notes/Word2Vec学习笔记(CS224N笔记及相关论文学习).md: -------------------------------------------------------------------------------- 1 | ***[参考CS224N笔记](https://web.stanford.edu/class/archive/cs/cs224n/cs224n.1194/readings/cs224n-2019-notes01-wordvecs1.pdf) 2 | [The Skip-Gram Model](http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/) 3 | [word2vec paper](https://arxiv.org/pdf/1301.3781.pdf) 4 | [negative sampling paper](http://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf)*** 5 | 6 | @[toc] 7 | ### NLP 8 |   人类语言是独特的传达含义的系统,不同于计算机视觉及其他的机器学习任务。 9 |   NLP领域有着不同难度等级的任务,从语音处理到语义解释等。NLP的目标是设计出算法令计算机“理解”自然语言以解决实际的任务。 10 | - Easy的任务包括:拼写纠正、关键词搜索、同义词查找等; 11 | - Medium的任务包括:信息解析等; 12 | - Hard任务包括:机器翻译、情感分析、指代、问答系统等。 13 | 14 | ### 1、Word Vectors 15 |   英语中估计有13 million单词,他们相互之间并不全是无关的,Feline to cat (猫科动物->猫)、hotel to motel (旅馆->汽车旅馆)等。我们希望用一些向量来编码每个单词,在同一词空间内以点的形式进行表示。直接的方法是构建一个$N(N\le13 million)$维度的空间,这个空间足够将我们的单词进行编码,每个维度可以编码某些我们语言的含义。这些维度可能表示时态、计数、性别等。 16 |   独热编码是直接的编码方法,将每个词表示为$\mathbb{R}^{|V|\times1}$向量,该词在固定顺序下的索引处为1,其他位置都为0。如下 17 |
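这里原本是一张展示 one-hot 向量的截图。下面补一个最小的 numpy 草稿(词表与词序均为示例假设),顺带验证下一段提到的正交性:
```
import numpy as np

vocab = ["motel", "hotel", "cat"]                 # 示例词表(假设)
word2id = {w: i for i, w in enumerate(vocab)}

def one_hot(word):
    vec = np.zeros(len(vocab))
    vec[word2id[word]] = 1.0
    return vec

w_motel, w_hotel = one_hot("motel"), one_hot("hotel")
print(w_motel, w_hotel)     # [1. 0. 0.] [0. 1. 0.]
print(w_motel @ w_hotel)    # 0.0:不同词的 one-hot 向量点积恒为 0
```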
18 |   每个单词是一个完全独立的个体,如上图所示,结果就是这种词表示方式不能表示任何相似之处,他们都是正交的: 19 | 20 | $$(w^{hotel})^Tw^{motel}=(w^{hotel})^Tw^{cat}=0$$ 21 | 22 |   我们可以尝试将$V$维减小,对原来表示空间进行降维,来寻找一个子空间来进行词关系的表示。 23 | ### 2、SVD Based Methods 24 |   奇异值分解方法的做法是,我们首先遍历数据集,通过矩阵$X$存储单词出现的共现次数,然后对$X$进行奇异值分解得出$USV^t$分解。我们可以将$U$的行值可以作为词表中所有词的word embedding。 25 | #### 2.1 Word-Document Matrix 26 |   首先我们可以认为,相关的词总会出现在同一个文档中。譬如,"bank"、"bongs"、"stocks"、"money"等更有可能同时出现,但是"bank"、"octopus"、"banana"等词不可能总是同时出现。我们利用这种共现现象构建一个word-document matrix:X。遍历大量的文档数据集,每当单词$i$和单词$j$同时出现时,我们就在$X_{ij}$位置加1。很明显这将是一个非常大的矩阵($\mathbb{R}^{|V|\times{M}}$),其中$M$是文档的个数。 27 | #### 2.2 Window based Co-occurrence Matrix(基于窗口的共现矩阵) 28 |   矩阵$X$存储着单词的共现次数。这里我们将在一个合适大小的窗口内来统计单词的共现次数。通过下面的例子进行说明,数据集中有三个句子,窗口大小设定为1: 29 | ``` 30 | 1. I enjoy flying. 31 | 2. I like NLP. 32 | 3. I like deep learning. 33 | ``` 34 | 根据窗口为1的设定,统计结果矩阵如下: 35 |
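原矩阵截图在文中无法显示,下面用一个小的统计草稿按窗口为 1 的设定重新构建这个共现矩阵(为简单起见忽略句号):
```
import numpy as np

corpus = ["I enjoy flying".split(),
          "I like NLP".split(),
          "I like deep learning".split()]
vocab = sorted({w for sent in corpus for w in sent})
word2id = {w: i for i, w in enumerate(vocab)}

window = 1
X = np.zeros((len(vocab), len(vocab)), dtype=int)
for sent in corpus:
    for i, w in enumerate(sent):
        for j in range(max(0, i - window), min(len(sent), i + window + 1)):
            if j != i:
                X[word2id[w], word2id[sent[j]]] += 1

print(vocab)
print(X)    # X[i][j] 即词 i 与词 j 在窗口内共现的次数
```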
36 | 37 | #### 2.3 奇异值分解 38 | 通过SVD方法得到word embedding的过程如下: 39 | * 构建$|V|\times{|V|}$的共现矩阵$X$。 40 | * 使用SVD分解得到$X=USV^{T}$。 41 | * 选择$U$的前$k$个维度,得到$k$维的词向量。 42 | * $\frac{\sum_{i=1}^{k}\sigma_i}{\sum_{i=1}^{|V|}\sigma_i}$表示前$k$个维度所保留的方差比例。 43 | 44 | 我们现在对$X$进行SVD处理: 45 | $X=USV^{T}$ 46 |
47 | 选取前$k$个奇异值对应的维度进行降维: 48 |
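下面给一个 numpy 的最小降维草稿,其中 X 用一个很小的示例矩阵代替(实际应使用上文统计出的 $|V|\times{|V|}$ 共现矩阵),k 也只是示例取值:
```
import numpy as np

X = np.array([[0., 2., 1.],      # 示例共现矩阵(仅作演示)
              [2., 0., 1.],
              [1., 1., 0.]])

U, S, Vt = np.linalg.svd(X, full_matrices=False)

k = 2
word_vectors = U[:, :k]            # 每一行即对应单词的 k 维词向量
captured = S[:k].sum() / S.sum()   # 前 k 个奇异值所保留的方差比例
print(word_vectors)
print(captured)
```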
49 | 50 | #### 2.4 SVD方法小结 51 |   以上的两种方法(Word-Document Matrix 和 Window based Co-occurrence Matrix)都比传统的编码形式有着更多的语义信息,但是仍然存在着一些问题: 52 | * 矩阵的维度大小不固定,会随新词的添加而变化,语料库大小也随之变化; 53 | * 矩阵过于稀疏,大部分的单词不会同时出现; 54 | * 矩阵维度太高($\approx10^6\times{10^6}$); 55 | * 训练成本太高($O(mn^2)$); 56 | * 需要加入一些隐含词(不知道这么理解对不对)来解决词频不均衡的问题。 57 | 58 | 针对以上的问题有一些解决方法: 59 | * 忽略一些词,例如"the"、"he"、"has"等; 60 | * 窗口动态,即根据文档中单词之间的距离加权计算共现计数; 61 | * 使用皮尔逊相关系数,Use Pearson correlation and set negative counts to 0 instead of using just raw count. 62 | 63 | ### 3、Iteration Based Methods - Word2vec 64 |   我们尝试一种新的方法,通过构建模型能够迭代学习,最终可以根据给定的上下文来对单词的概率进行编码。这个方法设计出的模型的参数就是词向量。在每次的训练迭代过程中,计算误差,更新参数,最终学习出词向量。这个想法可以追溯到1986年,称之为“反向传播(backpropagating)”[[Rumelhart et al., 1988](#refer)],模型任务越简单,训练速度越快。有一些方法也被尝试过,[[Collobert et al., 2011](#refer)]构建了NLP模型,第一步是将每个词转为向量,对于每种任务(命名实体识别,词性标注等)不仅训练模型参数同时训练向量,在有不错的效果的同时也得到了好的词向量。 65 |   Word2vec是2013年Mikolov提出的简单有效的方法[[Mikolov et al., 2013](#refer)](这种方法依赖于语言学中一个非常重要的假设,即分布相似,即相似的词有相似的语境。)Word2vec是一个算法包: 66 | * 算法部分:continuous bag-of-words (CBOW) and skip-gram. CBOW是通过上下文预测中心词,Skip-gram相反,给定中心词预测上下文。 67 | * 模型训练: negative sampling and hierarchical softmax. 负采样是采样出一定比例的负例,层次softmax是通过一种有效的霍夫曼树结构来计算词的概率。 68 | 69 | #### 3.1 语言模型(unigrams,bigrams,trigrams等) 70 |
"The cat jumped over the puddle."
71 |   以上面的句子为例。 72 | 73 |   首先,我们需要构建一个模型来表示一个单词序列的概率。一个好的语言模型会给有效的好句子一个高的概率值,但是句子"stock boil fish is toy"的概率会很低,因为这不是一个正常有意义的句子。用数学来表达,当给定一个有$n$个单词的句子时,其概率为: 74 | 75 | $$P(w_1,w_2,...,w_n)$$ 76 | 77 | 我们采用unigrams(一元模型),即每个单词都是独立的,则: 78 | $$P(w_1,w_2,...,w_n)=\prod_{i=1}^{n}P(w_i)$$ 79 | 80 |   这个表达式有个明显的问题就是,如果有一组句子,虽然他们有着同样的单词,有的句子有意义,有的句子是乱序无意义的,但是他们的概率确实一样的。因为我们的句子都是有序的,一个单词的概率很大程度上和上一个单词有关系。我们需要基于相邻的两个单词的概率来决定句子的概率,即bigrams(二元模型): 81 | $$P(w_1,w_2,...,w_n)=\prod_{i=2}^{n}P(w_i|w_{i-1})$$ 82 | 83 | 即使这样,我们考虑的也是两两相邻的单词,而不是整个句子。 84 | #### 3.2 Continuous Bag of Words Model (CBOW) 85 |   对于上述的例子,我们通过上下文{"The"、"cat"、"over"、"the"、"puddle"}来预测或生成出中心词"jumped",这种方式我们成为Continuous Bag of Words Model (CBOW)。 86 |   对于CBOW模型,首先我们设定已知参数,即将输入句子表示为一个one-hot形式的词向量。输入的one-hot向量表示为$x^{(c)}$,输出表示为$y^{(c)}$,CBOW模型只有一个输出,这个$y$为已知的中心词的one-hot向量。对于每个词,我们通过CBOW都可以学习出两个向量, 87 | * $v$:input vector,当词为上下文时 88 | * $u$:output vector,当词为中心词时 89 | 90 | 首先介绍一些CBOW模型中涉及到的一些参数: 91 | * $w_i$:词表$V$中的第$i$个词 92 | * $\mathcal{V}\in{\mathbb{R}^{n\times{|V|}}}$:input word matrix 93 | * $v_i$:$\mathcal{V}$中的第$i$行,表示的是$w_i$的输入向量 94 | * $\mathcal{U}\in{\mathbb{R}^{|V|\times{n}}}$:output word matrix 95 | * $u_i$:$\mathcal{U}$中的第$i$行,表示的是$w_i$的输出向量 96 | 97 |   我们构建两个矩阵$\mathcal{V}\in{\mathbb{R}^{n\times{|V|}}}$和$\mathcal{U}\in{\mathbb{R}^{|V|\times{n}}}$,其中$n$是我们定义的embedding空间的大小。具体的模型构建步骤如下: 98 | 1. 首先我们根据窗口大小$m$确定我们的输入one-hot词向量:$(x^{(c-m)},...x^{(c-1)},x^{(c+1)},...,x^{(c+m)}\in{\mathbb{R}^{|V|}})$,中心词为$x^{(c)}$ 99 | 2. 得到对应的输入word embedding为$(v_{c-m}=\mathcal{Vx^{(c-m)}},v_{c-m+1}=\mathcal{Vx^{(c-m+1)}},...,v_{c+m}=\mathcal{Vx^{(c+m)}}\in{\mathbb{R}^{n}})$ 100 | 3. 将这些向量平均得到$\hat{v}=\frac{v_{c-m}+v_{c-m+1}+...+v_{c+m}}{2m}\in{\mathbb{R}^{n}}$ 101 | 4. 计算出分数向量$z=\mathcal{U}\hat{v}\in{\mathbb{R}^{|V|}}$,点乘计算的是两个向量的相似度,如果两个词比较接近,那么将会有一个较高的分数 102 | 5. 通过softmax将分数转为概率值,$\hat{y}=softmax(z)\in{\mathbb{R}^{|V|}}$ 103 | 6. 我们希望生成的概率$\hat{y}$来匹配真实的概率$y$,即输出的对应的one-hot向量对应真实的单词 104 | 105 | 如图展示了CBOW模型细节,我们需要学习出两个转换矩阵。 106 |
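原结构图在文中无法显示。按上面 1~6 步的定义,用 numpy 写一个 CBOW 前向计算的最小草稿(维度、窗口和各个下标均为示例假设):
```
import numpy as np

np.random.seed(0)
V_size, n = 10, 8                                  # |V| 与 embedding 维度(假设)
V_mat = np.random.randn(n, V_size) * 0.01          # 输入词矩阵,形状 (n, |V|)
U_mat = np.random.randn(V_size, n) * 0.01          # 输出词矩阵,形状 (|V|, n)

context_ids = [1, 2, 4, 5]                         # 2m 个上下文词下标(假设 m=2)
center_id = 3                                      # 中心词下标(假设)

v_hat = V_mat[:, context_ids].mean(axis=1)         # 步骤2-3:取上下文输入向量并平均
z = U_mat @ v_hat                                  # 步骤4:分数向量 z = U v_hat
y_hat = np.exp(z - z.max())
y_hat /= y_hat.sum()                               # 步骤5:softmax 得到预测分布
loss = -np.log(y_hat[center_id])                   # 步骤6:与 one-hot 真实分布的交叉熵
print(loss)
```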
107 | 108 |   我们需要学习出 $\mathcal{V}$ 和 $\mathcal{U}$ 这两个矩阵,首先确定目标函数。当我们试图让预测的概率分布逼近某个真实分布时,通常会用信息论的方法来度量两个分布之间的距离,我们这里选用交叉熵(cross entropy)$H(\hat{y},y)$来作为目标函数: 109 | $$H(\hat{y},y)=-\sum_{j=1}^{|V|}y_j\log(\hat{y}_j)$$ 110 | 111 | $y$是一个one-hot向量,只有正确中心词对应的位置为1,因此目标函数可简化为: 112 | $$H(\hat{y},y)=-y_j\log(\hat{y}_j)$$ 113 | 114 | 因此我们优化目标为: 115 |
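这里原本引用的是一张公式截图。按上文的记号($u_c$ 为中心词的输出向量,$\hat{v}$ 为上下文平均向量),优化目标的标准写法大致为:
$$minimize\ J=-\log P(u_c|\hat{v})=-u_c^{T}\hat{v}+\log\sum_{j=1}^{|V|}\exp(u_j^{T}\hat{v})$$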
116 | 117 | 我们使用随机梯度下降来更新所有相关的词向量 $u_c$ 和 $v_j$。 118 | #### 3.3 Skip-Gram Model 119 |   Skip-gram是给出中心词"jumped",来预测或生成上下文词 "The", "cat", "over", "the", "puddle"。Skip-gram model大体上和CBOW模型相似,不过我们需要将$x$与$y$互换,即这里输入的one-hot向量是一个,输出向量$y$是多个。我们同样定义两个矩阵 $\mathcal{V}$ 和 $\mathcal{U}$,模型构建步骤如下: 120 | 1. 首先生成中心词输入向量$x\in{\mathbb{R}^{|V|}}$ 121 | 2. 得到中心词的embedding词向量 $v_c=\mathcal{V}x\in{\mathbb{R}^n}$ 122 | 3. 生成分数向量$z=\mathcal{U}v_c$ 123 | 4. 转为概率值 $\hat{y}=softmax(z)$,$\hat{y}_{c-m},...,\hat{y}_{c-1},\hat{y}_{c+1},...,\hat{y}_{c+m}$是每个上下文词的概率值 124 | 5. 目标是让概率分布与真实的接近 125 |
126 | 和CBOW模型一样,我们需要确定目标函数,这里我们引入朴素贝叶斯(条件独立)假设来拆解联合概率,进而求解。 127 |
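缺失的公式按朴素贝叶斯(条件独立)假设展开,标准形式大致为:
$$minimize\ J=-\log\prod_{j=0,j\ne{m}}^{2m}P(w_{c-m+j}|w_c)=-\sum_{j=0,j\ne{m}}^{2m}u_{c-m+j}^{T}v_c+2m\log\sum_{k=1}^{|V|}\exp(u_k^{T}v_c)$$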
128 | 利用这个目标函数,我们可以计算出未知参数的梯度,并在每次迭代时通过随机梯度下降来更新它们。 129 | 注意到: 130 |
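此处的公式按上下文应为目标函数与交叉熵求和的等价关系:
$$J=-\sum_{j=0,j\ne{m}}^{2m}\log P(u_{c-m+j}|v_c)=\sum_{j=0,j\ne{m}}^{2m}H(\hat{y},y_{c-m+j})$$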
131 | 132 | 其中 $H(\hat{y},y_{c-m+j})$ 是概率分布向量 $\hat{y}$ 和one-hot向量 $y_{c-m+j}$ 的交叉熵。 133 | #### 3.4 Negative Sampling 134 |   我们注意到目标函数中的 $|V|$ 的值是非常大的,结果就是每次更新或评估目标函数的时候我们都要花费 $O(|V|)$(计算softmax归一化的时候),一个简单的做法就是对它进行近似估计。 135 |   在每次训练的时候,我们不需要遍历整个词表,只需要采样少数的负样本。我们基于噪声分布 $P_n(w)$ 采样,其采样概率和词频顺序相匹配。 136 |   Negative Sampling见[paper](http://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf)。负采样基于Skip-Gram模型,但实际上优化的是不同的目标。考虑中心词和上下文词对$(w,c)$,如果这个词对来自语料数据集,则其概率记为$P(D=1|w,c)$;相反,如果词对不是来自语料库,则记为$P(D=0|w,c)$。首先,利用sigmoid函数表示概率值: 137 | $$P(D=1|w,c,\theta)=\sigma(v_c^{T}v_w)=\frac{1}{1+e^{-v_c^{T}v_w}}$$ 138 | 139 | 我们现在构建一个新的目标函数,其目标是maximize $P(D=1|w,c,\theta)$ 和 $P(D=0|w,c,\theta)$ 这两个概率,我们利用最大化似然来估计这两个概率分布(我们将$\theta$作为模型的参数,在这里指的是 $\mathcal{V}$ 和 $\mathcal{U}$): 140 |
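缺失的公式沿用上面 $\sigma(v_c^{T}v_w)$ 的记号,大致可写为(其中 $D$ 为语料中观察到的词对集合):
$$\theta=\underset{\theta}{\operatorname{argmax}}\prod_{(w,c)\in{D}}P(D=1|w,c,\theta)\prod_{(w,c)\in{\tilde{D}}}P(D=0|w,c,\theta)=\underset{\theta}{\operatorname{argmax}}\sum_{(w,c)\in{D}}\log\sigma(v_c^{T}v_w)+\sum_{(w,c)\in{\tilde{D}}}\log\sigma(-v_c^{T}v_w)$$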
141 | 142 | 等同于最小化负的对数似然: 143 |
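缺失的公式即上式取负对数:
$$J=-\sum_{(w,c)\in{D}}\log\sigma(v_c^{T}v_w)-\sum_{(w,c)\in{\tilde{D}}}\log\sigma(-v_c^{T}v_w)$$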
144 | 145 | 公式中的$\tilde{D}$是负样本集。 146 | 对于skip-gram模型,给定中心词$c$和上下文词$c-m+j$,其目标函数表示为: 147 |
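缺失的公式的标准写法大致为(其中 $\tilde{u}_k$ 是按 $P_n(w)$ 采样出的 $K$ 个负样本的输出向量):
$$-\log\sigma(u_{c-m+j}^{T}v_c)-\sum_{k=1}^{K}\log\sigma(-\tilde{u}_k^{T}v_c)$$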
148 | 149 | 对于CBOW模型,中心词的输出向量为$u_c$,给定的上下文向量为$\hat{v}=\frac{v_{c-m}+v_{c-m+1}+...+v_{c+m}}{2m}$,目标函数为: 150 |
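对应的公式与上式形式相同,只是把 $v_c$ 换成 $\hat{v}$、把上下文词的输出向量换成 $u_c$:
$$-\log\sigma(u_c^{T}\hat{v})-\sum_{k=1}^{K}\log\sigma(-\tilde{u}_k^{T}\hat{v})$$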
151 | 152 |   现在讨论$P_n(w)$应该是什么。从大量相关讨论来看,对一元模型(unigram)分布取$3/4$次方的效果似乎最优。为什么是$3/4$?看下面的例子: 153 |
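缺失的示例按 CS224N 笔记中的数字大致为:
* is: $0.9^{3/4}=0.92$
* Constitution: $0.09^{3/4}=0.16$
* bombastic: $0.01^{3/4}=0.032$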
154 | 155 | "bombastic"的抽样率变成了3倍,但是"is"只是增大了一点点。"is"是不重要的一类词,其出现的概率本来就很大,不需要对其增加很多采样。 156 | 157 | #### 3.5 Hierarchical Softmax 158 |   Mikolov同样提出了层次softmax来解决归一化softmax的问题。**在实际中,层次softmax对低频词汇有更好的效果,负采样对高频词和低维向量有着更好的效果。** 159 |
160 | 161 |   层次softmax利用二叉树来表示词表中的所有词,树的每个叶子都是一个单词,从根到叶子节点只有唯一的一条路径。每个词没有输出表示,图的每个节点(除了根和叶)都是模型要学习的向量。 162 |   在层次softmax中,单词$w$的向量为$w_i$。$P(w|w_i)$是从根随机游走到叶子节点$w$的概率。最大的优点就是这种计算概率的方式其成本为$O(log(|V|))$,与路径长度相关。 163 |   令$L(w)$为从根到叶子$w$路径上的节点个数,令$n(w,i)$为路径上的第$i$个节点。因此,$n(w,1)$是根节点,$n(w,L(w))$表示的是节点$w$。对于每个节点$n$,我们可以选择其的一个孩子称为$ch(n)$(总是左节点)。我们计算$P(w|w_i)$为: 164 |
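缺失的公式按上面的记号,标准写法为:
$$P(w|w_i)=\prod_{j=1}^{L(w)-1}\sigma\Big([n(w,j+1)=ch(n(w,j))]\cdot v_{n(w,j)}^{T}v_{w_i}\Big)$$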
165 | 其中: 166 |
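$[x]$ 为指示函数:
$$[x]=\begin{cases}1, & x\ \text{为真}\\-1, & \text{否则}\end{cases}$$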
167 | 168 | $\sigma(\cdot)$是sigmoid函数。 169 |   分析上述的公式:首先,我们计算从根到叶子节点路径上各项的乘积。因为我们假设了$ch(n)$总是$n$的左节点,因此当路径游走到左节点时$[n(w,j+1)=ch(n(w,j))]$为1,游走到右节点时为-1。 170 |   此外,$[n(w,j+1)=ch(n(w,j))]$起到一种归一化的作用。对于任意节点$n$,游走到左边的概率与游走到右边的概率之和为1: 171 | $$\sigma(v_n^Tv_{w_i})+\sigma(-v_n^Tv_{w_i})=1$$ 172 | 173 | 这样确保了$\sum_{w=1}^{|V|}P(w|w_i)=1$,与原始的softmax一致。 174 |   最后,我们利用点乘来比较输入向量$v_{w_i}$和每个内部节点向量$v_{n(w,j)}$的相似度。对于二叉树图示中的例子来讲,要到达$w_2$,我们需要从根部向左走两次、向右走一次: 175 |
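缺失的示例公式按“左、左、右”的路径大致为:
$$P(w_2|w_i)=p(n(w_2,1),left)\cdot p(n(w_2,2),left)\cdot p(n(w_2,3),right)=\sigma(v_{n(w_2,1)}^{T}v_{w_i})\cdot\sigma(v_{n(w_2,2)}^{T}v_{w_i})\cdot\sigma(-v_{n(w_2,3)}^{T}v_{w_i})$$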
176 | 177 |   训练模型的时候,我们的目标依然是最小化负对数似然:$-\log P(w|w_i)$,但是这里我们不需要更新每个单词的向量,只需要更新该路径上经过的节点的向量即可。 178 |   这种方法的速度取决于二叉树的构造方式和单词分配给叶节点的方式。Mikolov利用二叉霍夫曼树,其特点是高频词在树中有着更短的路径。 179 | 180 |
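结合本仓库 text_classification/utils/generate_w2v.py 中用到的 gensim 接口,负采样与层次 softmax 在训练时只是参数上的区别。下面是一个最小示例(沿用该脚本的旧版 gensim 参数名 size/iter,语料为假设):
```
import gensim

sentences = [['I', 'enjoy', 'flying'],
             ['I', 'like', 'NLP'],
             ['I', 'like', 'deep', 'learning']]   # 示例语料(假设)

# 负采样:hs=0、negative=5,每个正样本采 5 个负例
model_ns = gensim.models.Word2Vec(sentences, size=200, window=5, min_count=0,
                                  sg=1, hs=0, negative=5, iter=10)

# 层次 softmax:hs=1、negative=0,内部按词频构建霍夫曼树
model_hs = gensim.models.Word2Vec(sentences, size=200, window=5, min_count=0,
                                  sg=1, hs=1, negative=0, iter=10)

print(model_ns.wv['NLP'][:5])
```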
181 | 182 | ### References 183 | [Rumelhart et al., 1988] Rumelhart, D. E., Hinton, G. E., and Williams, R. J. (1988).Neurocomputing: Foundations of research. chapter Learning Representations by Back-propagating Errors, pages 696-699. MIT Press, Cambridge, MA, USA. 184 | [Collobert et al., 2011] Collobert, R., Weston, J., Bottou, L., Karlen, M., Kavukcuoglu, K., and Kuksa, P. P. (2011). Natural language processing (almost) from scratch. CoRR, abs/ 1103. 0398. 185 | [Mikolov et al., 2013] Mikolov, T., Chen, K., Corrado, G., and Dean, J. (2013). Efficient estimation of word representations in vector space. CoRR, abs/ 1301. 3781. 186 | 187 | -------------------------------------------------------------------------------- /text_classification/examples/test_demo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import tensorflow as tf 6 | import pickle 7 | import time 8 | import os 9 | import sys 10 | sys.path.append('..') 11 | 12 | from utils import data_helper 13 | from models.FastText import FastText 14 | from models.TextCNN import TextCNN 15 | from models.TextRNN import TextRNN 16 | from models.TextBiLSTM import TextBiLSTM 17 | 18 | FLAGS = tf.app.flags.FLAGS 19 | # Data params 20 | tf.app.flags.DEFINE_string('data_path', '../text_data/input_data/', 'input data path') 21 | # Model params 22 | tf.app.flags.DEFINE_string("filter_sizes", "2,3,4", "textcnn model, convolution filter sizes") 23 | tf.app.flags.DEFINE_integer("num_filters", 2, "textcnn model, convolution filter nums") 24 | tf.app.flags.DEFINE_integer("num_classes", 2, "num_classes") 25 | tf.app.flags.DEFINE_float("keep_prob", 0.5, "Dropout keep probability (default: 0.5)") 26 | tf.app.flags.DEFINE_integer("hidden_num", 2, "Number of RNNCell num") 27 | tf.app.flags.DEFINE_integer("hidden_size", 2, "Number of RNN layers") 28 | # Training params 29 | tf.app.flags.DEFINE_float("learning_rate", 0.01, "learning_rate (default: 0.01)") 30 | tf.app.flags.DEFINE_integer("epochs", 10, "Number of training epochs (default: 10)") 31 | tf.app.flags.DEFINE_integer("batch_size", 512, "Batch Size (default: 64)") 32 | tf.app.flags.DEFINE_integer("checkpoint_every", 100, "Save model every steps (default: 100)") 33 | tf.app.flags.DEFINE_string("checkpoint_dir", './model_save/', "checkpoint_dir") 34 | 35 | train_x, train_y, valid_x, valid_y, embedding, word2index, index2word, vocab_size, maxlen = data_helper.load_data('../text_data/input_data/') 36 | print(train_x.shape) 37 | print(vocab_size) 38 | print(embedding.shape) 39 | print(embedding.dtype) 40 | print(maxlen) 41 | 42 | 43 | # model = FastText( 44 | # num_classes=FLAGS.num_classes, 45 | # sequence_length=maxlen, 46 | # w2v_model_embedding=embedding, 47 | # vocab_size=vocab_size, 48 | # embedding_size=200) 49 | 50 | # model = TextCNN(filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))), 51 | # num_filters=FLAGS.num_filters, 52 | # num_classes=FLAGS.num_classes, 53 | # sequence_length=maxlen, 54 | # w2v_model_embedding=embedding, 55 | # vocab_size=vocab_size, 56 | # embedding_size=200) 57 | 58 | # model =TextRNN(num_classes=FLAGS.num_classes, 59 | # sequence_length=maxlen, 60 | # w2v_model_embedding=embedding, 61 | # vocab_size=vocab_size, 62 | # embedding_size=200, 63 | # hidden_num=FLAGS.hidden_num, 64 | # hidden_size=FLAGS.hidden_size, 65 | # keep_prob=0.5) 66 | 67 | model =TextBiLSTM(num_classes=FLAGS.num_classes, 68 | sequence_length=maxlen, 69 | 
w2v_model_embedding=embedding, 70 | vocab_size=vocab_size, 71 | embedding_size=200, 72 | hidden_num=FLAGS.hidden_num, 73 | keep_prob=0.5) 74 | 75 | optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate) 76 | model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics='accuracy') 77 | model.fit(train_x, train_y, 78 | batch_size=128, 79 | epochs=2, 80 | verbose=1, 81 | valid_x=valid_x, 82 | valid_y=valid_y, 83 | ) 84 | predict_scores = model.predict(train_x) 85 | print(predict_scores[:5]) 86 | 87 | 88 | -------------------------------------------------------------------------------- /text_classification/models/BaseModel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import numpy as np 5 | import tensorflow as tf 6 | 7 | class TextClassifierBaseModel(object): 8 | def __init__(self, num_classes=None, sequence_length=None, 9 | w2v_model_embedding=None, vocab_size=None, embedding_size=200, 10 | initializer=tf.random_normal_initializer(stddev=0.1), 11 | l2_reg_lambda=0.001): 12 | self.num_classes = num_classes 13 | self.sequence_length = sequence_length 14 | if w2v_model_embedding is not None: 15 | self.w2v_model_embedding = tf.cast(w2v_model_embedding, tf.float32) 16 | else: 17 | self.w2v_model_embedding = None 18 | self.vocab_size = vocab_size 19 | self.embedding_size = embedding_size 20 | self.initializer = initializer 21 | self.l2_reg_lambda = l2_reg_lambda 22 | self.l2_loss = tf.constant(0.0) 23 | 24 | self.input_x = tf.placeholder(tf.int32, [None, self.sequence_length], name='input_x') 25 | self.input_y = tf.placeholder(tf.int32, [None, self.num_classes], name='label') 26 | 27 | self.logits = None 28 | 29 | def _initialize_embedding(self): 30 | with tf.name_scope('embedding'): 31 | if self.w2v_model_embedding is None: 32 | self.Embedding = tf.get_variable(name='embedding', 33 | shape=[self.vocab_size, self.embedding_size], 34 | initializer=self.initializer) # [vocab_size, embedding_size] 35 | else: 36 | self.Embedding = tf.get_variable(name='embedding', 37 | initializer=self.w2v_model_embedding, 38 | dtype=tf.float32) 39 | 40 | def _initialize_weights(self): 41 | with tf.name_scope('weights'): 42 | self.W = tf.get_variable(name='W', 43 | shape=[self.embedding_size, self.num_classes], 44 | initializer=self.initializer) 45 | self.b = tf.get_variable(name='b', shape=[self.num_classes]) 46 | 47 | def _inference(self): 48 | sentence_embedding = tf.nn.embedding_lookup(self.Embedding, self.input_x) 49 | 50 | self.sentence_embedding = tf.reduce_mean(sentence_embedding, axis=1) # [None, self.embedding_size] 51 | 52 | with tf.name_scope('output'): 53 | logits = tf.matmul(self.sentence_embedding, self.W) + self.b 54 | return logits 55 | 56 | def _loss(self): 57 | with tf.name_scope('loss'): 58 | self.l2_loss += tf.nn.l2_loss(self.b) 59 | losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y) 60 | loss = tf.reduce_mean(losses) + self.l2_reg_lambda * self.l2_loss 61 | return loss 62 | 63 | def _accuracy(self): 64 | with tf.name_scope('accuracy'): 65 | self.prediction = tf.argmax(self.logits, 1, name='prediction') 66 | correct_predictions = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.input_y, 1)) 67 | accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32), name='accuracy') 68 | return accuracy 69 | 70 | def compile(self, optimizer, loss, metrics=None): 71 | if loss == 'binary_crossentropy': 72 | self.loss= self._loss() 73 | if metrics == 
'accuracy': 74 | self.accuracy = self._accuracy() 75 | grads_and_vars = optimizer.compute_gradients(self.loss) 76 | self.train_op = optimizer.apply_gradients(grads_and_vars) 77 | 78 | def _next_batch(self, train_x, train_y=None, epochs=1, batch_size=None, shuffle=True): 79 | data_size = len(train_x) 80 | num_batches_per_epoch = int(data_size / batch_size) + 1 81 | 82 | for _ in range(epochs): 83 | if shuffle: 84 | shuffle_indices = np.random.permutation(np.arange(data_size)) 85 | shuffled_data = train_x[shuffle_indices] 86 | if train_y is not None: 87 | shuffled_data_y = train_y[shuffle_indices] 88 | else: 89 | shuffled_data = train_x 90 | if train_y is not None: 91 | shuffled_data_y = train_y 92 | 93 | for batch_num in range(num_batches_per_epoch): 94 | start_index = batch_num * batch_size 95 | end_index = min((batch_num + 1) * batch_size, data_size) 96 | 97 | if train_y is None: 98 | yield shuffled_data[start_index:end_index] 99 | else: 100 | yield shuffled_data[start_index:end_index], shuffled_data_y[start_index:end_index] 101 | 102 | def fit(self, x=None, y=None, batch_size=None, epochs=1, verbose=1, valid_x=None, valid_y=None, checkpoint_dir=None): 103 | config = tf.ConfigProto() 104 | config.gpu_options.allow_growth = True 105 | self.sess = tf.Session(config=config) 106 | if checkpoint_dir: 107 | saver = tf.train.Saver(tf.global_variables(), max_to_keep=5) 108 | checkpoint_dir = os.path.join(checkpoint_dir, "checkpoints") 109 | checkpoint_prefix = os.path.join(checkpoint_dir, "model") 110 | 111 | if os.path.exists(checkpoint_dir): 112 | print("Restoring Variables from Checkpoint.") 113 | saver.restore(self.sess, tf.train.latest_checkpoint(checkpoint_dir)) 114 | else: 115 | print('Initializing Variables') 116 | self.sess.run(tf.global_variables_initializer()) 117 | os.makedirs(checkpoint_dir) 118 | else: 119 | self.sess.run(tf.global_variables_initializer()) 120 | 121 | train_step = 0 122 | for epoch in range(epochs): 123 | step = 0 124 | for batch_x, batch_y in self._next_batch(x, y, batch_size=batch_size): 125 | feed_dict = {self.input_x: batch_x, 126 | self.input_y: batch_y, 127 | } 128 | self.sess.run([self.loss, self.accuracy, self.train_op], feed_dict) 129 | train_step += 1 130 | step += 1 131 | 132 | if step % verbose == 0: 133 | feed_dict = {self.input_x: batch_x, 134 | self.input_y: batch_y, 135 | } 136 | train_loss, train_acc = self.sess.run([self.loss, self.accuracy], feed_dict) 137 | 138 | if valid_x is not None: 139 | feed_dict = {self.input_x: valid_x, 140 | self.input_y: valid_y, 141 | } 142 | val_loss, val_acc = self.sess.run([self.loss, self.accuracy], feed_dict) 143 | print('Epoch {}\tBatch {}\tTrain Loss:{:.4f}\tTrain Acc:{:.4f}\tValid Loss:{:.4f}\tValid Acc:{:.4f}'.format( 144 | epoch, step, train_loss, train_acc, val_loss, val_acc)) 145 | else: 146 | print('Epoch {}\tBatch {}\tTrain Loss:{:.4f}\tTrain Acc:{:.4f}'.format(epoch, step, train_loss, train_acc)) 147 | 148 | if checkpoint_dir: 149 | if train_step % 50 == 0: 150 | print("Going to save model..") 151 | saver.save(self.sess, checkpoint_prefix, global_step=train_step) 152 | 153 | def predict(self, x, batch_size=None, verbose=0, checkpoint_dir=None): 154 | predict_scores = [] 155 | if not checkpoint_dir: 156 | sess = self.sess 157 | else: 158 | print('Restore model from checkpoint.') 159 | config = tf.ConfigProto() 160 | config.gpu_options.allow_growth = True 161 | sess = tf.Session(config=config) 162 | saver = tf.train.Saver() 163 | checkpoint_dir = os.path.join(checkpoint_dir, "checkpoints") 164 | 
saver.restore(sess, tf.train.latest_checkpoint(checkpoint_dir)) 165 | 166 | if batch_size is None: 167 | predict_scores = sess.run(self.logits, feed_dict={self.input_x: x}) 168 | else: 169 | for batch_x in self._next_batch(x, batch_size=batch_size): 170 | batch_result = sess.run(self.logits, feed_dict={self.input_x: batch_x}) 171 | predict_scores += batch_result.tolist() 172 | 173 | return np.array(predict_scores) 174 | 175 | -------------------------------------------------------------------------------- /text_classification/models/FastText.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import numpy as np 5 | import tensorflow as tf 6 | 7 | from .BaseModel import TextClassifierBaseModel 8 | 9 | class FastText(TextClassifierBaseModel): 10 | def __init__(self, num_classes, sequence_length, 11 | w2v_model_embedding, vocab_size, embedding_size, 12 | initializer=tf.random_normal_initializer(stddev=0.1), 13 | l2_reg_lambda=0.001): 14 | super(FastText, self).__init__(num_classes=num_classes, sequence_length=sequence_length, 15 | w2v_model_embedding=w2v_model_embedding, vocab_size=vocab_size, embedding_size=embedding_size, 16 | initializer=tf.random_normal_initializer(stddev=0.1), 17 | l2_reg_lambda=0.001) 18 | 19 | self._initialize_embedding() 20 | self._initialize_weights() 21 | self.logits = self._inference() 22 | print(self.logits) 23 | 24 | def _inference(self): 25 | sentence_embedding = tf.nn.embedding_lookup(self.Embedding, self.input_x) 26 | 27 | self.sentence_embedding = tf.reduce_mean(sentence_embedding, axis=1) # [None, self.embedding_size] 28 | 29 | with tf.name_scope('output'): 30 | logits = tf.matmul(self.sentence_embedding, self.W) + self.b 31 | return logits 32 | -------------------------------------------------------------------------------- /text_classification/models/TextBiLSTM.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | from .BaseModel import TextClassifierBaseModel 7 | 8 | class TextBiLSTM(TextClassifierBaseModel): 9 | def __init__(self, num_classes, sequence_length, 10 | w2v_model_embedding, vocab_size, embedding_size, 11 | hidden_num, keep_prob, 12 | initializer=tf.random_normal_initializer(stddev=0.1), 13 | l2_reg_lambda=0.001): 14 | super(TextBiLSTM, self).__init__(num_classes=num_classes, sequence_length=sequence_length, 15 | w2v_model_embedding=w2v_model_embedding, vocab_size=vocab_size, embedding_size=embedding_size, 16 | initializer=tf.random_normal_initializer(stddev=0.1), 17 | l2_reg_lambda=0.001) 18 | 19 | self.hidden_num = hidden_num 20 | self.keep_prob = keep_prob 21 | 22 | self._initialize_embedding() 23 | self._initialize_weights() 24 | self.logits = self._inference() 25 | 26 | def _initialize_weights(self): 27 | with tf.name_scope('weights'): 28 | self.W = tf.get_variable(name='W', 29 | shape=[self.hidden_num * 2, self.num_classes], 30 | initializer=self.initializer) 31 | self.b = tf.get_variable(name='b', shape=[self.num_classes]) 32 | 33 | def _inference(self): 34 | 35 | self.embedding_words = tf.nn.embedding_lookup(self.Embedding, self.input_x) 36 | 37 | rnn_drop = self._bilstm_layer() 38 | 39 | with tf.name_scope('output'): 40 | logits = tf.matmul(rnn_drop, self.W) + self.b 41 | 42 | return logits 43 | 44 | def _bilstm_layer(self): 45 | fw_cell = tf.contrib.rnn.BasicLSTMCell(self.hidden_num, state_is_tuple=True) 46 | bw_cell = 
tf.contrib.rnn.BasicLSTMCell(self.hidden_num, state_is_tuple=True) 47 | 48 | with tf.name_scope("dropout"): 49 | fw_cell = tf.contrib.rnn.DropoutWrapper(fw_cell, output_keep_prob=self.keep_prob) 50 | bw_cell = tf.contrib.rnn.DropoutWrapper(bw_cell, output_keep_prob=self.keep_prob) 51 | 52 | outputs, _ = tf.nn.bidirectional_dynamic_rnn(fw_cell, bw_cell, 53 | inputs=self.embedding_words, 54 | dtype=tf.float32) 55 | outputs = tf.concat(outputs, axis=2) 56 | output = tf.reduce_mean(outputs, axis=1) 57 | 58 | return output 59 | -------------------------------------------------------------------------------- /text_classification/models/TextCNN.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import numpy as np 5 | import tensorflow as tf 6 | 7 | from .BaseModel import TextClassifierBaseModel 8 | 9 | class TextCNN(TextClassifierBaseModel): 10 | def __init__(self, filter_sizes, num_filters, num_classes, sequence_length, 11 | w2v_model_embedding, vocab_size, embedding_size, 12 | keep_prob=0.5, 13 | initializer=tf.random_normal_initializer(stddev=0.1), 14 | l2_reg_lambda=0.001): 15 | super(TextCNN, self).__init__(num_classes=num_classes, sequence_length=sequence_length, 16 | w2v_model_embedding=w2v_model_embedding, vocab_size=vocab_size, embedding_size=embedding_size, 17 | initializer=tf.random_normal_initializer(stddev=0.1), 18 | l2_reg_lambda=0.001) 19 | 20 | self.filter_sizes = filter_sizes 21 | self.num_filters = num_filters 22 | self.num_filters_total = self.num_filters * len(self.filter_sizes) 23 | self.keep_prob = keep_prob 24 | 25 | self._initialize_embedding() 26 | self._initialize_weights() 27 | self.logits = self._inference() 28 | 29 | def _initialize_weights(self): 30 | with tf.name_scope('weights'): 31 | self.W = tf.get_variable(name='W', 32 | shape=[self.num_filters_total, self.num_classes], 33 | initializer=self.initializer) 34 | self.b = tf.get_variable(name='b', shape=[self.num_classes]) 35 | 36 | def _inference(self): 37 | self.embedding_words = tf.nn.embedding_lookup(self.Embedding, self.input_x) # [None, sequence_length, embedding_size] 38 | # [None, sequence_length, embedding_size, 1]. expand dimension so meet input requirement of 2d-conv 39 | self.sentence_embedding_expanded = tf.expand_dims(self.embedding_words, -1) 40 | 41 | conv_out = self._conv_layer() 42 | 43 | with tf.name_scope('output'): 44 | logits = tf.matmul(conv_out, self.W) + self.b 45 | return logits 46 | 47 | def _conv_layer(self): 48 | pooled_outputs = [] 49 | for i, filter_size in enumerate(self.filter_sizes): 50 | with tf.variable_scope('convolution-pooling-{}'.format(i)): 51 | filter = tf.get_variable(name='filter-{}'.format(filter_size), 52 | shape=[filter_size, self.embedding_size, 1, self.num_filters], 53 | initializer=self.initializer,) 54 | # Conv.Input: given an input tensor of shape `[batch, in_height, in_width, in_channels]` and a filter / kernel tensor of shape `[filter_height, filter_width, in_channels, out_channels]` 55 | # Conv.Returns: A `Tensor`. Has the same type as `input`. 
56 | conv = tf.nn.conv2d(self.sentence_embedding_expanded, 57 | filter, 58 | strides=[1, 1, 1, 1], 59 | padding='VALID', 60 | name='conv') 61 | 62 | b = tf.get_variable(name='b-{}'.format(filter_size), shape=[self.num_filters]) 63 | h = tf.nn.relu(tf.nn.bias_add(conv, b), 'relu') 64 | 65 | pooled = tf.nn.max_pool(h, 66 | ksize=[1, self.sequence_length - filter_size + 1, 1, 1], 67 | strides=[1, 1, 1, 1], 68 | padding='VALID', 69 | name='pool') 70 | pooled_outputs.append(pooled) 71 | 72 | h_pool = tf.concat(pooled_outputs, 3) 73 | h_pool_flatten = tf.reshape(h_pool, [-1, self.num_filters_total]) 74 | 75 | with tf.name_scope('dropout'): 76 | h_drop = tf.nn.dropout(h_pool_flatten, self.keep_prob) 77 | 78 | return h_drop 79 | -------------------------------------------------------------------------------- /text_classification/models/TextRCNN.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zspo/NLP-Space/1c9891be0f44cf618d5b7fc1be2b03e12ce7fc24/text_classification/models/TextRCNN.py -------------------------------------------------------------------------------- /text_classification/models/TextRNN.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | from .BaseModel import TextClassifierBaseModel 7 | 8 | class TextRNN(TextClassifierBaseModel): 9 | def __init__(self, num_classes, sequence_length, 10 | w2v_model_embedding, vocab_size, embedding_size, 11 | hidden_num, hidden_size, keep_prob, 12 | initializer=tf.random_normal_initializer(stddev=0.1), 13 | l2_reg_lambda=0.001): 14 | super(TextRNN, self).__init__(num_classes=num_classes, sequence_length=sequence_length, 15 | w2v_model_embedding=w2v_model_embedding, vocab_size=vocab_size, embedding_size=embedding_size, 16 | initializer=tf.random_normal_initializer(stddev=0.1), 17 | l2_reg_lambda=0.001) 18 | 19 | self.hidden_num = hidden_num 20 | self.hidden_size = hidden_size 21 | self.keep_prob = keep_prob 22 | 23 | self._initialize_embedding() 24 | self._initialize_weights() 25 | self.logits = self._inference() 26 | 27 | def _initialize_weights(self): 28 | with tf.name_scope('weights'): 29 | self.W = tf.get_variable(name='W', 30 | shape=[self.hidden_size, self.num_classes], 31 | initializer=self.initializer) 32 | self.b = tf.get_variable(name='b', shape=[self.num_classes]) 33 | 34 | def _inference(self): 35 | 36 | self.embedding_words = tf.nn.embedding_lookup(self.Embedding, self.input_x) 37 | 38 | rnn_drop = self._rnn_layer() 39 | 40 | with tf.name_scope('output'): 41 | logits = tf.matmul(rnn_drop, self.W) + self.b 42 | 43 | return logits 44 | 45 | def _rnn_layer(self): 46 | cells = [] 47 | for _ in range(self.hidden_size): 48 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(self.hidden_num, state_is_tuple=True) 49 | lstm_cell = tf.contrib.rnn.DropoutWrapper(lstm_cell, output_keep_prob=self.keep_prob) 50 | cells.append(lstm_cell) 51 | cell = tf.nn.rnn_cell.MultiRNNCell(cells, state_is_tuple=True) 52 | 53 | outputs, _ = tf.nn.dynamic_rnn(cell, 54 | inputs=self.embedding_words, 55 | dtype=tf.float32) 56 | outputs = tf.concat(outputs, axis=2) 57 | output = tf.reduce_mean(outputs, axis=1) 58 | 59 | return output 60 | -------------------------------------------------------------------------------- /text_classification/models/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zspo/NLP-Space/1c9891be0f44cf618d5b7fc1be2b03e12ce7fc24/text_classification/models/__init__.py -------------------------------------------------------------------------------- /text_classification/online/utils/ckpt2pb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import tensorflow as tf 5 | from tensorflow.python.framework import graph_util 6 | 7 | def freeze_graph(ckpt_model_dir, output_graph): 8 | ''' 9 | :param input_checkpoint: 10 | :param output_graph: PB模型保存路径 11 | :return: 12 | ''' 13 | checkpoint = tf.train.get_checkpoint_state(ckpt_model_dir)#检查目录下ckpt文件状态是否可用 14 | if not checkpoint: 15 | print('dir not') 16 | exit() 17 | input_checkpoint = checkpoint.model_checkpoint_path #得ckpt文件路径 18 | 19 | # 指定输出的节点名称,该节点名称必须是原模型中存在的节点 20 | # 直接用最后输出的节点,可以在tensorboard中查找到,tensorboard只能在linux中使用 21 | output_node_names = "output/add" 22 | saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=True) 23 | graph = tf.get_default_graph() # 获得默认的图 24 | 25 | input_graph_def = tf.get_default_graph().as_graph_def() 26 | 27 | node_names = [n.name for n in input_graph_def.node] 28 | for node in node_names: 29 | print(node) 30 | 31 | with tf.Session() as sess: 32 | saver.restore(sess, input_checkpoint) #恢复图并得到数据 33 | output_graph_def = graph_util.convert_variables_to_constants(sess=sess, # 模型持久化,将变量值固定 34 | input_graph_def=input_graph_def, # 等于:sess.graph_def 35 | output_node_names=output_node_names.split(",")) # 如果有多个输出节点,以逗号隔开 36 | 37 | with tf.gfile.GFile(output_graph, "wb") as f: #保存模型 38 | f.write(output_graph_def.SerializeToString()) #序列化输出 39 | print("%d ops in the final graph." % len(output_graph_def.node)) #得到当前图有几个操作节点 40 | 41 | input_checkpoint='text_classification/examples/model_save/checkpoints/' 42 | out_pb_path='text_classification/online/pb_model/' 43 | freeze_graph(input_checkpoint, out_pb_path) -------------------------------------------------------------------------------- /text_classification/online/utils/ckpt2save.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import sys 5 | import tensorflow as tf 6 | from tensorflow.python.framework import graph_util 7 | 8 | 9 | def freeze_graph(ckpt_model_dir, export_path_base, model_version): 10 | ''' 11 | :param input_checkpoint: 12 | :param output_graph: PB模型保存路径 13 | :return: 14 | ''' 15 | checkpoint = tf.train.get_checkpoint_state(ckpt_model_dir)#检查目录下ckpt文件状态是否可用 16 | if not checkpoint: 17 | print('dir not') 18 | exit() 19 | input_checkpoint = checkpoint.model_checkpoint_path #得ckpt文件路径 20 | 21 | export_path = os.path.join(tf.compat.as_bytes(export_path_base), 22 | tf.compat.as_bytes(str(model_version))) 23 | print('Exporting trained model to', export_path) 24 | builder = tf.saved_model.builder.SavedModelBuilder(export_path) 25 | 26 | saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=True) 27 | 28 | # input_graph_def = tf.get_default_graph().as_graph_def() 29 | # node_names = [n.name for n in input_graph_def.node] 30 | # for node in node_names: 31 | # print(node) 32 | 33 | with tf.Session() as sess: 34 | saver.restore(sess, input_checkpoint) #恢复图并得到数据 35 | input_x = sess.graph.get_tensor_by_name('input_x:0') 36 | output = sess.graph.get_tensor_by_name('output/add:0') 37 | 38 | tensor_info_x = tf.saved_model.utils.build_tensor_info(input_x) # 输入 39 | tensor_info_y = 
tf.saved_model.utils.build_tensor_info(output) # 输出 40 | 41 | prediction_signature = tf.saved_model.signature_def_utils.build_signature_def(inputs={'x': tensor_info_x}, 42 | outputs={'y': tensor_info_y}, 43 | method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME 44 | ) 45 | 46 | legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op') 47 | builder.add_meta_graph_and_variables(sess, 48 | [tf.saved_model.tag_constants.SERVING], 49 | signature_def_map={'predictions': prediction_signature}, 50 | legacy_init_op=legacy_init_op) 51 | 52 | builder.save() 53 | 54 | print('Done exporting!') 55 | 56 | input_checkpoint='text_classification/examples/model_save/checkpoints/' 57 | out_pb_path='text_classification/online/save_model/' 58 | freeze_graph(input_checkpoint, out_pb_path, 1) -------------------------------------------------------------------------------- /text_classification/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zspo/NLP-Space/1c9891be0f44cf618d5b7fc1be2b03e12ce7fc24/text_classification/utils/__init__.py -------------------------------------------------------------------------------- /text_classification/utils/data_helper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import sys 5 | import pickle 6 | import gensim 7 | import numpy as np 8 | import pandas as pd 9 | from sklearn.model_selection import train_test_split 10 | from keras.preprocessing.text import Tokenizer 11 | from keras.preprocessing.sequence import pad_sequences 12 | from keras.utils import to_categorical 13 | 14 | def read_and_process_data(data_path, w2v_model, save_path): 15 | train_data = [] 16 | train_label = [] 17 | label_map = {'Positive': 1, 'Negative': 0} 18 | with open(data_path, 'r', encoding='utf-8') as f: 19 | for index, line in enumerate(f.readlines()): 20 | if index == 0: 21 | continue 22 | line = line.strip().split(',') 23 | if len(line) != 3: 24 | continue 25 | if line[-1] not in label_map: 26 | continue 27 | s_id, content, label = line 28 | train_data.append(content) 29 | train_label.append(label_map[label]) 30 | train_label = to_categorical(train_label, num_classes=2) 31 | 32 | train_x, valid_x, train_y, valid_y = train_test_split(train_data, train_label, test_size=0.15, random_state=2020) 33 | 34 | maxlen = max([len(c.split(' ')) for c in train_data]) 35 | 36 | ## Tokenize the sentences 37 | tokenizer = Tokenizer() 38 | tokenizer.fit_on_texts(train_data) 39 | word2index = tokenizer.word_index 40 | print(len(word2index)) 41 | embedding = generate_embedding(word2index, w2v_model) 42 | 43 | train_x = tokenizer.texts_to_sequences(train_x) 44 | train_x = pad_sequences(train_x, maxlen=maxlen) 45 | 46 | valid_x = tokenizer.texts_to_sequences(valid_x) 47 | valid_x = pad_sequences(valid_x, maxlen=maxlen) 48 | 49 | np.save(save_path + 'train_x.npy', train_x) 50 | np.save(save_path + 'train_y.npy', train_y) 51 | np.save(save_path + 'valid_x.npy', valid_x) 52 | np.save(save_path + 'valid_y.npy', valid_y) 53 | np.save(save_path + 'embedding.npy', embedding) 54 | 55 | pickle.dump(word2index, open(save_path + 'word2index.pkl', 'wb')) 56 | 57 | print('vocab size: {}'.format(len(word2index) + 1)) 58 | 59 | # maxlen = train_x.shape[1] 60 | # vocab_size = len(word2index) + 1 61 | # index2word = {v: k for k, v in word2index.items()} 62 | 63 | def generate_embedding(word2index, w2v_model): 64 | embedding = 
np.zeros((len(word2index) + 1, 200)) 65 | for word, index in word2index.items(): 66 | try: 67 | embedding[index] = w2v_model[word] 68 | except: 69 | continue 70 | return embedding 71 | 72 | def load_w2v_model(w2v_model_path): 73 | return gensim.models.KeyedVectors.load_word2vec_format(w2v_model_path, binary=False) 74 | 75 | def data_process(content): 76 | pass 77 | 78 | def filter_stop_words(content, stop_words): 79 | pass 80 | 81 | def next_batch(train_x, train_y, batch_size, shuffle=True): 82 | data_size = len(train_x) 83 | num_batches_per_epoch = int(data_size / batch_size) + 1 84 | # while True: 85 | if shuffle: 86 | shuffle_indices = np.random.permutation(np.arange(data_size)) 87 | shuffled_data = train_x[shuffle_indices] 88 | shuffled_data_y = train_y[shuffle_indices] 89 | else: 90 | shuffled_data, shuffled_data_y = train_x, train_y 91 | 92 | for batch_num in range(num_batches_per_epoch): 93 | start_index = batch_num * batch_size 94 | end_index = min((batch_num + 1) * batch_size, data_size) 95 | 96 | yield shuffled_data[start_index:end_index], shuffled_data_y[start_index:end_index] 97 | 98 | def load_data(data_path): 99 | train_x = np.load(data_path + 'train_x.npy') 100 | train_y = np.load(data_path + 'train_y.npy') 101 | valid_x = np.load(data_path + 'valid_x.npy') 102 | valid_y = np.load(data_path + 'valid_y.npy') 103 | embedding = np.load(data_path + 'embedding.npy') 104 | 105 | word2index = pickle.load(open(data_path + 'word2index.pkl', 'rb')) 106 | index2word = {v: k for k, v in word2index.items()} 107 | vocab_size = len(word2index) + 1 108 | maxlen = len(train_x[0]) 109 | 110 | return train_x, train_y, valid_x, valid_y, embedding, word2index, index2word, vocab_size, maxlen 111 | 112 | if __name__ == "__main__": 113 | data_path = '../text_data/raw_data/train.csv' 114 | w2v_model_path = '../text_data/w2v_model/text_w2v_model.txt' 115 | w2v_model = load_w2v_model(w2v_model_path) 116 | read_and_process_data(data_path, w2v_model, '../text_data/input_data/') -------------------------------------------------------------------------------- /text_classification/utils/generate_w2v.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import sys 5 | import time 6 | import numpy as np 7 | import gensim 8 | 9 | def load_data(data_path): 10 | train_data = [] 11 | with open(data_path, 'r', encoding='utf-8') as f: 12 | for index, line in enumerate(f.readlines()): 13 | if index == 0: 14 | continue 15 | line = line.strip().split(',') 16 | if len(line) != 3: 17 | continue 18 | content = line[1] 19 | train_data.append(content.strip().split()) 20 | return train_data 21 | 22 | def train_w2v(train_data, model_path): 23 | start_time = time.time() 24 | model = gensim.models.Word2Vec(train_data, size=200, window=5, min_count=0, workers=3, iter=10) 25 | print('train done, time used {:.4f} min.'.format((time.time() - start_time) / 60)) 26 | print(len(model.wv.vocab)) 27 | model.wv.save_word2vec_format(model_path, binary=False) 28 | 29 | if __name__ == "__main__": 30 | train_data = load_data('../text_data/raw_data/train.csv') 31 | print(len(train_data)) 32 | print(train_data[:3]) 33 | 34 | train_w2v(train_data, '../text_data//w2v_model/text_w2v_model.txt') -------------------------------------------------------------------------------- /text_matching/esim/ESIM.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from transformers import 
AutoTokenizer,AutoModel 4 | from model.SoftAttention import SoftmaxAttention 5 | 6 | 7 | ### BERT_ESIM 8 | ### ESIM部分来自 https://github.com/coetaur0/ESIM 项目 9 | class ESIM(nn.Module): 10 | 11 | def __init__(self,bert_path='/data1/zsp/PreTrainModelStorage/self_pretrained_bert_11G/', 12 | in_size = 768, 13 | hidden_size = 300, 14 | out_size = 2, 15 | dropout = 0.5 16 | ): 17 | super().__init__() 18 | self.in_size = in_size 19 | self.hidden_size = hidden_size 20 | self.out_size = out_size 21 | self.dropout = dropout 22 | 23 | ## embedding 24 | self.embedding = AutoModel.from_pretrained(bert_path) 25 | for i,(name,para) in enumerate(self.embedding.named_parameters()): 26 | para.requires_grad = False ## 冻结作为词向量 27 | # print(i,name,para.requires_grad) 28 | 29 | ## ESIM 30 | self.encoding = nn.LSTM(self.in_size,self.hidden_size,bidirectional=True) 31 | self.attention = SoftmaxAttention() 32 | self.projection = nn.Sequential(nn.Linear(4*2*self.hidden_size,self.hidden_size), ## 4表示拼接的4个向量/2表示双向 33 | nn.ReLU()) 34 | self.composition = nn.LSTM(self.hidden_size,self.hidden_size,bidirectional=True) 35 | self.classifer = nn.Sequential(nn.Dropout(p=self.dropout), 36 | nn.Linear(2*4*self.hidden_size, 37 | self.hidden_size), 38 | nn.Tanh(), 39 | nn.Dropout(p=self.dropout), 40 | nn.Linear(self.hidden_size, 41 | self.out_size)) 42 | 43 | def forward(self, 44 | premises, 45 | premise_mask, 46 | premise_seg_ids, 47 | hypotheses, 48 | hypotheses_mask, 49 | hypo_seg_ids, 50 | ): 51 | embedded_premises = self.embedding(premises,premise_mask,premise_seg_ids) 52 | embedded_premises = embedded_premises[0] 53 | # print(embedded_premises) 54 | # print(type(embedded_premises)) 55 | # print(embedded_premises) 56 | embedded_hypotheses = self.embedding(hypotheses,hypotheses_mask,hypo_seg_ids) 57 | embedded_hypotheses = embedded_hypotheses[0] 58 | # print(type(embedded_premises)) 59 | encoded_premises,_ = self.encoding(embedded_premises) 60 | encoded_hypotheses,_ = self.encoding(embedded_hypotheses) 61 | # print(type(encoded_premises)) 62 | # print(len(encoded_premises[0])) 63 | 64 | 65 | attended_premises, attended_hypotheses =\ 66 | self.attention(encoded_premises, premise_mask, 67 | encoded_hypotheses, hypotheses_mask) 68 | 69 | enhanced_premises = torch.cat([encoded_premises, 70 | attended_premises, 71 | encoded_premises - attended_premises, 72 | encoded_premises * attended_premises], 73 | dim=-1) 74 | enhanced_hypotheses = torch.cat([encoded_hypotheses, 75 | attended_hypotheses, 76 | encoded_hypotheses - attended_hypotheses, 77 | encoded_hypotheses * attended_hypotheses], 78 | dim=-1) 79 | projected_premises = self.projection(enhanced_premises) 80 | projected_hypotheses = self.projection(enhanced_hypotheses) 81 | 82 | v_ai,_ = self.composition(projected_premises) 83 | v_bj,_ = self.composition(projected_hypotheses) 84 | 85 | # print(v_ai) 86 | # print('--------') 87 | # print(premise_mask) 88 | # print('--------') 89 | # print(v_ai.size()) 90 | # print(premise_mask.size()) 91 | # print(torch.sum(v_ai * premise_mask.unsqueeze(1).transpose(2, 1), dim=1)) 92 | v_a_avg = torch.sum(v_ai * premise_mask.unsqueeze(1) 93 | .transpose(2, 1), dim=1)\ 94 | / torch.sum(premise_mask, dim=1, keepdim=True) 95 | v_b_avg = torch.sum(v_bj * hypotheses_mask.unsqueeze(1) 96 | .transpose(2, 1), dim=1)\ 97 | / torch.sum(hypotheses_mask, dim=1, keepdim=True) 98 | 99 | # v_a_max, _ = replace_masked(v_ai, premise_mask, -1e7).max(dim=1) 100 | # v_b_max, _ = replace_masked(v_bj, hypotheses_mask, -1e7).max(dim=1) 101 | v_a_max,_ = v_ai.max(dim=1) 
102 | v_b_max,_ = v_bj.max(dim=1) 103 | 104 | v = torch.cat([v_a_avg, v_a_max, v_b_avg, v_b_max], dim=1) 105 | 106 | logits = self.classifer(v) 107 | probabilities = nn.functional.softmax(logits, dim=-1) 108 | 109 | return logits, probabilities 110 | 111 | 112 | if __name__=='__main__': 113 | tokenizer = AutoTokenizer.from_pretrained('/data1/zsp/PreTrainModelStorage/self_pretrained_bert_11G/') 114 | a = tokenizer(['我喜欢北京']) 115 | input_ids = a['input_ids'] 116 | seg_ids = a['token_type_ids'] 117 | atten_mask = a['attention_mask'] 118 | 119 | b = tokenizer(['我爱北京']) 120 | binput_ids = a['input_ids'] 121 | bseg_ids = a['token_type_ids'] 122 | batten_mask = a['attention_mask'] 123 | # seg_ids 124 | # print(type(a)) 125 | # print(a) 126 | emodel = ESIM() 127 | logits,p = emodel(torch.tensor(input_ids),torch.tensor(seg_ids),torch.tensor(atten_mask), 128 | torch.tensor(binput_ids),torch.tensor(bseg_ids),torch.tensor(batten_mask)) 129 | 130 | print(logits) 131 | print(p) 132 | -------------------------------------------------------------------------------- /text_matching/esim/SoftAttention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class SoftmaxAttention(nn.Module): 6 | """ 7 | Attention layer taking premises and hypotheses encoded by an RNN as input 8 | and computing the soft attention between their elements. 9 | 10 | The dot product of the encoded vectors in the premises and hypotheses is 11 | first computed. The softmax of the result is then used in a weighted sum 12 | of the vectors of the premises for each element of the hypotheses, and 13 | conversely for the elements of the premises. 14 | """ 15 | 16 | def forward(self, 17 | premise_batch, 18 | premise_mask, 19 | hypothesis_batch, 20 | hypothesis_mask): 21 | """ 22 | Args: 23 | premise_batch: A batch of sequences of vectors representing the 24 | premises in some NLI task. The batch is assumed to have the 25 | size (batch, sequences, vector_dim). 26 | premise_mask: A mask for the sequences in the premise batch, to 27 | ignore padding data in the sequences during the computation of 28 | the attention. 29 | hypothesis_batch: A batch of sequences of vectors representing the 30 | hypotheses in some NLI task. The batch is assumed to have the 31 | size (batch, sequences, vector_dim). 32 | hypothesis_mask: A mask for the sequences in the hypotheses batch, 33 | to ignore padding data in the sequences during the computation 34 | of the attention. 35 | 36 | Returns: 37 | attended_premises: The sequences of attention vectors for the 38 | premises in the input batch. 39 | attended_hypotheses: The sequences of attention vectors for the 40 | hypotheses in the input batch. 41 | """ 42 | # Dot product between premises and hypotheses in each sequence of 43 | # the batch. 44 | similarity_matrix = premise_batch.bmm(hypothesis_batch.transpose(2, 1) 45 | .contiguous()) 46 | 47 | # Softmax attention weights. 48 | prem_hyp_attn = masked_softmax(similarity_matrix, hypothesis_mask) 49 | hyp_prem_attn = masked_softmax(similarity_matrix.transpose(1, 2) 50 | .contiguous(), 51 | premise_mask) 52 | 53 | # Weighted sums of the hypotheses for the the premises attention, 54 | # and vice-versa for the attention of the hypotheses. 
55 | attended_premises = weighted_sum(hypothesis_batch, 56 | prem_hyp_attn, 57 | premise_mask) 58 | attended_hypotheses = weighted_sum(premise_batch, 59 | hyp_prem_attn, 60 | hypothesis_mask) 61 | 62 | return attended_premises, attended_hypotheses 63 | 64 | 65 | # Code widely inspired from: 66 | # https://github.com/allenai/allennlp/blob/master/allennlp/nn/util.py. 67 | def masked_softmax(tensor, mask): 68 | """ 69 | Apply a masked softmax on the last dimension of a tensor. 70 | The input tensor and mask should be of size (batch, *, sequence_length). ## 这与常规的算法不同啊 71 | 72 | Args: 73 | tensor: The tensor on which the softmax function must be applied along 74 | the last dimension. 75 | mask: A mask of the same size as the tensor with 0s in the positions of 76 | the values that must be masked and 1s everywhere else. 77 | 78 | Returns: 79 | A tensor of the same size as the inputs containing the result of the 80 | softmax. 81 | """ 82 | tensor_shape = tensor.size() 83 | reshaped_tensor = tensor.view(-1, tensor_shape[-1]) 84 | 85 | # Reshape the mask so it matches the size of the input tensor. 86 | while mask.dim() < tensor.dim(): 87 | mask = mask.unsqueeze(1) 88 | mask = mask.expand_as(tensor).contiguous().float() 89 | reshaped_mask = mask.view(-1, mask.size()[-1]) 90 | 91 | result = nn.functional.softmax(reshaped_tensor * reshaped_mask, dim=-1) 92 | result = result * reshaped_mask 93 | # 1e-13 is added to avoid divisions by zero. 94 | result = result / (result.sum(dim=-1, keepdim=True) + 1e-13) 95 | 96 | return result.view(*tensor_shape) 97 | 98 | ### copy from ESIM 99 | # Code widely inspired from: 100 | # https://github.com/allenai/allennlp/blob/master/allennlp/nn/util.py. 101 | def weighted_sum(tensor, weights, mask): 102 | """ 103 | Apply a weighted sum on the vectors along the last dimension of 'tensor', 104 | and mask the vectors in the result with 'mask'. 105 | 106 | Args: 107 | tensor: A tensor of vectors on which a weighted sum must be applied. 108 | weights: The weights to use in the weighted sum. 109 | mask: A mask to apply on the result of the weighted sum. 110 | 111 | Returns: 112 | A new tensor containing the result of the weighted sum after the mask 113 | has been applied on it. 
114 | """ 115 | weighted_sum = weights.bmm(tensor) 116 | 117 | while mask.dim() < weighted_sum.dim(): 118 | mask = mask.unsqueeze(1) 119 | mask = mask.transpose(-1, -2) 120 | mask = mask.expand_as(weighted_sum).contiguous().float() 121 | 122 | return weighted_sum * mask 123 | 124 | 125 | # a = torch.rand((2,2,3)) 126 | # print(a) 127 | # b = torch.ones((2,2)) 128 | # # b[0][0][2] = 0 129 | # b[0][1] = 0 130 | # print(b) 131 | 132 | # atten = SoftmaxAttention() 133 | # ## batch,*,seq_len 134 | # # a = a.transpose(1,2) 135 | # # b = b.transpose() 136 | # t1,t2 = atten(a,b,a,b) 137 | 138 | # print(t1.size()) 139 | # print(t1) 140 | # print(t2.size()) 141 | # print(t2) -------------------------------------------------------------------------------- /text_matching/sentence-bert/SBERT.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from transformers import AutoTokenizer,AutoModel 5 | 6 | class SBERT(nn.Module): 7 | 8 | def __init__(self, 9 | bert_path='/data1/zsp/PreTrainModelStorage/self_pretrained_bert_11G/' 10 | ): 11 | super().__init__() 12 | 13 | ## embedding 14 | self.embedding = AutoModel.from_pretrained(bert_path) 15 | # for i,(name,para) in enumerate(self.embedding.named_parameters()): 16 | # para.requires_grad = False ## 冻结作为词向量 17 | # print(i,name,para.requires_grad) 18 | self.metric = nn.CosineSimilarity(dim=1, eps=1e-6) 19 | self.hidden_size = 768 20 | 21 | self.fc = nn.Linear(self.hidden_size * 3, 2) 22 | 23 | def forward(self, 24 | premises, 25 | premise_mask, 26 | premise_seg_ids, 27 | hypotheses, 28 | hypotheses_mask, 29 | hypo_seg_ids, 30 | inference=False 31 | ): 32 | embedded_premises = self.embedding(premises,premise_mask,premise_seg_ids) 33 | embedded_premises = embedded_premises[0] 34 | 35 | embedded_hypotheses = self.embedding(hypotheses,hypotheses_mask,hypo_seg_ids) 36 | embedded_hypotheses = embedded_hypotheses[0] 37 | 38 | sen_a_len, sen_b_len = (premise_mask != 0).sum(dim=1, keepdim=True), (hypotheses_mask != 0).sum(dim=1, keepdim=True) 39 | sen_a_pooling, sen_b_pooling = embedded_premises.sum(dim=1) / sen_a_len, embedded_hypotheses.sum(dim=1) / sen_b_len 40 | 41 | if inference: 42 | # sen_a_norm = torch.norm(sen_a_pooling, dim=1) 43 | # sen_b_norm = torch.norm(sen_b_pooling, dim=1) 44 | # similarity = (sen_a_pooling * sen_b_pooling).sum(dim=1) / (sen_a_norm * sen_b_norm) 45 | similarity = F.cosine_similarity(sen_a_pooling, sen_b_pooling, dim=1) 46 | return similarity 47 | 48 | hidden = torch.cat([sen_a_pooling, sen_b_pooling, torch.abs(sen_a_pooling - sen_b_pooling)], dim=1) 49 | 50 | return self.fc(hidden) 51 | 52 | 53 | if __name__=='__main__': 54 | tokenizer = AutoTokenizer.from_pretrained('/data1/zsp/PreTrainModelStorage/self_pretrained_bert_11G/') 55 | a = tokenizer(['我喜欢北京']) 56 | input_ids = a['input_ids'] 57 | seg_ids = a['token_type_ids'] 58 | atten_mask = a['attention_mask'] 59 | 60 | b = tokenizer(['另外一个不相关的句子']) 61 | binput_ids = a['input_ids'] 62 | bseg_ids = a['token_type_ids'] 63 | batten_mask = a['attention_mask'] 64 | # seg_ids 65 | # print(type(a)) 66 | # print(a) 67 | emodel = SBERT() 68 | logits = emodel(torch.tensor(input_ids),torch.tensor(atten_mask),torch.tensor(seg_ids), 69 | torch.tensor(binput_ids),torch.tensor(batten_mask),torch.tensor(bseg_ids) 70 | ) 71 | 72 | print(logits) 73 | -------------------------------------------------------------------------------- /tiny_transformer/configuration.py: 
-------------------------------------------------------------------------------- 1 | class ModelConfig(): 2 | 3 | model_type = "transformer" 4 | 5 | def __init__( 6 | self, 7 | vocab_size=20, 8 | max_position_embeddings=20, 9 | encoder_layer_nums=6, 10 | decoder_layer_nums=6, 11 | num_attention_heads=8, 12 | hidden_size=512, 13 | intermediate_size=1024, 14 | hidden_dropout_prob=0.1, 15 | attention_probs_dropout_prob=0.1, 16 | type_vocab_size=2, 17 | initializer_range=0.02, 18 | layer_norm_eps=1e-6, 19 | pad_token_id=0, 20 | **kwargs, 21 | ): 22 | super().__init__(**kwargs) 23 | 24 | self.vocab_size = vocab_size 25 | self.max_position_embeddings = max_position_embeddings 26 | self.encoder_layer_nums = encoder_layer_nums 27 | self.decoder_layer_nums = decoder_layer_nums 28 | self.num_attention_heads = num_attention_heads 29 | self.hidden_size = hidden_size 30 | self.intermediate_size = intermediate_size 31 | self.hidden_dropout_prob = hidden_dropout_prob 32 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 33 | self.type_vocab_size = type_vocab_size 34 | self.initializer_range = initializer_range 35 | self.layer_norm_eps = layer_norm_eps 36 | self.pad_token_id = pad_token_id 37 | -------------------------------------------------------------------------------- /tiny_transformer/dataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import random 4 | 5 | import numpy as np 6 | import torch 7 | from torch.utils.data import Dataset, DataLoader 8 | 9 | from train_data import sentence_pairs, vocab2id, Tokenizer 10 | from utils import subsequent_mask 11 | 12 | tok = Tokenizer(vocab2id=vocab2id) 13 | 14 | class TrainDatasets(Dataset): 15 | def __init__(self) -> None: 16 | self.sentence_pairs = sentence_pairs 17 | self.train_data = [] 18 | for src, tgt in self.sentence_pairs: 19 | self.train_data.append([tok.encode(src), tok.encode(tgt)]) 20 | 21 | def __len__(self): 22 | return len(self.sentence_pairs) 23 | 24 | def __getitem__(self, i): 25 | return self.train_data[i] 26 | 27 | 28 | def make_std_mask(tgt, pad): 29 | "Create a mask to hide padding and future words." 
30 |     tgt_mask = (tgt != pad).unsqueeze(-2)
31 |     tgt_mask = tgt_mask & subsequent_mask(tgt.size(-1)).type_as(
32 |         tgt_mask.data
33 |     )
34 |     return tgt_mask
35 | 
36 | def data_collator(batch, max_length=20):
37 |     src = []
38 |     tgt = []
39 |     s_max_len = max([len(i[0]) for i in batch])
40 |     t_max_len = max([len(i[1]) for i in batch])
41 |     for s, t in batch:
42 |         s = s + [tok.padding_token_id] * (s_max_len - len(s))
43 |         src.append(s)
44 | 
45 |         t = t + [tok.padding_token_id] * (t_max_len - len(t))
46 |         tgt.append(t)
47 | 
48 |     src = torch.LongTensor(src)
49 |     src_mask = (src != tok.padding_token_id).unsqueeze(-2)
50 |     tgt = torch.LongTensor(tgt)
51 |     label = tgt[:, 1:]   # labels must be sliced before the decoder input is trimmed
52 |     tgt = tgt[:, :-1]    # decoder input drops the last token (teacher forcing)
53 |     tgt_mask = make_std_mask(tgt, tok.padding_token_id)
54 | 
55 |     return {
56 |         "src": src,
57 |         "tgt": tgt,
58 |         "src_mask": src_mask,
59 |         "tgt_mask": tgt_mask,
60 |         "label": label
61 |     }
62 | 
63 | 
64 | if __name__ == "__main__":
65 |     train_dataset = TrainDatasets()
66 |     train_dataloader = DataLoader(
67 |         dataset=train_dataset,
68 |         collate_fn=data_collator,
69 |         batch_size=2
70 |     )
71 | 
72 |     for batch in train_dataloader:
73 |         print(batch)
74 |         break
--------------------------------------------------------------------------------
/tiny_transformer/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | import torch.nn.functional as F
4 | import copy
5 | import math
6 | import numpy as np
7 | import pandas as pd
8 | 
9 | from configuration import ModelConfig
10 | 
11 | 
12 | def clones(module, N):
13 |     "Produce N identical layers."
14 |     return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
15 | 
16 | 
17 | """
18 | Six basic building blocks:
19 |     token embedding, positional encoding, multi-head attention, feed-forward network, layer norm, residual connection
20 | Three composite modules:
21 |     encoder, decoder, generator
22 | Transformer = encoder + decoder + generator
23 | """
24 | 
25 | 
26 | class Embedding(nn.Module):
27 |     def __init__(self, vocab_size, d_model) -> None:
28 |         super(Embedding, self).__init__()
29 |         self.embedding = nn.Embedding(vocab_size, d_model)
30 |         self.d_model = d_model
31 | 
32 |     def forward(self, x):
33 |         return self.embedding(x) * math.sqrt(self.d_model)
34 | 
35 | 
36 | class PositionalEncoding(nn.Module):
37 |     def __init__(self, d_model, dropout=0.1, max_len=5000) -> None:
38 |         super(PositionalEncoding, self).__init__()
39 |         self.dropout = nn.Dropout(dropout)
40 | 
41 |         pe = torch.zeros(max_len, d_model)
42 |         position = torch.arange(0, max_len).unsqueeze(1)
43 |         div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
44 | 
45 |         pe[:, 0::2] = torch.sin(position * div_term)
46 |         pe[:, 1::2] = torch.cos(position * div_term)
47 |         pe = pe.unsqueeze(0)
48 |         self.register_buffer('pe', pe)
49 | 
50 |     def forward(self, x):
51 |         x = x + self.pe[:, :x.size(1)].requires_grad_(False)
52 |         return self.dropout(x)
53 | 
54 | 
55 | class LayerNorm(nn.Module):
56 |     def __init__(self, hidden_size, eps=1e-6) -> None:
57 |         super(LayerNorm, self).__init__()
58 |         self.a_2 = nn.Parameter(torch.ones(hidden_size))
59 |         self.b_2 = nn.Parameter(torch.zeros(hidden_size))
60 |         self.eps = eps
61 | 
62 |     def forward(self, x):
63 |         mean = x.mean(-1, keepdim=True)
64 |         std = x.std(-1, keepdim=True)
65 |         return self.a_2 * (x - mean) / (std + self.eps) + self.b_2
66 | 
67 | 
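# Scaled dot-product attention ("Attention Is All You Need", Eq. 1):
#   Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V
# Dividing the logits by sqrt(d_k) keeps their variance roughly independent of the head
# dimension so the softmax does not saturate; masked positions are filled with -1e9 so
# they receive effectively zero attention weight.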
68 | class ScaleDotProductAttention(nn.Module):
69 |     def __init__(self) -> None:
70 |         super(ScaleDotProductAttention, self).__init__()
71 | 
72 |     def forward(self, query, key, value, mask=None, dropout=None):
73 |         d_k = query.size(-1)
74 |         scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
75 |         if mask is not None:
76 |             scores = scores.masked_fill(mask == 0, -1e9)
77 |         p_attn = F.softmax(scores, dim=-1)
78 |         if dropout is not None:
79 |             p_attn = dropout(p_attn)
80 |         return torch.matmul(p_attn, value), p_attn
81 | 
82 | 
83 | class MultiHeadAttention(nn.Module):
84 |     def __init__(self, config: ModelConfig) -> None:
85 |         super(MultiHeadAttention, self).__init__()
86 |         self.d_model = config.hidden_size
87 |         self.h = config.num_attention_heads
88 | 
89 |         assert self.d_model % self.h == 0
90 |         self.d_k = self.d_model // self.h
91 |         self.linears = clones(nn.Linear(self.d_model, self.d_model), 4)  # q, k, v and output projections (must live in __init__ to be trained)
92 |         self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
93 |         self.attention = ScaleDotProductAttention()
94 | 
95 |     def forward(self, query, key, value, mask=None):
96 |         if mask is not None:
97 |             mask = mask.unsqueeze(1)
98 |         nbatches = query.size(0)
99 | 
100 |         # 1) Do all the linear projections in batch from d_model => h x d_k
101 |         query = self.linears[0](query).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
102 |         key = self.linears[1](key).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
103 |         value = self.linears[2](value).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
104 | 
105 |         x, self.attn = self.attention(query, key, value, mask=mask, dropout=self.dropout)
106 | 
107 |         x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k)
108 | 
109 |         del query, key, value
110 | 
111 |         return self.linears[3](x)
112 | 
113 | 
114 | class PositionwiseFeedForward(nn.Module):
115 |     def __init__(self, config: ModelConfig) -> None:
116 |         super(PositionwiseFeedForward, self).__init__()
117 |         self.w_1 = nn.Linear(config.hidden_size, config.intermediate_size)
118 |         self.w_2 = nn.Linear(config.intermediate_size, config.hidden_size)
119 |         self.dropout = nn.Dropout(config.hidden_dropout_prob)
120 | 
121 |     def forward(self, x):
122 |         return self.w_2(self.dropout(F.relu(self.w_1(x))))
123 | 
124 | 
125 | class ResidualConnection(nn.Module):
126 |     def __init__(self, config: ModelConfig) -> None:
127 |         super(ResidualConnection, self).__init__()
128 |         self.norm = LayerNorm(config.hidden_size)
129 |         self.dropout = nn.Dropout(config.hidden_dropout_prob)
130 | 
131 |     def forward(self, x, sublayer):
132 |         return x + self.dropout(sublayer(self.norm(x)))
133 | 
134 | 
135 | class EncoderLayer(nn.Module):
136 |     def __init__(self, config: ModelConfig) -> None:
137 |         super(EncoderLayer, self).__init__()
138 |         self.self_attn = MultiHeadAttention(config)
139 |         self.feed_forward = PositionwiseFeedForward(config)
140 |         self.sublayer = clones(ResidualConnection(config), 2)
141 | 
142 |     def forward(self, x, mask):
143 |         x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
144 |         return self.sublayer[1](x, self.feed_forward)
145 | 
146 | 
147 | class Encoder(nn.Module):
148 |     def __init__(self, config: ModelConfig) -> None:
149 |         super(Encoder, self).__init__()
150 |         self.encoder_layer = EncoderLayer(config)
151 |         self.layers = clones(self.encoder_layer, config.encoder_layer_nums)
152 |         self.norm = LayerNorm(config.hidden_size)
153 | 
154 |     def forward(self, x, mask):
155 |         for layer in self.layers:
156 |             x = layer(x, mask)
157 |         return self.norm(x)
158 | 
159 | 
160 | class DecoderLayer(nn.Module):
161 |     def __init__(self, config: ModelConfig) -> None:
162 |         super(DecoderLayer, self).__init__()
163 |         self.self_attn = MultiHeadAttention(config)
164 |         self.src_attn = MultiHeadAttention(config)
165 |         self.feed_forward = 
PositionwiseFeedForward(config) 166 | self.sublayer = clones(ResidualConnection(config), 3) 167 | 168 | def forward(self, x, memory, src_mask, tgt_mask): 169 | m = memory 170 | x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask)) 171 | x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask)) 172 | return self.sublayer[2](x, self.feed_forward) 173 | 174 | 175 | class Decoder(nn.Module): 176 | def __init__(self, config: ModelConfig) -> None: 177 | super(Decoder, self).__init__() 178 | self.decoder_layer = DecoderLayer(config) 179 | self.layers = clones(self.decoder_layer, config.decoder_layer_nums) 180 | self.norm = LayerNorm(config.hidden_size) 181 | 182 | def forward(self, x, memory, src_mask, tgt_mask): 183 | for layer in self.layers: 184 | x = layer(x, memory, src_mask, tgt_mask) 185 | return self.norm(x) 186 | 187 | 188 | class Generator(nn.Module): 189 | def __init__(self, config: ModelConfig) -> None: 190 | super(Generator, self).__init__() 191 | self.proj = nn.Linear(config.hidden_size, config.vocab_size) 192 | 193 | def forward(self, x): 194 | return F.log_softmax(self.proj(x), dim=-1) 195 | 196 | 197 | class Transformer(nn.Module): 198 | def __init__(self, config) -> None: 199 | super(Transformer, self).__init__() 200 | self.encoder = Encoder(config) 201 | self.decoder = Decoder(config) 202 | self.src_embed = nn.Sequential( 203 | Embedding(config.vocab_size, config.hidden_size), 204 | PositionalEncoding(config.hidden_size, max_len=config.max_position_embeddings) 205 | ) 206 | self.tgt_embed = nn.Sequential( 207 | Embedding(config.vocab_size, config.hidden_size), 208 | PositionalEncoding(config.hidden_size, max_len=config.max_position_embeddings) 209 | ) 210 | self.generator = Generator(config) 211 | 212 | for p in self.parameters(): 213 | if p.dim() > 1: 214 | torch.nn.init.xavier_uniform_(p) 215 | 216 | def forward(self, src, tgt, src_mask, tgt_mask, label=None): 217 | return self.generator(self.decode(self.encode(src, src_mask), src_mask, tgt, tgt_mask)) 218 | 219 | def encode(self, src, src_mask): 220 | return self.encoder(self.src_embed(src), src_mask) 221 | 222 | def decode(self, memory, src_mask, tgt, tgt_mask): 223 | return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask) 224 | -------------------------------------------------------------------------------- /tiny_transformer/test_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from configuration import ModelConfig 4 | from model import Transformer 5 | 6 | from utils import subsequent_mask 7 | 8 | config = ModelConfig(encoder_layer_nums=2, decoder_layer_nums=2) 9 | print(config.__dict__) 10 | 11 | 12 | def build_model(): 13 | model = Transformer(config) 14 | # This was important from their code. 15 | # Initialize parameters with Glorot / fan_avg. 
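# Note: Transformer.__init__ already applies xavier_uniform_ to every parameter with
# dim > 1, so the loop below re-initializes the same weights a second time. This is
# harmless (it mirrors the reference code this file follows) but not strictly required.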
16 |     for p in model.parameters():
17 |         if p.dim() > 1:
18 |             torch.nn.init.xavier_uniform_(p)
19 | 
20 |     return model
21 | 
22 | 
23 | def inference_test():
24 |     test_model = build_model()
25 |     test_model.eval()
26 |     src = torch.LongTensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
27 |     src_mask = torch.ones(1, 1, 10)
28 |     memory = test_model.encode(src, src_mask)
29 |     ys = torch.zeros(1, 1).type_as(src)
30 |     for i in range(9):
31 |         out = test_model.decode(
32 |             memory, src_mask, ys, subsequent_mask(ys.size(1)).type_as(src.data)
33 |         )
34 |         prob = test_model.generator(out[:, -1])
35 |         _, next_word = torch.max(prob, dim=1)
36 |         next_word = next_word.data[0]
37 |         ys = torch.cat(
38 |             [ys, torch.empty(1, 1).type_as(src.data).fill_(next_word)], dim=1
39 |         )
40 |     print("Example Untrained Model Prediction:", ys)
41 | 
42 | 
43 | def run_tests():
44 |     for _ in range(10):
45 |         inference_test()
46 | 
47 | 
48 | run_tests()
49 | 
--------------------------------------------------------------------------------
/tiny_transformer/tests/dataset.py:
--------------------------------------------------------------------------------
1 | import random
2 | 
3 | import numpy as np
4 | import torch
5 | from torch.utils.data import Dataset, DataLoader
6 | 
7 | # Define the vocabularies
8 | words_x = '<PAD>,1,2,3,4,5,6,7,8,9,0,<SOS>,<EOS>,+'
9 | vocab_x = {word: i for i, word in enumerate(words_x.split(','))}
10 | vocab_xr = [k for k, v in vocab_x.items()]  # reverse lookup (id -> token)
11 | 
12 | words_y = '<PAD>,1,2,3,4,5,6,7,8,9,0,<SOS>,<EOS>'
13 | vocab_y = {word: i for i, word in enumerate(words_y.split(','))}
14 | vocab_yr = [k for k, v in vocab_y.items()]  # reverse lookup (id -> token)
15 | 
16 | # Two-number addition dataset
17 | def get_data():
18 |     # digit vocabulary
19 |     words = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
20 | 
21 |     # sampling probability of each digit
22 |     p = np.array([7, 5, 5, 7, 6, 5, 7, 6, 5, 7])
23 |     p = p / p.sum()
24 | 
25 |     # randomly sample n1 digits as s1
26 |     n1 = random.randint(10, 20)
27 |     s1 = np.random.choice(words, size=n1, replace=True, p=p)
28 |     s1 = s1.tolist()
29 | 
30 |     # randomly sample n2 digits as s2
31 |     n2 = random.randint(10, 20)
32 |     s2 = np.random.choice(words, size=n2, replace=True, p=p)
33 |     s2 = s2.tolist()
34 | 
35 |     # x is the character-level "sum expression" of s1 and s2
36 |     x = s1 + ['+'] + s2
37 | 
38 |     # y is the numerical sum of s1 and s2
39 |     y = int(''.join(s1)) + int(''.join(s2))
40 |     y = list(str(y))
41 | 
42 |     # add start / end tokens
43 |     x = ['<SOS>'] + x + ['<EOS>']
44 |     y = ['<SOS>'] + y + ['<EOS>']
45 | 
46 |     # pad to a fixed length
47 |     x = x + ['<PAD>'] * 50
48 |     y = y + ['<PAD>'] * 51
49 |     x = x[:50]
50 |     y = y[:51]
51 | 
52 |     # encode to token ids
53 |     token_x = [vocab_x[i] for i in x]
54 |     token_y = [vocab_y[i] for i in y]
55 | 
56 |     # convert to tensors
57 |     tensor_x = torch.LongTensor(token_x)
58 |     tensor_y = torch.LongTensor(token_y)
59 |     return tensor_x, tensor_y
60 | 
61 | 
62 | def show_data(tensor_x, tensor_y):
63 |     words_x = "".join([vocab_xr[i] for i in tensor_x.tolist()])
64 |     words_y = "".join([vocab_yr[i] for i in tensor_y.tolist()])
65 |     return words_x, words_y
66 | 
67 | 
68 | # Dataset definition
69 | class TwoSumDataset(torch.utils.data.Dataset):
70 |     def __init__(self, size=100000):
71 |         super().__init__()
72 |         self.size = size
73 | 
74 |     def __len__(self):
75 |         return self.size
76 | 
77 |     def __getitem__(self, i):
78 |         return get_data()
79 | 
80 | ds_train = TwoSumDataset(size=100000)
81 | ds_val = TwoSumDataset(size=10000)
82 | 
83 | 
84 | if __name__ == "__main__":
85 | 
86 |     x, y = get_data()
87 |     print(x, y, "\n")
88 |     print(show_data(x, y))
89 | 
90 |     # Data loaders
91 |     dl_train = DataLoader(dataset=ds_train,
92 |                           batch_size=200,
93 |                           drop_last=True,
94 |                           shuffle=True)
95 | 
96 |     dl_val = DataLoader(dataset=ds_val,
97 |                         batch_size=200,
98 |                         drop_last=True,
99 |                         shuffle=False)
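# Note: x is padded/truncated to length 50 and y to 51, so a seq2seq training loop can
# use y[:, :-1] as the decoder input and y[:, 1:] as the labels; both then have length
# 50 and line up with the 50-token source.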
100 | 
101 |     for src, tgt in dl_train:
102 |         print(src.shape)
103 |         print(tgt.shape)
104 |         break
105 | 
--------------------------------------------------------------------------------
/tiny_transformer/train.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import torch
4 | from torch.optim.lr_scheduler import LambdaLR
5 | 
6 | from configuration import ModelConfig
7 | from model import Transformer
8 | 
9 | from trainer import Trainer, TrainerArgs
10 | 
11 | from dataset import tok, TrainDatasets, data_collator
12 | 
13 | config = ModelConfig()
14 | config.vocab_size = tok.vocab_size
15 | print(config.vocab_size)
16 | model = Transformer(config=config)
17 | optimizer = torch.optim.Adam(
18 |     model.parameters(), lr=0.5, betas=(0.9, 0.98), eps=1e-9
19 | )
20 | def rate(step, model_size, factor, warmup):
21 |     """
22 |     We default the step to 1 for the LambdaLR function
23 |     to avoid zero being raised to a negative power.
24 |     """
25 |     if step == 0:
26 |         step = 1
27 |     return factor * (
28 |         model_size ** (-0.5) * min(step ** (-0.5), step * warmup ** (-1.5))
29 |     )
30 | lr_scheduler = LambdaLR(
31 |     optimizer=optimizer,
32 |     lr_lambda=lambda step: rate(
33 |         step, model_size=model.src_embed[0].d_model, factor=1.0, warmup=400
34 |     ),
35 | )
36 | 
37 | args = TrainerArgs()
38 | trainer = Trainer(
39 |     model=model,
40 |     tokenizer=tok,
41 |     args=args,
42 |     data_collator=data_collator,
43 |     train_dataset=TrainDatasets(),
44 |     optimizers=(optimizer, lr_scheduler))
45 | trainer.train()
--------------------------------------------------------------------------------
/tiny_transformer/train_data.py:
--------------------------------------------------------------------------------
1 | # Translation sentence pairs (replaced below by a synthetic digit copy task)
2 | sentence_pairs = [
3 |     ['je pars en vacances pour quelques jours .', 'i m taking a couple of days off .'],
4 |     ['je ne me panique pas .', 'i m not panicking .'],
5 |     ['je recherche un assistant .', 'i am looking for an assistant .'],
6 |     ['je suis loin de chez moi .', 'i m a long way from home .'],
7 |     ['vous etes en retard .', 'you re very late .'],
8 |     ['j ai soif .', 'i am thirsty .'],
9 |     ['je suis fou de vous .', 'i m crazy about you .'],
10 |     ['vous etes vilain .', 'you are naughty .'],
11 |     ['il est vieux et laid .', 'he s old and ugly .'],
12 |     ['je suis terrifiee .', 'i m terrified .'],
13 | ]
14 | 
15 | import numpy as np
16 | test_data_list = [
17 |     " ".join([str(i) for i in list(np.random.randint(10, size=10))])
18 |     for _ in range(20)
19 | ]
20 | sentence_pairs = [
21 |     [i, i] for i in test_data_list
22 | ]
23 | 
24 | all_words = []
25 | for x, y in sentence_pairs:
26 |     all_words.extend(x.split())
27 |     all_words.extend(y.split())
28 | all_words = sorted(list(set(all_words)))
29 | vocab2id = {word: i for i, word in enumerate(all_words)}
30 | id2vocab = {i: word for word, i in vocab2id.items()}
31 | print(vocab2id)
32 | 
33 | class Tokenizer():
34 |     def __init__(self, vocab2id) -> None:
35 |         self.vocab_size = len(vocab2id) + 1
36 |         self.padding_token_id = len(vocab2id)
37 |         self.vocab2id = vocab2id
38 |         self.id2vocab = {i: word for word, i in self.vocab2id.items()}
39 | 
40 |     def encode(self, sentences):
41 |         ids = [self.vocab2id[w] for w in sentences.split()]
42 |         return ids
43 | 
44 |     def decode(self, ids):
45 |         return " ".join([self.id2vocab[i] for i in ids])
46 | 
47 | if __name__ == "__main__":
48 |     tok = Tokenizer(vocab2id=vocab2id)
49 |     s = sentence_pairs[0][0]
50 |     print(s)
51 |     ids = tok.encode(s)
52 |     print(ids)
53 | 
print(tok.decode(ids)) -------------------------------------------------------------------------------- /tiny_transformer/trainer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import copy 4 | import functools 5 | import os 6 | import sys 7 | import random 8 | import time 9 | from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union 10 | 11 | import numpy as np 12 | import torch 13 | import torch.nn as nn 14 | from torch.utils.data import DataLoader, Dataset, IterableDataset, RandomSampler, SequentialSampler 15 | 16 | 17 | 18 | class LabelSmoothingLoss(nn.Module): 19 | "Implement label smoothing." 20 | def __init__(self, size, padding_idx, smoothing=0.0): 21 | super(LabelSmoothingLoss, self).__init__() 22 | self.criterion = nn.KLDivLoss(reduction="sum") 23 | self.padding_idx = padding_idx 24 | self.confidence = 1.0 - smoothing 25 | self.smoothing = smoothing 26 | self.size = size 27 | self.true_dist = None 28 | 29 | def forward(self, x, target): 30 | assert x.size(1) == self.size 31 | true_dist = x.data.clone() 32 | true_dist.fill_(self.smoothing / (self.size - 2)) 33 | true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence) 34 | true_dist[:, self.padding_idx] = 0 35 | mask = torch.nonzero((target.data == self.padding_idx).int()) 36 | if mask.dim() > 0: 37 | true_dist.index_fill_(0, mask.squeeze(), 0.0) 38 | self.true_dist = true_dist 39 | return self.criterion(x, true_dist) 40 | 41 | 42 | class TrainerArgs: 43 | epochs: int = 10 44 | learning_rate: float = 1e-4 45 | train_batch_size: int = 2 46 | eval_batch_size: int = 2 47 | gradient_accumulation_steps: int = 1 48 | evaluation_steps: int = 1000 49 | logging_steps: int = 1 50 | save_steps: int = 1000 51 | device: str = "cuda" if torch.cuda.is_available() else "cpu" 52 | 53 | 54 | class Trainer: 55 | 56 | def __init__( 57 | self, 58 | model, 59 | tokenizer, 60 | args: TrainerArgs, 61 | data_collator = None, 62 | train_dataset: Optional[Union[Dataset, IterableDataset]] = None, 63 | eval_dataset: Optional[Union[Dataset, IterableDataset]] = None, 64 | loss_fn: nn.Module = None, 65 | optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None) 66 | ): 67 | self.model = model 68 | self.args = args 69 | self.data_collator = data_collator 70 | self.train_dataset = train_dataset 71 | self.eval_dataset = eval_dataset 72 | if loss_fn is None: 73 | self.loss_fn = LabelSmoothingLoss(tokenizer.vocab_size, tokenizer.padding_token_id) 74 | else: 75 | self.loss_fn = loss_fn 76 | self.optimizer, self.lr_scheduler = optimizers 77 | 78 | def get_data_loader(self, dataset, batch_size, shuffle=True) -> DataLoader: 79 | dataloader_params = { 80 | "batch_size": batch_size, 81 | "collate_fn": self.data_collator, 82 | "sampler": RandomSampler(dataset) if shuffle else SequentialSampler(dataset), 83 | } 84 | return DataLoader(dataset, **dataloader_params) 85 | 86 | def compute_loss(self, output, tgt): 87 | if self.loss_fn is not None: 88 | return self.loss_fn(output, tgt) 89 | else: 90 | return nn.CrossEntropyLoss()(output, tgt) 91 | 92 | def training_step(self, model, batch): 93 | model.train() 94 | output = model.forward(**batch) 95 | output = output.reshape(-1, output.size(-1)) 96 | labels = batch["label"].reshape(-1) 97 | loss = self.compute_loss(output, labels) 98 | print(loss) 99 | 100 | del batch 101 | torch.cuda.empty_cache() 102 | 103 | loss.backward() 104 | 105 | return loss.detach() / 
self.args.gradient_accumulation_steps
106 | 
107 |     def evaluation_loop(self, model, dataloader):
108 |         model.eval()
109 | 
110 |         total_loss = 0
111 |         for step, batch in enumerate(dataloader):
112 |             with torch.no_grad():
113 |                 output = model.forward(**batch)
114 |                 loss = self.compute_loss(output.reshape(-1, output.size(-1)), batch["label"].reshape(-1))
115 |             total_loss += loss.item()
116 | 
117 |             del batch
118 |             torch.cuda.empty_cache()
119 | 
120 |         return total_loss / len(dataloader)
121 | 
122 |     def train(self):
123 | 
124 |         start_time = time.time()
125 |         tr_loss = torch.tensor(0.0)  # keep on CPU: the model and batches are never moved to args.device
126 |         total_steps = 0
127 |         self.model.zero_grad()
128 | 
129 |         self.train_dataloader = self.get_data_loader(self.train_dataset, self.args.train_batch_size)
130 |         if self.eval_dataset is not None:
131 |             self.eval_dataloader = self.get_data_loader(self.eval_dataset, self.args.eval_batch_size, shuffle=False)
132 | 
133 |         for epoch in range(self.args.epochs):
134 |             for batch in self.train_dataloader:
135 |                 tr_loss_step = self.training_step(self.model, batch)
136 |                 tr_loss += tr_loss_step
137 | 
138 |                 if (total_steps + 1) % self.args.gradient_accumulation_steps == 0:
139 |                     self.optimizer.step()
140 |                     self.optimizer.zero_grad()
141 |                     self.lr_scheduler.step()
142 |                     self.model.zero_grad()
143 | 
144 |                 total_steps += 1
145 | 
146 |                 if total_steps % self.args.logging_steps == 0:
147 |                     train_loss = tr_loss.item() / total_steps
148 |                     print(f"Epoch {epoch} | Steps {total_steps} | Loss: {train_loss} | Time: {time.time() - start_time}")
149 | 
150 |                 if self.eval_dataset and total_steps % self.args.evaluation_steps == 0:
151 |                     eval_loss = self.evaluation_loop(self.model, self.eval_dataloader)
152 |                     print(f"Validation Loss: {eval_loss} | Time: {time.time() - start_time}")
153 | 
--------------------------------------------------------------------------------
/tiny_transformer/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | 
3 | def subsequent_mask(size):
4 |     "Mask out subsequent positions."
5 |     attn_shape = (1, size, size)
6 |     subsequent_mask = torch.triu(torch.ones(attn_shape), diagonal=1).type(
7 |         torch.uint8
8 |     )
9 |     return subsequent_mask == 0
--------------------------------------------------------------------------------
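The masking helpers above are easiest to understand with a concrete tensor in hand. Below is a minimal sketch (not part of the repo) that mirrors make_std_mask from tiny_transformer/dataset.py; it assumes it is run from the tiny_transformer/ directory and that 0 stands in for the padding id (in the real pipeline the id comes from Tokenizer.padding_token_id).

import torch

from utils import subsequent_mask  # tiny_transformer/utils.py

# One 5-token target sequence whose last two positions are padding (id 0 assumed).
tgt = torch.LongTensor([[4, 7, 2, 0, 0]])

pad_mask = (tgt != 0).unsqueeze(-2)              # (1, 1, 5): True where a real token sits
causal = subsequent_mask(tgt.size(-1))           # (1, 5, 5): lower-triangular True matrix
tgt_mask = pad_mask & causal.type_as(pad_mask)   # (1, 5, 5): row i attends to non-pad j <= i

print(tgt_mask.int())
# row 0 -> [1, 0, 0, 0, 0]
# row 2 -> [1, 1, 1, 0, 0]   (sees itself and earlier real tokens, never the padding)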