├── .gitignore
├── 1.Emotional analysis+lstm+pytorch.ipynb
├── 2.使用Bert预训练模型微调中文文本分类.ipynb
├── 3.Pytorch+seq2seq机器翻译模型+attention+英翻中.ipynb
├── Bert手写版本+MLM+NSP.ipynb
├── README.md
└── nmt
├── en-cn
├── cmn.txt
├── dev.txt
├── dev_mini.txt
├── test.txt
├── test_mini.txt
├── train.txt
└── train_mini.txt
└── en-fr
├── _about.txt
└── fra.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
--------------------------------------------------------------------------------
/1.Emotional analysis+lstm+pytorch.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "lstm.ipynb",
7 | "provenance": [],
8 | "collapsed_sections": [],
9 | "authorship_tag": "ABX9TyPPNzC0+L5ZLvET9UfvEjd1",
10 | "include_colab_link": true
11 | },
12 | "kernelspec": {
13 | "name": "python3",
14 | "display_name": "Python 3"
15 | },
16 | "accelerator": "GPU"
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "id": "view-in-github",
23 | "colab_type": "text"
24 | },
25 | "source": [
26 | "
"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "metadata": {
32 | "id": "1X9yjs5_7z4p",
33 | "colab_type": "code",
34 | "colab": {
35 | "base_uri": "https://localhost:8080/",
36 | "height": 800
37 | },
38 | "outputId": "cb78ac54-4c37-4fff-ee10-6780fba2a77c"
39 | },
40 | "source": [
41 | "!pip install torch\n",
42 | "!pip install torchtext\n",
43 | "!python -m spacy download en\n",
44 | "!pip install torchvision\n",
45 | "\n",
46 | "# K80 gpu for 12 hours\n",
47 | "import torch\n",
48 | "from torch import nn, optim\n",
49 | "from torchtext import data, datasets\n",
50 | "\n",
51 | "print('GPU:', torch.cuda.is_available())\n",
52 | "\n",
53 | "torch.manual_seed(123)"
54 | ],
55 | "execution_count": 1,
56 | "outputs": [
57 | {
58 | "output_type": "stream",
59 | "text": [
60 | "Requirement already satisfied: torch in /usr/local/lib/python3.6/dist-packages (1.5.1+cu101)\n",
61 | "Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from torch) (0.16.0)\n",
62 | "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from torch) (1.18.5)\n",
63 | "Requirement already satisfied: torchtext in /usr/local/lib/python3.6/dist-packages (0.3.1)\n",
64 | "Requirement already satisfied: torch in /usr/local/lib/python3.6/dist-packages (from torchtext) (1.5.1+cu101)\n",
65 | "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from torchtext) (2.23.0)\n",
66 | "Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from torchtext) (4.41.1)\n",
67 | "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from torchtext) (1.18.5)\n",
68 | "Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from torch->torchtext) (0.16.0)\n",
69 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->torchtext) (2020.6.20)\n",
70 | "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->torchtext) (2.10)\n",
71 | "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->torchtext) (1.24.3)\n",
72 | "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->torchtext) (3.0.4)\n",
73 | "Requirement already satisfied: en_core_web_sm==2.2.5 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz#egg=en_core_web_sm==2.2.5 in /usr/local/lib/python3.6/dist-packages (2.2.5)\n",
74 | "Requirement already satisfied: spacy>=2.2.2 in /usr/local/lib/python3.6/dist-packages (from en_core_web_sm==2.2.5) (2.2.4)\n",
75 | "Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.18.5)\n",
76 | "Requirement already satisfied: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (0.4.1)\n",
77 | "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (0.7.1)\n",
78 | "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (2.23.0)\n",
79 | "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.0.2)\n",
80 | "Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.0.2)\n",
81 | "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.0.0)\n",
82 | "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (3.0.2)\n",
83 | "Requirement already satisfied: thinc==7.4.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (7.4.0)\n",
84 | "Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (49.1.0)\n",
85 | "Requirement already satisfied: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.1.3)\n",
86 | "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (2.0.3)\n",
87 | "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (4.41.1)\n",
88 | "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (3.0.4)\n",
89 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (2020.6.20)\n",
90 | "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (2.10)\n",
91 | "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (1.24.3)\n",
92 | "Requirement already satisfied: importlib-metadata>=0.20; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->en_core_web_sm==2.2.5) (1.7.0)\n",
93 | "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata>=0.20; python_version < \"3.8\"->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->en_core_web_sm==2.2.5) (3.1.0)\n",
94 | "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n",
95 | "You can now load the model via spacy.load('en_core_web_sm')\n",
96 | "\u001b[38;5;2m✔ Linking successful\u001b[0m\n",
97 | "/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->\n",
98 | "/usr/local/lib/python3.6/dist-packages/spacy/data/en\n",
99 | "You can now load the model via spacy.load('en')\n",
100 | "Requirement already satisfied: torchvision in /usr/local/lib/python3.6/dist-packages (0.6.1+cu101)\n",
101 | "Requirement already satisfied: torch==1.5.1 in /usr/local/lib/python3.6/dist-packages (from torchvision) (1.5.1+cu101)\n",
102 | "Requirement already satisfied: pillow>=4.1.1 in /usr/local/lib/python3.6/dist-packages (from torchvision) (7.0.0)\n",
103 | "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from torchvision) (1.18.5)\n",
104 | "Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from torch==1.5.1->torchvision) (0.16.0)\n",
105 | "GPU: True\n"
106 | ],
107 | "name": "stdout"
108 | },
109 | {
110 | "output_type": "execute_result",
111 | "data": {
112 | "text/plain": [
113 | ""
114 | ]
115 | },
116 | "metadata": {
117 | "tags": []
118 | },
119 | "execution_count": 1
120 | }
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "metadata": {
126 | "id": "4HovvE-c8BeQ",
127 | "colab_type": "code",
128 | "colab": {
129 | "base_uri": "https://localhost:8080/",
130 | "height": 136
131 | },
132 | "outputId": "55007511-700b-400f-890b-afacefedf08f"
133 | },
134 | "source": [
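135 | "# Seed PyTorch's random number generator for reproducibility (torch.manual_seed seeds all devices, not only the CPU)\n",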
136 | "torch.manual_seed(123)\n",
137 | "\n",
138 | "# 两个Field对象定义字段的处理方法(文本字段、标签字段)\n",
139 | "TEXT = data.Field(tokenize='spacy') # 分词\n",
140 | "LABEL = data.LabelField(dtype=torch.float)\n",
141 | "\n",
142 | "# IMDB共50000影评,包含正面和负面两个类别。数据被前面的Field处理\n",
143 | "# 按照(TEXT, LABEL) 分割成 训练集,测试集\n",
144 | "train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)\n",
145 | "\n",
146 | "print('len of train data:', len(train_data)) # 25000\n",
147 | "print('len of test data:', len(test_data)) # 25000\n",
148 | "\n",
149 | "# torchtext.data.Example : 用来表示一个样本,数据+标签\n",
150 | "print(train_data.examples[15].text) # 文本:句子的单词列表\n",
151 | "print(train_data.examples[15].label) # 标签: 积极"
152 | ],
153 | "execution_count": 3,
154 | "outputs": [
155 | {
156 | "output_type": "stream",
157 | "text": [
158 | "downloading aclImdb_v1.tar.gz\n"
159 | ],
160 | "name": "stdout"
161 | },
162 | {
163 | "output_type": "stream",
164 | "text": [
165 | "aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:08<00:00, 10.0MB/s]\n"
166 | ],
167 | "name": "stderr"
168 | },
169 | {
170 | "output_type": "stream",
171 | "text": [
172 | "len of train data: 25000\n",
173 | "len of test data: 25000\n",
174 | "['The', 'movie', 'is', 'a', 'bit', '\"', 'thin', '\"', 'after', 'reading', 'the', 'book', ',', 'but', 'it', \"'s\", 'still', 'one', 'of', 'the', 'greatest', 'movies', 'ever', 'made', '.', 'Sheryl', 'Lee', 'is', 'beautiful', 'and', 'Nick', 'Nolte', 'is', 'really', '\"', 'vonneguty', '\"', '.', 'He', 'makes', 'great', 'job', 'expressing', 'the', 'feelings', 'from', 'the', 'book', 'to', 'the', 'film', '.', 'Not', 'many', 'films', 'engage', 'the', 'feeling', 'of', 'the', 'book', 'as', 'well', 'as', 'Mother', 'Night', 'does', '.']\n",
175 | "pos\n"
176 | ],
177 | "name": "stdout"
178 | }
179 | ]
180 | },
181 | {
182 | "cell_type": "code",
183 | "metadata": {
184 | "id": "HbKax6528BuX",
185 | "colab_type": "code",
186 | "colab": {
187 | "base_uri": "https://localhost:8080/",
188 | "height": 100
189 | },
190 | "outputId": "994a91d4-891f-4082-f9ba-b98026572c22"
191 | },
192 | "source": [
193 | "TEXT.build_vocab(train_data, max_size=10000, vectors='glove.6B.100d')\n",
194 | "LABEL.build_vocab(train_data)\n",
195 | "print(len(TEXT.vocab)) # 10002\n",
196 | "print(TEXT.vocab.itos[:12]) # ['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is', 'in', 'I']\n",
197 | "print(TEXT.vocab.stoi['and']) # 5\n",
198 | "print(LABEL.vocab.stoi) # defaultdict(None, {'neg': 0, 'pos': 1})"
199 | ],
200 | "execution_count": 4,
201 | "outputs": [
202 | {
203 | "output_type": "stream",
204 | "text": [
205 | ".vector_cache/glove.6B.zip: 862MB [06:30, 2.21MB/s] \n",
206 | "100%|█████████▉| 398894/400000 [00:16<00:00, 23519.32it/s]"
207 | ],
208 | "name": "stderr"
209 | },
210 | {
211 | "output_type": "stream",
212 | "text": [
213 | "10002\n",
214 | "['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is', 'in', 'I']\n",
215 | "5\n",
216 | "defaultdict(, {'neg': 0, 'pos': 1})\n"
217 | ],
218 | "name": "stdout"
219 | }
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "metadata": {
225 | "id": "vfv4g5Si8B7I",
226 | "colab_type": "code",
227 | "colab": {}
228 | },
229 | "source": [
230 | "batchsz = 30\n",
231 | "device = torch.device('cuda')\n",
232 | "train_iterator, test_iterator = data.BucketIterator.splits(\n",
233 | " (train_data, test_data),\n",
234 | " batch_size = batchsz,\n",
235 | " device=device\n",
236 | " )"
237 | ],
238 | "execution_count": 18,
239 | "outputs": []
240 | },
241 | {
242 | "cell_type": "code",
243 | "metadata": {
244 | "id": "Ei2LDRY18B4y",
245 | "colab_type": "code",
246 | "colab": {}
247 | },
248 | "source": [
249 | "class RNN(nn.Module):\n",
250 | "\n",
251 | " def __init__(self, vocab_size, embedding_dim, hidden_dim):\n",
252 | " super(RNN, self).__init__()\n",
253 | "\n",
254 | " # [0-10001] => [100]\n",
255 | " # 参数1:embedding个数(单词数), 参数2:embedding的维度(词向量维度)\n",
256 | " self.embedding = nn.Embedding(vocab_size, embedding_dim)\n",
257 | " # [100] => [256]\n",
258 | " # 双向LSTM,所以下面FC层使用 hidden_dim*2\n",
259 | " self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=2,\n",
260 | " bidirectional=True, dropout=0.5) \n",
261 | " # [256*2] => [1]\n",
262 | " self.fc = nn.Linear(hidden_dim*2, 1)\n",
263 | " self.dropout = nn.Dropout(0.5)\n",
264 | "\n",
265 | " def forward(self, x):\n",
266 | " \"\"\"\n",
267 | " x: [seq_len, b] vs [b, 3, 28, 28]\n",
268 | " \"\"\"\n",
269 | " # [seq_len, b] => [seq_len, b, 100]\n",
270 | " embedding = self.dropout(self.embedding(x))\n",
271 | "\n",
272 | " # output: [seq, b, hid_dim*2]\n",
273 | " # hidden/h: [num_layers*2, b, hid_dim]\n",
274 | " # cell/c: [num_layers*2, b, hid_dim]\n",
275 | " output, (hidden, cell) = self.rnn(embedding)\n",
276 | " # [num_layers*2, b, hid_dim] => 2 of [b, hid_dim] => [b, hid_dim*2]\n",
277 | " # Bidirectional: concatenate the last layer's final forward (hidden[-2]) and backward (hidden[-1]) hidden states\n",
278 | " hidden = torch.cat([hidden[-2], hidden[-1]], dim=1)\n",
279 | " # [b, hid_dim*2] => [b, 1]\n",
280 | " hidden = self.dropout(hidden)\n",
281 | " out = self.fc(hidden)\n",
282 | "\n",
283 | " return out"
284 | ],
285 | "execution_count": 6,
286 | "outputs": []
287 | },
288 | {
289 | "cell_type": "code",
290 | "metadata": {
291 | "id": "Hb7WYVEmCYqG",
292 | "colab_type": "code",
293 | "colab": {
294 | "base_uri": "https://localhost:8080/",
295 | "height": 50
296 | },
297 | "outputId": "a2ebe150-3aff-4880-a4bc-06f1ada66856"
298 | },
299 | "source": [
300 | "rnn = RNN(len(TEXT.vocab), 100, 256) # vocab size, embedding dim, LSTM hidden dim\n",
301 | "\n",
302 | "pretrained_embedding = TEXT.vocab.vectors\n",
303 | "print('pretrained_embedding:', pretrained_embedding.shape) # torch.Size([10002, 100])\n",
304 | "\n",
305 | "# 使用预训练过的embedding来替换随机初始化\n",
306 | "rnn.embedding.weight.data.copy_(pretrained_embedding)\n",
307 | "print('embedding layer inited.')"
308 | ],
309 | "execution_count": 19,
310 | "outputs": [
311 | {
312 | "output_type": "stream",
313 | "text": [
314 | "pretrained_embedding: torch.Size([10002, 100])\n",
315 | "embedding layer inited.\n"
316 | ],
317 | "name": "stdout"
318 | }
319 | ]
320 | },
321 | {
322 | "cell_type": "markdown",
323 | "metadata": {
324 | "id": "k_OIamPtJP3L",
325 | "colab_type": "text"
326 | },
327 | "source": [
328 | ""
329 | ]
330 | },
331 | {
332 | "cell_type": "code",
333 | "metadata": {
334 | "id": "rbnpMTyhCYny",
335 | "colab_type": "code",
336 | "colab": {
337 | "base_uri": "https://localhost:8080/",
338 | "height": 116
339 | },
340 | "outputId": "8342c342-2c54-488d-e780-eb882657cdac"
341 | },
342 | "source": [
343 | "optimizer = optim.Adam(rnn.parameters(), lr=1e-3)\n",
344 | "# BCEWithLogitsLoss是针对二分类的CrossEntropy\n",
345 | "criteon = nn.BCEWithLogitsLoss().to(device)\n",
346 | "rnn.to(device)"
347 | ],
348 | "execution_count": 20,
349 | "outputs": [
350 | {
351 | "output_type": "execute_result",
352 | "data": {
353 | "text/plain": [
354 | "RNN(\n",
355 | " (embedding): Embedding(10002, 100)\n",
356 | " (rnn): LSTM(100, 256, num_layers=2, dropout=0.5, bidirectional=True)\n",
357 | " (fc): Linear(in_features=512, out_features=1, bias=True)\n",
358 | " (dropout): Dropout(p=0.5, inplace=False)\n",
359 | ")"
360 | ]
361 | },
362 | "metadata": {
363 | "tags": []
364 | },
365 | "execution_count": 20
366 | }
367 | ]
368 | },
369 | {
370 | "cell_type": "code",
371 | "metadata": {
372 | "id": "4UXwTad6CYlR",
373 | "colab_type": "code",
374 | "colab": {}
375 | },
376 | "source": [
377 | "import numpy as np \n",
378 | "def binary_acc(preds, y):\n",
379 | "\n",
380 | " preds = torch.round(torch.sigmoid(preds))\n",
381 | " correct = torch.eq(preds, y).float()\n",
382 | " acc = correct.sum() / len(correct)\n",
383 | " return acc"
384 | ],
385 | "execution_count": 27,
386 | "outputs": []
387 | },
388 | {
389 | "cell_type": "code",
390 | "metadata": {
391 | "id": "yXMx5OVwCYjL",
392 | "colab_type": "code",
393 | "colab": {}
394 | },
395 | "source": [
396 | "def train(rnn, iterator, optimizer, criteon):\n",
397 | " avg_acc = []\n",
398 | " rnn.train() # 表示进入训练模式\n",
399 | "\n",
400 | " for i, batch in enumerate(iterator):\n",
401 | " # [seq, b] => [b, 1] => [b]\n",
402 | " # batch.text is passed as the argument x of forward() above; squeeze(1) drops the size-1 dim so pred has the same shape as batch.label\n",
403 | " pred = rnn(batch.text).squeeze(1)\n",
404 | "\n",
405 | " loss = criteon(pred, batch.label)\n",
406 | " # 计算每个batch的准确率\n",
407 | " acc = binary_acc(pred, batch.label).item()\n",
408 | " avg_acc.append(acc)\n",
409 | "\n",
410 | " optimizer.zero_grad() # 清零梯度准备计算\n",
411 | " loss.backward() # 反向传播\n",
412 | " optimizer.step() # 更新训练参数\n",
413 | "\n",
414 | " if i % 10 == 0:\n",
415 | " print(i, acc)\n",
416 | "\n",
417 | " avg_acc = np.array(avg_acc).mean()\n",
418 | " print('avg acc:', avg_acc)\n",
419 | "\n"
420 | ],
421 | "execution_count": 22,
422 | "outputs": []
423 | },
424 | {
425 | "cell_type": "code",
426 | "metadata": {
427 | "id": "XRhMMq0rCYg7",
428 | "colab_type": "code",
429 | "colab": {}
430 | },
431 | "source": [
432 | "def evaluate(rnn, iterator, criteon):\n",
433 | " avg_acc = []\n",
434 | " rnn.eval() # 表示进入测试模式\n",
435 | "\n",
436 | " with torch.no_grad():\n",
437 | " for batch in iterator:\n",
438 | " pred = rnn(batch.text).squeeze(1) # [b, 1] => [b]\n",
439 | " loss = criteon(pred, batch.label)\n",
440 | " acc = binary_acc(pred, batch.label).item()\n",
441 | " avg_acc.append(acc)\n",
442 | "\n",
443 | " avg_acc = np.array(avg_acc).mean()\n",
444 | "\n",
445 | " print('test acc:', avg_acc)"
446 | ],
447 | "execution_count": 23,
448 | "outputs": []
449 | },
450 | {
451 | "cell_type": "code",
452 | "metadata": {
453 | "id": "diKF7CfZCYd4",
454 | "colab_type": "code",
455 | "colab": {
456 | "base_uri": "https://localhost:8080/",
457 | "height": 1000
458 | },
459 | "outputId": "8c356ec4-70b9-4a6f-88b9-23172fa0df0b"
460 | },
461 | "source": [
462 | "for epoch in range(10):\n",
463 | " \n",
464 | " train(rnn, train_iterator, optimizer, criteon)\n",
465 | " \n",
466 | " evaluate(rnn, test_iterator, criteon)"
467 | ],
468 | "execution_count": 28,
469 | "outputs": [
470 | {
471 | "output_type": "stream",
472 | "text": [
473 | "0 0.8666667342185974\n",
474 | "10 0.9666666984558105\n",
475 | "20 0.8000000715255737\n",
476 | "30 0.8666667342185974\n",
477 | "40 0.8666667342185974\n",
478 | "50 0.8000000715255737\n",
479 | "60 0.9333333969116211\n",
480 | "70 0.7666667103767395\n",
481 | "80 0.9000000357627869\n",
482 | "90 0.8666667342185974\n",
483 | "100 0.9000000357627869\n",
484 | "110 0.7666667103767395\n",
485 | "120 0.8000000715255737\n",
486 | "130 0.9666666984558105\n",
487 | "140 0.8666667342185974\n",
488 | "150 0.9000000357627869\n",
489 | "160 0.9000000357627869\n",
490 | "170 0.9000000357627869\n",
491 | "180 0.8000000715255737\n",
492 | "190 0.8000000715255737\n",
493 | "200 0.9333333969116211\n",
494 | "210 0.9000000357627869\n",
495 | "220 0.9333333969116211\n",
496 | "230 0.8666667342185974\n",
497 | "240 0.9000000357627869\n",
498 | "250 0.7666667103767395\n",
499 | "260 0.9333333969116211\n",
500 | "270 0.9000000357627869\n",
501 | "280 0.8000000715255737\n",
502 | "290 0.8666667342185974\n",
503 | "300 0.9333333969116211\n",
504 | "310 0.7666667103767395\n",
505 | "320 0.9000000357627869\n",
506 | "330 0.9666666984558105\n",
507 | "340 0.9666666984558105\n",
508 | "350 0.8333333730697632\n",
509 | "360 0.9000000357627869\n",
510 | "370 0.8000000715255737\n",
511 | "380 0.9000000357627869\n",
512 | "390 0.8666667342185974\n",
513 | "400 0.8333333730697632\n",
514 | "410 0.9000000357627869\n",
515 | "420 0.9333333969116211\n",
516 | "430 0.8333333730697632\n",
517 | "440 0.8666667342185974\n",
518 | "450 0.8000000715255737\n",
519 | "460 0.9333333969116211\n",
520 | "470 0.8666667342185974\n",
521 | "480 0.9333333969116211\n",
522 | "490 0.9333333969116211\n",
523 | "500 0.9000000357627869\n",
524 | "510 0.8333333730697632\n",
525 | "520 0.8666667342185974\n",
526 | "530 0.9333333969116211\n",
527 | "540 0.9333333969116211\n",
528 | "550 0.7666667103767395\n",
529 | "560 0.8333333730697632\n",
530 | "570 0.9333333969116211\n",
531 | "580 0.9000000357627869\n",
532 | "590 0.9333333969116211\n",
533 | "600 0.9000000357627869\n",
534 | "610 0.8333333730697632\n",
535 | "620 0.7333333492279053\n",
536 | "630 0.8333333730697632\n",
537 | "640 0.8333333730697632\n",
538 | "650 0.9000000357627869\n",
539 | "660 0.9333333969116211\n",
540 | "670 0.8000000715255737\n",
541 | "680 0.9000000357627869\n",
542 | "690 0.9000000357627869\n",
543 | "700 0.9000000357627869\n",
544 | "710 0.9333333969116211\n",
545 | "720 0.8000000715255737\n",
546 | "730 0.9333333969116211\n",
547 | "740 0.9666666984558105\n",
548 | "750 0.9666666984558105\n",
549 | "760 0.9333333969116211\n",
550 | "770 0.8666667342185974\n",
551 | "780 0.8666667342185974\n",
552 | "790 0.8666667342185974\n",
553 | "800 0.9666666984558105\n",
554 | "810 0.9000000357627869\n",
555 | "820 0.9000000357627869\n",
556 | "830 0.9333333969116211\n",
557 | "avg acc: 0.8855715916454078\n",
558 | "test acc: 0.8775779855051201\n",
559 | "0 0.9000000357627869\n",
560 | "10 0.9666666984558105\n",
561 | "20 0.9000000357627869\n",
562 | "30 0.9000000357627869\n",
563 | "40 0.9666666984558105\n",
564 | "50 0.9666666984558105\n",
565 | "60 0.7666667103767395\n",
566 | "70 0.8666667342185974\n",
567 | "80 0.9333333969116211\n",
568 | "90 0.9000000357627869\n",
569 | "100 0.9333333969116211\n",
570 | "110 0.8666667342185974\n",
571 | "120 0.9000000357627869\n",
572 | "130 0.9000000357627869\n",
573 | "140 0.8666667342185974\n",
574 | "150 0.8333333730697632\n",
575 | "160 0.8333333730697632\n",
576 | "170 0.9333333969116211\n",
577 | "180 0.8333333730697632\n",
578 | "190 0.9000000357627869\n",
579 | "200 0.8666667342185974\n",
580 | "210 1.0\n",
581 | "220 1.0\n",
582 | "230 0.9666666984558105\n",
583 | "240 0.9000000357627869\n",
584 | "250 0.8000000715255737\n",
585 | "260 0.9333333969116211\n",
586 | "270 0.9666666984558105\n",
587 | "280 0.9333333969116211\n",
588 | "290 0.9666666984558105\n",
589 | "300 0.9000000357627869\n",
590 | "310 0.9333333969116211\n",
591 | "320 0.9333333969116211\n",
592 | "330 0.9666666984558105\n",
593 | "340 0.9666666984558105\n",
594 | "350 0.9666666984558105\n",
595 | "360 0.9333333969116211\n",
596 | "370 0.9666666984558105\n",
597 | "380 0.8333333730697632\n",
598 | "390 0.7333333492279053\n",
599 | "400 0.9000000357627869\n",
600 | "410 0.9000000357627869\n",
601 | "420 0.8000000715255737\n",
602 | "430 0.9333333969116211\n",
603 | "440 0.8666667342185974\n",
604 | "450 0.9333333969116211\n",
605 | "460 0.8333333730697632\n",
606 | "470 0.9333333969116211\n",
607 | "480 0.9333333969116211\n",
608 | "490 0.8000000715255737\n",
609 | "500 0.9666666984558105\n",
610 | "510 0.9000000357627869\n",
611 | "520 1.0\n",
612 | "530 0.9666666984558105\n",
613 | "540 1.0\n",
614 | "550 0.9333333969116211\n",
615 | "560 0.9000000357627869\n",
616 | "570 1.0\n",
617 | "580 0.9000000357627869\n",
618 | "590 0.9000000357627869\n",
619 | "600 0.8666667342185974\n",
620 | "610 0.8333333730697632\n",
621 | "620 0.9000000357627869\n",
622 | "630 0.9000000357627869\n",
623 | "640 0.8666667342185974\n",
624 | "650 0.9000000357627869\n",
625 | "660 0.9666666984558105\n",
626 | "670 0.9333333969116211\n",
627 | "680 0.8666667342185974\n",
628 | "690 0.9000000357627869\n",
629 | "700 0.8666667342185974\n",
630 | "710 0.9333333969116211\n",
631 | "720 0.9666666984558105\n",
632 | "730 0.9666666984558105\n",
633 | "740 0.9666666984558105\n",
634 | "750 0.9000000357627869\n",
635 | "760 0.9000000357627869\n",
636 | "770 0.9000000357627869\n",
637 | "780 0.9333333969116211\n",
638 | "790 0.9333333969116211\n",
639 | "800 0.9333333969116211\n",
640 | "810 0.8666667342185974\n",
641 | "820 0.9000000357627869\n",
642 | "830 0.9000000357627869\n",
643 | "avg acc: 0.9071942910873633\n",
644 | "test acc: 0.8886890964542361\n",
645 | "0 0.9333333969116211\n",
646 | "10 0.9333333969116211\n",
647 | "20 0.9666666984558105\n",
648 | "30 0.9333333969116211\n",
649 | "40 0.9333333969116211\n",
650 | "50 0.8666667342185974\n",
651 | "60 1.0\n",
652 | "70 0.8333333730697632\n",
653 | "80 0.9666666984558105\n",
654 | "90 0.9000000357627869\n",
655 | "100 0.9666666984558105\n",
656 | "110 0.9666666984558105\n",
657 | "120 0.9333333969116211\n",
658 | "130 0.9333333969116211\n",
659 | "140 0.9000000357627869\n",
660 | "150 0.9666666984558105\n",
661 | "160 0.8666667342185974\n",
662 | "170 0.9666666984558105\n",
663 | "180 0.9666666984558105\n",
664 | "190 0.9333333969116211\n",
665 | "200 0.9333333969116211\n",
666 | "210 0.8666667342185974\n",
667 | "220 0.9000000357627869\n",
668 | "230 0.8333333730697632\n",
669 | "240 0.9333333969116211\n",
670 | "250 0.8000000715255737\n",
671 | "260 0.8666667342185974\n",
672 | "270 0.9000000357627869\n",
673 | "280 0.9000000357627869\n",
674 | "290 0.9666666984558105\n",
675 | "300 0.9333333969116211\n",
676 | "310 0.9000000357627869\n",
677 | "320 0.9333333969116211\n",
678 | "330 0.9666666984558105\n",
679 | "340 0.9000000357627869\n",
680 | "350 1.0\n",
681 | "360 0.9666666984558105\n",
682 | "370 0.9333333969116211\n",
683 | "380 0.9333333969116211\n",
684 | "390 0.9666666984558105\n",
685 | "400 0.9666666984558105\n",
686 | "410 0.9666666984558105\n",
687 | "420 1.0\n",
688 | "430 0.9000000357627869\n",
689 | "440 1.0\n",
690 | "450 0.9000000357627869\n",
691 | "460 0.9333333969116211\n",
692 | "470 1.0\n",
693 | "480 0.9000000357627869\n",
694 | "490 0.9333333969116211\n",
695 | "500 0.9000000357627869\n",
696 | "510 0.9000000357627869\n",
697 | "520 0.9333333969116211\n",
698 | "530 0.9333333969116211\n",
699 | "540 0.9666666984558105\n",
700 | "550 0.9666666984558105\n",
701 | "560 0.9666666984558105\n",
702 | "570 0.9666666984558105\n",
703 | "580 0.8333333730697632\n",
704 | "590 0.9666666984558105\n",
705 | "600 0.9333333969116211\n",
706 | "610 0.9333333969116211\n",
707 | "620 0.9333333969116211\n",
708 | "630 1.0\n",
709 | "640 0.9000000357627869\n",
710 | "650 0.8666667342185974\n",
711 | "660 0.9333333969116211\n",
712 | "670 0.8666667342185974\n",
713 | "680 0.9666666984558105\n",
714 | "690 0.9333333969116211\n",
715 | "700 1.0\n",
716 | "710 0.9666666984558105\n",
717 | "720 0.9666666984558105\n",
718 | "730 0.9000000357627869\n",
719 | "740 0.9333333969116211\n",
720 | "750 0.9666666984558105\n",
721 | "760 1.0\n",
722 | "770 0.8666667342185974\n",
723 | "780 0.9000000357627869\n",
724 | "790 0.9333333969116211\n",
725 | "800 0.9666666984558105\n",
726 | "810 0.9000000357627869\n",
727 | "820 0.9666666984558105\n",
728 | "830 0.8000000715255737\n",
729 | "avg acc: 0.9266587171337302\n",
730 | "test acc: 0.8872902161068768\n",
731 | "0 0.9333333969116211\n",
732 | "10 1.0\n",
733 | "20 1.0\n",
734 | "30 0.9666666984558105\n",
735 | "40 0.9666666984558105\n",
736 | "50 1.0\n",
737 | "60 0.9333333969116211\n",
738 | "70 0.9666666984558105\n",
739 | "80 0.8666667342185974\n",
740 | "90 0.9666666984558105\n",
741 | "100 0.9333333969116211\n",
742 | "110 0.8666667342185974\n",
743 | "120 0.9333333969116211\n",
744 | "130 0.9000000357627869\n",
745 | "140 0.8333333730697632\n",
746 | "150 0.9666666984558105\n",
747 | "160 0.9666666984558105\n",
748 | "170 0.8666667342185974\n",
749 | "180 0.9666666984558105\n",
750 | "190 0.9666666984558105\n",
751 | "200 0.9333333969116211\n",
752 | "210 0.9333333969116211\n",
753 | "220 0.9666666984558105\n",
754 | "230 0.9666666984558105\n",
755 | "240 0.9000000357627869\n",
756 | "250 1.0\n",
757 | "260 0.9333333969116211\n",
758 | "270 0.9666666984558105\n",
759 | "280 0.9333333969116211\n",
760 | "290 0.9000000357627869\n",
761 | "300 1.0\n",
762 | "310 0.9333333969116211\n",
763 | "320 0.9666666984558105\n",
764 | "330 0.9666666984558105\n",
765 | "340 0.9333333969116211\n",
766 | "350 0.9333333969116211\n",
767 | "360 0.9333333969116211\n",
768 | "370 0.9333333969116211\n",
769 | "380 1.0\n",
770 | "390 1.0\n",
771 | "400 0.9333333969116211\n",
772 | "410 1.0\n",
773 | "420 0.9333333969116211\n",
774 | "430 0.9666666984558105\n",
775 | "440 0.9333333969116211\n",
776 | "450 0.9333333969116211\n",
777 | "460 0.9666666984558105\n",
778 | "470 0.8333333730697632\n",
779 | "480 1.0\n",
780 | "490 0.9333333969116211\n",
781 | "500 0.9666666984558105\n",
782 | "510 0.9000000357627869\n",
783 | "520 0.9000000357627869\n",
784 | "530 1.0\n",
785 | "540 0.9333333969116211\n",
786 | "550 0.9666666984558105\n",
787 | "560 0.9000000357627869\n",
788 | "570 0.9333333969116211\n",
789 | "580 0.9333333969116211\n",
790 | "590 0.9666666984558105\n",
791 | "600 0.8333333730697632\n",
792 | "610 0.9333333969116211\n",
793 | "620 0.8666667342185974\n",
794 | "630 0.9000000357627869\n",
795 | "640 0.9333333969116211\n",
796 | "650 0.9666666984558105\n",
797 | "660 0.9666666984558105\n",
798 | "670 0.9333333969116211\n",
799 | "680 0.9333333969116211\n",
800 | "690 0.9333333969116211\n",
801 | "700 0.9666666984558105\n",
802 | "710 0.9000000357627869\n",
803 | "720 0.9333333969116211\n",
804 | "730 1.0\n",
805 | "740 0.9666666984558105\n",
806 | "750 0.9333333969116211\n",
807 | "760 0.9666666984558105\n",
808 | "770 0.8333333730697632\n",
809 | "780 0.9666666984558105\n",
810 | "790 0.9000000357627869\n",
811 | "800 0.9000000357627869\n",
812 | "810 0.9000000357627869\n",
813 | "820 0.9666666984558105\n",
814 | "830 0.9666666984558105\n",
815 | "avg acc: 0.9356515197445163\n",
816 | "test acc: 0.890008042184569\n",
817 | "0 1.0\n",
818 | "10 1.0\n",
819 | "20 0.9000000357627869\n",
820 | "30 0.8666667342185974\n",
821 | "40 0.9000000357627869\n",
822 | "50 0.9333333969116211\n",
823 | "60 0.9000000357627869\n",
824 | "70 0.9666666984558105\n",
825 | "80 0.8666667342185974\n",
826 | "90 0.9000000357627869\n",
827 | "100 0.9333333969116211\n",
828 | "110 1.0\n",
829 | "120 0.9666666984558105\n",
830 | "130 0.9666666984558105\n",
831 | "140 1.0\n",
832 | "150 0.9333333969116211\n",
833 | "160 0.9333333969116211\n",
834 | "170 0.9333333969116211\n",
835 | "180 1.0\n",
836 | "190 0.9666666984558105\n",
837 | "200 0.9333333969116211\n",
838 | "210 1.0\n",
839 | "220 0.9666666984558105\n",
840 | "230 1.0\n",
841 | "240 0.9333333969116211\n",
842 | "250 0.8333333730697632\n",
843 | "260 0.9666666984558105\n",
844 | "270 0.9333333969116211\n",
845 | "280 0.9000000357627869\n",
846 | "290 1.0\n",
847 | "300 0.9666666984558105\n",
848 | "310 0.9333333969116211\n",
849 | "320 0.9000000357627869\n",
850 | "330 0.9000000357627869\n",
851 | "340 1.0\n",
852 | "350 0.9666666984558105\n",
853 | "360 1.0\n",
854 | "370 0.9666666984558105\n",
855 | "380 0.9000000357627869\n",
856 | "390 0.9666666984558105\n",
857 | "400 0.9666666984558105\n",
858 | "410 0.9333333969116211\n",
859 | "420 0.9000000357627869\n",
860 | "430 1.0\n",
861 | "440 0.9333333969116211\n",
862 | "450 0.9666666984558105\n",
863 | "460 0.9666666984558105\n",
864 | "470 1.0\n",
865 | "480 1.0\n",
866 | "490 0.9666666984558105\n",
867 | "500 1.0\n",
868 | "510 1.0\n",
869 | "520 1.0\n",
870 | "530 1.0\n",
871 | "540 0.8666667342185974\n",
872 | "550 1.0\n",
873 | "560 0.9333333969116211\n",
874 | "570 0.9333333969116211\n",
875 | "580 0.9666666984558105\n",
876 | "590 0.9666666984558105\n",
877 | "600 0.9333333969116211\n",
878 | "610 0.9000000357627869\n",
879 | "620 0.9333333969116211\n",
880 | "630 0.9666666984558105\n",
881 | "640 0.9666666984558105\n",
882 | "650 0.9333333969116211\n",
883 | "660 0.9333333969116211\n",
884 | "670 0.9000000357627869\n",
885 | "680 0.9333333969116211\n",
886 | "690 0.9000000357627869\n",
887 | "700 0.9333333969116211\n",
888 | "710 0.9666666984558105\n",
889 | "720 0.9666666984558105\n",
890 | "730 0.9333333969116211\n",
891 | "740 0.9333333969116211\n",
892 | "750 1.0\n",
893 | "760 0.9666666984558105\n",
894 | "770 0.9333333969116211\n",
895 | "780 0.9333333969116211\n",
896 | "790 0.9000000357627869\n",
897 | "800 1.0\n",
898 | "810 0.9000000357627869\n",
899 | "820 1.0\n",
900 | "830 0.9000000357627869\n",
901 | "avg acc: 0.9450040338136595\n",
902 | "test acc: 0.8848521674422624\n",
903 | "0 1.0\n",
904 | "10 1.0\n",
905 | "20 0.9666666984558105\n",
906 | "30 0.9666666984558105\n",
907 | "40 1.0\n",
908 | "50 1.0\n",
909 | "60 0.9666666984558105\n",
910 | "70 1.0\n",
911 | "80 0.9666666984558105\n",
912 | "100 0.9666666984558105\n",
913 | "110 0.9666666984558105\n",
914 | "120 0.9333333969116211\n",
915 | "130 0.9666666984558105\n",
916 | "140 0.9666666984558105\n",
917 | "150 1.0\n",
918 | "160 0.9666666984558105\n",
919 | "170 1.0\n",
920 | "180 1.0\n",
921 | "190 0.9666666984558105\n",
922 | "200 0.8666667342185974\n",
923 | "210 1.0\n",
924 | "220 0.8666667342185974\n",
925 | "230 0.9666666984558105\n",
926 | "240 0.9333333969116211\n",
927 | "250 0.8333333730697632\n",
928 | "260 0.9666666984558105\n",
929 | "270 0.9666666984558105\n",
930 | "280 0.9000000357627869\n",
931 | "290 0.9666666984558105\n",
932 | "300 0.9666666984558105\n",
933 | "310 0.9333333969116211\n",
934 | "320 1.0\n",
935 | "330 0.9666666984558105\n",
936 | "340 0.9666666984558105\n",
937 | "350 0.9333333969116211\n",
938 | "360 0.9000000357627869\n",
939 | "370 0.8666667342185974\n",
940 | "380 0.9333333969116211\n",
941 | "390 0.8333333730697632\n",
942 | "400 0.9666666984558105\n",
943 | "410 1.0\n",
944 | "420 0.9666666984558105\n",
945 | "430 0.9666666984558105\n",
946 | "440 1.0\n",
947 | "450 0.9666666984558105\n",
948 | "460 0.9333333969116211\n",
949 | "470 1.0\n",
950 | "480 0.9666666984558105\n",
951 | "490 1.0\n",
952 | "500 0.9666666984558105\n",
953 | "510 0.9333333969116211\n",
954 | "520 0.8666667342185974\n",
955 | "530 0.9666666984558105\n",
956 | "540 1.0\n",
957 | "550 1.0\n",
958 | "560 0.9333333969116211\n",
959 | "570 0.9333333969116211\n",
960 | "580 1.0\n",
961 | "590 0.9666666984558105\n",
962 | "600 0.9666666984558105\n",
963 | "610 0.9666666984558105\n",
964 | "620 0.9666666984558105\n",
965 | "630 0.9666666984558105\n",
966 | "640 0.9333333969116211\n",
967 | "650 0.9000000357627869\n",
968 | "660 0.9333333969116211\n",
969 | "670 1.0\n",
970 | "680 0.9333333969116211\n",
971 | "690 0.9666666984558105\n",
972 | "700 0.9333333969116211\n",
973 | "710 1.0\n",
974 | "720 0.9333333969116211\n",
975 | "730 1.0\n",
976 | "740 0.9666666984558105\n",
977 | "750 0.9666666984558105\n",
978 | "760 0.8666667342185974\n",
979 | "770 0.9000000357627869\n",
980 | "780 0.8000000715255737\n",
981 | "790 0.9666666984558105\n",
982 | "800 0.9666666984558105\n",
983 | "810 0.8666667342185974\n",
984 | "820 1.0\n",
985 | "830 0.9666666984558105\n",
986 | "avg acc: 0.9509592677334802\n",
987 | "test acc: 0.8718625588668621\n",
988 | "0 1.0\n",
989 | "10 1.0\n",
990 | "20 0.9666666984558105\n",
991 | "30 0.9333333969116211\n",
992 | "40 1.0\n",
993 | "50 0.9666666984558105\n",
994 | "60 0.9666666984558105\n",
995 | "70 0.9666666984558105\n",
996 | "80 1.0\n",
997 | "90 0.9333333969116211\n",
998 | "100 1.0\n",
999 | "110 0.9666666984558105\n",
1000 | "120 0.9666666984558105\n",
1001 | "130 0.9666666984558105\n",
1002 | "140 0.9666666984558105\n",
1003 | "150 0.9666666984558105\n",
1004 | "160 0.9666666984558105\n",
1005 | "170 1.0\n",
1006 | "180 0.9666666984558105\n",
1007 | "190 0.9000000357627869\n",
1008 | "200 1.0\n",
1009 | "210 1.0\n",
1010 | "220 0.9333333969116211\n",
1011 | "230 1.0\n",
1012 | "240 0.9666666984558105\n",
1013 | "250 1.0\n",
1014 | "260 0.9666666984558105\n",
1015 | "270 0.9666666984558105\n",
1016 | "280 0.9333333969116211\n",
1017 | "290 0.9333333969116211\n",
1018 | "300 0.9666666984558105\n",
1019 | "310 0.9666666984558105\n",
1020 | "320 0.9666666984558105\n",
1021 | "330 0.9333333969116211\n",
1022 | "340 1.0\n",
1023 | "350 0.9333333969116211\n",
1024 | "360 0.9666666984558105\n",
1025 | "370 0.9333333969116211\n",
1026 | "380 0.9666666984558105\n",
1027 | "390 0.9333333969116211\n",
1028 | "400 0.9666666984558105\n",
1029 | "410 0.9666666984558105\n",
1030 | "420 0.9666666984558105\n",
1031 | "430 0.9333333969116211\n",
1032 | "440 0.9333333969116211\n",
1033 | "450 0.9666666984558105\n",
1034 | "460 1.0\n",
1035 | "470 1.0\n",
1036 | "480 0.9666666984558105\n",
1037 | "490 0.9333333969116211\n",
1038 | "500 0.9666666984558105\n",
1039 | "510 0.9333333969116211\n",
1040 | "520 0.9666666984558105\n",
1041 | "530 0.9666666984558105\n",
1042 | "540 1.0\n",
1043 | "550 0.9666666984558105\n",
1044 | "560 0.9333333969116211\n",
1045 | "570 1.0\n",
1046 | "580 0.9666666984558105\n",
1047 | "590 0.9666666984558105\n",
1048 | "600 1.0\n",
1049 | "610 0.9000000357627869\n",
1050 | "620 0.9333333969116211\n",
1051 | "630 0.9333333969116211\n",
1052 | "640 0.9333333969116211\n",
1053 | "650 0.9666666984558105\n",
1054 | "660 0.9000000357627869\n",
1055 | "670 0.9000000357627869\n",
1056 | "680 1.0\n",
1057 | "690 0.9333333969116211\n",
1058 | "700 0.9666666984558105\n",
1059 | "710 0.8000000715255737\n",
1060 | "720 0.9333333969116211\n",
1061 | "730 0.8666667342185974\n",
1062 | "740 0.9333333969116211\n",
1063 | "750 0.9666666984558105\n",
1064 | "760 1.0\n",
1065 | "770 0.9333333969116211\n",
1066 | "780 0.9000000357627869\n",
1067 | "790 0.9666666984558105\n",
1068 | "800 0.9333333969116211\n",
1069 | "810 0.8666667342185974\n",
1070 | "820 0.9000000357627869\n",
1071 | "830 0.9666666984558105\n",
1072 | "avg acc: 0.9605116213111283\n",
1073 | "test acc: 0.8822142779827118\n",
1074 | "0 0.9666666984558105\n",
1075 | "10 0.9666666984558105\n",
1076 | "20 1.0\n",
1077 | "30 0.9666666984558105\n",
1078 | "40 1.0\n",
1079 | "50 0.9666666984558105\n",
1080 | "60 1.0\n",
1081 | "70 0.9000000357627869\n",
1082 | "80 1.0\n",
1083 | "90 0.9666666984558105\n",
1084 | "100 0.9333333969116211\n",
1085 | "110 1.0\n",
1086 | "120 1.0\n",
1087 | "130 0.9666666984558105\n",
1088 | "140 0.9666666984558105\n",
1089 | "150 1.0\n",
1090 | "160 0.9666666984558105\n",
1091 | "170 0.9333333969116211\n",
1092 | "180 0.9666666984558105\n",
1093 | "190 0.9333333969116211\n",
1094 | "200 0.9666666984558105\n",
1095 | "210 1.0\n",
1096 | "220 0.9666666984558105\n",
1097 | "230 1.0\n",
1098 | "240 0.9666666984558105\n",
1099 | "250 1.0\n",
1100 | "260 0.9333333969116211\n",
1101 | "270 0.9666666984558105\n",
1102 | "280 0.9000000357627869\n",
1103 | "290 1.0\n",
1104 | "300 0.9333333969116211\n",
1105 | "310 0.9666666984558105\n",
1106 | "320 0.9666666984558105\n",
1107 | "330 0.9333333969116211\n",
1108 | "340 1.0\n",
1109 | "350 0.9333333969116211\n",
1110 | "360 0.9666666984558105\n",
1111 | "370 1.0\n",
1112 | "380 1.0\n",
1113 | "390 0.9000000357627869\n",
1114 | "400 1.0\n",
1115 | "410 1.0\n",
1116 | "420 1.0\n",
1117 | "430 1.0\n",
1118 | "440 1.0\n",
1119 | "450 0.9666666984558105\n",
1120 | "460 0.9000000357627869\n",
1121 | "470 1.0\n",
1122 | "480 1.0\n",
1123 | "490 0.8666667342185974\n",
1124 | "500 1.0\n",
1125 | "510 1.0\n",
1126 | "520 1.0\n",
1127 | "530 0.9666666984558105\n",
1128 | "540 0.9000000357627869\n",
1129 | "550 1.0\n",
1130 | "560 0.9333333969116211\n",
1131 | "570 0.9666666984558105\n",
1132 | "580 1.0\n",
1133 | "590 0.9666666984558105\n",
1134 | "600 0.9333333969116211\n",
1135 | "610 0.9666666984558105\n",
1136 | "620 0.9666666984558105\n",
1137 | "630 1.0\n",
1138 | "640 0.9000000357627869\n",
1139 | "650 0.9666666984558105\n",
1140 | "660 1.0\n",
1141 | "670 0.9000000357627869\n",
1142 | "680 0.9333333969116211\n",
1143 | "690 1.0\n",
1144 | "700 1.0\n",
1145 | "710 1.0\n",
1146 | "720 0.9666666984558105\n",
1147 | "730 1.0\n",
1148 | "740 1.0\n",
1149 | "750 1.0\n",
1150 | "760 1.0\n",
1151 | "770 0.8666667342185974\n",
1152 | "780 0.9666666984558105\n",
1153 | "790 0.9333333969116211\n",
1154 | "800 0.9666666984558105\n",
1155 | "810 1.0\n",
1156 | "820 1.0\n",
1157 | "830 0.9666666984558105\n",
1158 | "avg acc: 0.9653077817363419\n",
1159 | "test acc: 0.8769784666222634\n",
1160 | "0 1.0\n",
1161 | "10 0.9666666984558105\n",
1162 | "20 1.0\n",
1163 | "30 0.9333333969116211\n",
1164 | "40 1.0\n",
1165 | "50 1.0\n",
1166 | "60 1.0\n",
1167 | "70 1.0\n",
1168 | "80 0.9666666984558105\n",
1169 | "90 0.9333333969116211\n",
1170 | "100 0.9666666984558105\n",
1171 | "110 0.9666666984558105\n",
1172 | "120 1.0\n",
1173 | "130 0.9666666984558105\n",
1174 | "140 1.0\n",
1175 | "150 1.0\n",
1176 | "160 0.9666666984558105\n",
1177 | "170 1.0\n",
1178 | "180 0.9333333969116211\n",
1179 | "190 1.0\n",
1180 | "200 0.9666666984558105\n",
1181 | "210 1.0\n",
1182 | "220 0.8333333730697632\n",
1183 | "230 1.0\n",
1184 | "240 1.0\n",
1185 | "250 0.9666666984558105\n",
1186 | "260 0.9666666984558105\n",
1187 | "270 0.9000000357627869\n",
1188 | "280 0.9666666984558105\n",
1189 | "290 0.9333333969116211\n",
1190 | "300 0.9666666984558105\n",
1191 | "310 0.9666666984558105\n",
1192 | "320 0.9333333969116211\n",
1193 | "330 1.0\n",
1194 | "340 1.0\n",
1195 | "350 0.9333333969116211\n",
1196 | "360 0.9666666984558105\n",
1197 | "370 0.9666666984558105\n",
1198 | "380 0.9666666984558105\n",
1199 | "390 1.0\n",
1200 | "400 0.9333333969116211\n",
1201 | "410 0.9333333969116211\n",
1202 | "420 1.0\n",
1203 | "430 0.9666666984558105\n",
1204 | "440 0.9666666984558105\n",
1205 | "450 0.9333333969116211\n",
1206 | "460 1.0\n",
1207 | "470 0.9666666984558105\n",
1208 | "480 1.0\n",
1209 | "490 1.0\n",
1210 | "500 0.9333333969116211\n",
1211 | "510 0.9666666984558105\n",
1212 | "520 1.0\n",
1213 | "530 0.9333333969116211\n",
1214 | "540 0.9666666984558105\n",
1215 | "550 0.9333333969116211\n",
1216 | "560 0.9333333969116211\n",
1217 | "570 0.9333333969116211\n",
1218 | "580 1.0\n",
1219 | "590 1.0\n",
1220 | "600 0.9333333969116211\n",
1221 | "610 0.9666666984558105\n",
1222 | "620 1.0\n",
1223 | "630 1.0\n",
1224 | "640 1.0\n",
1225 | "650 0.9666666984558105\n",
1226 | "660 1.0\n",
1227 | "670 1.0\n",
1228 | "680 1.0\n",
1229 | "690 0.9333333969116211\n",
1230 | "700 1.0\n",
1231 | "710 0.9333333969116211\n",
1232 | "720 1.0\n",
1233 | "730 1.0\n",
1234 | "740 0.9666666984558105\n",
1235 | "750 0.9000000357627869\n",
1236 | "760 0.9000000357627869\n",
1237 | "770 0.9333333969116211\n",
1238 | "780 0.9666666984558105\n",
1239 | "790 1.0\n",
1240 | "800 1.0\n",
1241 | "810 0.9666666984558105\n",
1242 | "820 0.9666666984558105\n",
1243 | "830 1.0\n",
1244 | "avg acc: 0.9697442299885144\n",
1245 | "test acc: 0.8815348212667506\n",
1246 | "0 0.9666666984558105\n",
1247 | "10 0.9666666984558105\n",
1248 | "20 0.9666666984558105\n",
1249 | "30 0.9666666984558105\n",
1250 | "40 0.9666666984558105\n",
1251 | "50 0.8666667342185974\n",
1252 | "60 1.0\n",
1253 | "70 1.0\n",
1254 | "80 1.0\n",
1255 | "90 1.0\n",
1256 | "100 1.0\n",
1257 | "110 0.9666666984558105\n",
1258 | "120 1.0\n",
1259 | "130 1.0\n",
1260 | "140 1.0\n",
1261 | "150 0.9666666984558105\n",
1262 | "160 1.0\n",
1263 | "170 0.9333333969116211\n",
1264 | "180 1.0\n",
1265 | "190 0.9000000357627869\n",
1266 | "200 1.0\n",
1267 | "210 0.8666667342185974\n",
1268 | "220 1.0\n",
1269 | "230 1.0\n",
1270 | "240 1.0\n",
1271 | "250 0.9000000357627869\n",
1272 | "260 1.0\n",
1273 | "270 1.0\n",
1274 | "280 0.9666666984558105\n",
1275 | "290 0.9666666984558105\n",
1276 | "300 0.9666666984558105\n",
1277 | "310 0.9666666984558105\n",
1278 | "320 0.9666666984558105\n",
1279 | "330 1.0\n",
1280 | "340 1.0\n",
1281 | "350 0.9333333969116211\n",
1282 | "360 0.9666666984558105\n",
1283 | "370 1.0\n",
1284 | "380 0.9666666984558105\n",
1285 | "390 1.0\n",
1286 | "400 0.9666666984558105\n",
1287 | "410 1.0\n",
1288 | "420 1.0\n",
1289 | "430 1.0\n",
1290 | "440 1.0\n",
1291 | "450 1.0\n",
1292 | "460 1.0\n",
1293 | "470 1.0\n",
1294 | "480 0.9666666984558105\n",
1295 | "490 1.0\n",
1296 | "500 1.0\n",
1297 | "510 1.0\n",
1298 | "520 0.9666666984558105\n",
1299 | "530 0.9666666984558105\n",
1300 | "540 0.9666666984558105\n",
1301 | "550 0.9000000357627869\n",
1302 | "560 0.9000000357627869\n",
1303 | "570 0.9666666984558105\n",
1304 | "580 1.0\n",
1305 | "590 0.9666666984558105\n",
1306 | "600 1.0\n",
1307 | "610 0.9666666984558105\n",
1308 | "620 1.0\n",
1309 | "630 0.9666666984558105\n",
1310 | "640 0.9666666984558105\n",
1311 | "650 1.0\n",
1312 | "660 0.9666666984558105\n",
1313 | "670 1.0\n",
1314 | "680 0.9666666984558105\n",
1315 | "690 0.9666666984558105\n",
1316 | "700 0.9666666984558105\n",
1317 | "710 1.0\n",
1318 | "720 0.8666667342185974\n",
1319 | "730 1.0\n",
1320 | "740 0.9666666984558105\n",
1321 | "750 0.9333333969116211\n",
1322 | "760 1.0\n",
1323 | "770 0.9666666984558105\n",
1324 | "780 1.0\n",
1325 | "790 0.9666666984558105\n",
1326 | "800 1.0\n",
1327 | "810 1.0\n",
1328 | "820 1.0\n",
1329 | "830 0.9333333969116211\n",
1330 | "avg acc: 0.9726618941453435\n",
1331 | "test acc: 0.8754996503714463\n"
1332 | ],
1333 | "name": "stdout"
1334 | }
1335 | ]
1336 | },
1337 | {
1338 | "cell_type": "code",
1339 | "metadata": {
1340 | "id": "S6Gzb78B8B1y",
1341 | "colab_type": "code",
1342 | "colab": {}
1343 | },
1344 | "source": [
1345 | "def predice_test(x):\n",
1346 | "\n",
1347 | " preds = torch.round(torch.sigmoid(x))\n",
1348 | " return preds"
1349 | ],
1350 | "execution_count": 38,
1351 | "outputs": []
1352 | },
1353 | {
1354 | "cell_type": "code",
1355 | "metadata": {
1356 | "id": "PXxZTxQY8BzW",
1357 | "colab_type": "code",
1358 | "colab": {
1359 | "base_uri": "https://localhost:8080/",
1360 | "height": 100
1361 | },
1362 | "outputId": "38c7e7d5-3710-47f2-8c44-f59918a77708"
1363 | },
1364 | "source": [
1365 | "for batch in test_iterator:\n",
1366 | " pred = rnn(batch.text).squeeze(1)\n",
1367 | " pred = predice_test(pred)\n",
1368 | " print(pred)\n",
1369 | " print(batch.label)\n",
1370 | " break\n",
1371 | "\n"
1372 | ],
1373 | "execution_count": 39,
1374 | "outputs": [
1375 | {
1376 | "output_type": "stream",
1377 | "text": [
1378 | "tensor([1., 1., 1., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 1.,\n",
1379 | " 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 0., 0.], device='cuda:0',\n",
1380 | " grad_fn=<RoundBackward>)\n",
1381 | "tensor([1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 1.,\n",
1382 | " 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 0., 0.], device='cuda:0')\n"
1383 | ],
1384 | "name": "stdout"
1385 | }
1386 | ]
1387 | },
1388 | {
1389 | "cell_type": "code",
1390 | "metadata": {
1391 | "id": "VT2nZTLjb5nW",
1392 | "colab_type": "code",
1393 | "colab": {}
1394 | },
1395 | "source": [
1396 | ""
1397 | ],
1398 | "execution_count": null,
1399 | "outputs": []
1400 | }
1401 | ]
1402 | }
--------------------------------------------------------------------------------
/2.使用Bert预训练模型微调中文文本分类.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "Pytorch-使用Bert预训练模型微调中文文本分类.ipynb",
7 | "provenance": [],
8 | "collapsed_sections": [],
9 | "authorship_tag": "ABX9TyP5eyOdl/S2VQKOeAW1zZ87",
10 | "include_colab_link": true
11 | },
12 | "kernelspec": {
13 | "name": "python3",
14 | "display_name": "Python 3"
15 | },
16 | "accelerator": "GPU"
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "id": "view-in-github",
23 | "colab_type": "text"
24 | },
25 | "source": [
26 | "
"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {
32 | "id": "S9q5Kcu1yQPY",
33 | "colab_type": "text"
34 | },
35 | "source": [
36 | "语料链接:https://pan.baidu.com/s/1YxGGYmeByuAlRdAVov_ZLg\n",
37 | "提取码:tzao\n",
38 | "\n",
39 | "neg.txt和pos.txt各5000条酒店评论,每条评论一行。"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "metadata": {
45 | "id": "SoH9VrHLyFiY",
46 | "colab_type": "code",
47 | "colab": {
48 | "base_uri": "https://localhost:8080/",
49 | "height": 359
50 | },
51 | "outputId": "b10075de-6ae3-4620-ab37-19de8bff3254"
52 | },
53 | "source": [
54 | "!pip install transformers"
55 | ],
56 | "execution_count": 26,
57 | "outputs": [
58 | {
59 | "output_type": "stream",
60 | "text": [
61 | "Requirement already satisfied: transformers in /usr/local/lib/python3.6/dist-packages (3.0.2)\n",
62 | "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.6/dist-packages (from transformers) (4.41.1)\n",
63 | "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.6/dist-packages (from transformers) (2019.12.20)\n",
64 | "Requirement already satisfied: sacremoses in /usr/local/lib/python3.6/dist-packages (from transformers) (0.0.43)\n",
65 | "Requirement already satisfied: tokenizers==0.8.1.rc1 in /usr/local/lib/python3.6/dist-packages (from transformers) (0.8.1rc1)\n",
66 | "Requirement already satisfied: filelock in /usr/local/lib/python3.6/dist-packages (from transformers) (3.0.12)\n",
67 | "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from transformers) (2.23.0)\n",
68 | "Requirement already satisfied: packaging in /usr/local/lib/python3.6/dist-packages (from transformers) (20.4)\n",
69 | "Requirement already satisfied: dataclasses; python_version < \"3.7\" in /usr/local/lib/python3.6/dist-packages (from transformers) (0.7)\n",
70 | "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from transformers) (1.18.5)\n",
71 | "Requirement already satisfied: sentencepiece!=0.1.92 in /usr/local/lib/python3.6/dist-packages (from transformers) (0.1.91)\n",
72 | "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (1.15.0)\n",
73 | "Requirement already satisfied: click in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (7.1.2)\n",
74 | "Requirement already satisfied: joblib in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (0.16.0)\n",
75 | "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (1.24.3)\n",
76 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2020.6.20)\n",
77 | "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2.10)\n",
78 | "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (3.0.4)\n",
79 | "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from packaging->transformers) (2.4.7)\n"
80 | ],
81 | "name": "stdout"
82 | }
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "metadata": {
88 | "id": "ry0PkCK2yerw",
89 | "colab_type": "code",
90 | "colab": {
91 | "base_uri": "https://localhost:8080/",
92 | "height": 35
93 | },
94 | "outputId": "c424b73f-0b46-4cf1-ad81-60ff1732111a"
95 | },
96 | "source": [
97 | "import numpy as np\n",
98 | "import random\n",
99 | "import torch\n",
100 | "import matplotlib.pylab as plt \n",
101 | "from torch.nn.utils import clip_grad_norm_\n",
102 | "from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler\n",
103 | "from transformers import BertTokenizer, BertForSequenceClassification, AdamW\n",
104 | "from transformers import get_linear_schedule_with_warmup\n",
105 | "\n",
106 | "SEED = 123\n",
107 | "BATCH_SIZE = 16\n",
108 | "learning_rate = 2e-5\n",
109 | "weight_decay = 1e-2 # 0.01\n",
110 | "epsilon = 1e-8\n",
111 | "\n",
112 | "random.seed(SEED)\n",
113 | "np.random.seed(SEED)\n",
114 | "torch.manual_seed(SEED)"
115 | ],
116 | "execution_count": 27,
117 | "outputs": [
118 | {
119 | "output_type": "execute_result",
120 | "data": {
121 | "text/plain": [
122 | ""
123 | ]
124 | },
125 | "metadata": {
126 | "tags": []
127 | },
128 | "execution_count": 27
129 | }
130 | ]
131 | },
132 | {
133 | "cell_type": "markdown",
134 | "metadata": {
135 | "id": "6gp4_kgv_MEb",
136 | "colab_type": "text"
137 | },
138 | "source": [
139 | "# 1. 数据预处理\n",
140 | "\n",
141 | "## 1.1 读取文件\n"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "metadata": {
147 | "id": "q6-4feGA2OtC",
148 | "colab_type": "code",
149 | "colab": {}
150 | },
151 | "source": [
152 | "def readFile(filename):\n",
153 | " with open(filename, encoding='utf-8') as f:\n",
154 | " content = f.readlines()\n",
155 | " return content\n",
156 | "\n",
157 | "pos_text, neg_text = readFile('./sample_data/pos.txt'), readFile('./sample_data/neg.txt')\n",
158 | "sentences = pos_text + neg_text\n",
159 | "\n",
160 | "# 设定标签\n",
161 | "pos_targets = np.ones([len(pos_text)]) # (5000, )\n",
162 | "neg_targets = np.zeros([len(neg_text)]) # (5000, )\n",
163 | "targets = np.concatenate((pos_targets, neg_targets), axis=0).reshape(-1, 1) # (10000, 1)\n",
164 | "total_targets = torch.tensor(targets)"
165 | ],
166 | "execution_count": 28,
167 | "outputs": []
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {
172 | "id": "IqtLQtD5_pXN",
173 | "colab_type": "text"
174 | },
175 | "source": [
176 | "## 1.2 BertTokenizer进行编码,将每一句转成数字"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "metadata": {
182 | "id": "OViCya5Q2Q2j",
183 | "colab_type": "code",
184 | "colab": {
185 | "base_uri": "https://localhost:8080/",
186 | "height": 107
187 | },
188 | "outputId": "40548311-d5fa-4824-c1fd-d6fe0350fefa"
189 | },
190 | "source": [
191 | "model_name = 'bert-base-chinese'\n",
192 | "cache_dir = './sample_data/'\n",
193 | "\n",
194 | "tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir=cache_dir)\n",
195 | "print(pos_text[2])\n",
196 | "print(tokenizer.tokenize(pos_text[2]))\n",
197 | "print(tokenizer.encode(pos_text[2]))\n",
198 | "print(tokenizer.convert_ids_to_tokens(tokenizer.encode(pos_text[2])))"
199 | ],
200 | "execution_count": 29,
201 | "outputs": [
202 | {
203 | "output_type": "stream",
204 | "text": [
205 | "不错,下次还考虑入住。交通也方便,在餐厅吃的也不错。\n",
206 | "\n",
207 | "['不', '错', ',', '下', '次', '还', '考', '虑', '入', '住', '。', '交', '通', '也', '方', '便', ',', '在', '餐', '厅', '吃', '的', '也', '不', '错', '。']\n",
208 | "[101, 679, 7231, 8024, 678, 3613, 6820, 5440, 5991, 1057, 857, 511, 769, 6858, 738, 3175, 912, 8024, 1762, 7623, 1324, 1391, 4638, 738, 679, 7231, 511, 102]\n",
209 | "['[CLS]', '不', '错', ',', '下', '次', '还', '考', '虑', '入', '住', '。', '交', '通', '也', '方', '便', ',', '在', '餐', '厅', '吃', '的', '也', '不', '错', '。', '[SEP]']\n"
210 | ],
211 | "name": "stdout"
212 | }
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "metadata": {
218 | "id": "-5ExQ8ji2Q8f",
219 | "colab_type": "code",
220 | "colab": {
221 | "base_uri": "https://localhost:8080/",
222 | "height": 35
223 | },
224 | "outputId": "20255e57-078d-4d58-a707-f79300d8a0ac"
225 | },
226 | "source": [
227 | "# 将每一句转成数字 (大于126做截断,小于126做 Padding,加上首尾两个标识 [CLS] 和 [SEP],长度总共等于128)\n",
228 | "def convert_text_to_token(tokenizer, sentence, limit_size = 126):\n",
229 | " tokens = tokenizer.encode(sentence[:limit_size]) # 直接截断\n",
230 | " if len(tokens) < limit_size + 2: # 补齐(pad的索引号就是0)\n",
231 | " tokens.extend([0] * (limit_size + 2 - len(tokens)))\n",
232 | " return tokens\n",
233 | "\n",
234 | "input_ids = [convert_text_to_token(tokenizer, sen) for sen in sentences]\n",
235 | "\n",
236 | "input_tokens = torch.tensor(input_ids)\n",
237 | "print(input_tokens.shape)"
238 | ],
239 | "execution_count": 30,
240 | "outputs": [
241 | {
242 | "output_type": "stream",
243 | "text": [
244 | "torch.Size([10000, 128])\n"
245 | ],
246 | "name": "stdout"
247 | }
248 | ]
249 | },
250 | {
251 | "cell_type": "markdown",
252 | "metadata": {
253 | "id": "qeYRZQqRTVLS",
254 | "colab_type": "text"
255 | },
256 | "source": [
257 | "## 1.3 attention_masks, 在一个文本中,如果是PAD符号则是0,否则就是1"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "metadata": {
263 | "id": "xX2hM7hW2RFp",
264 | "colab_type": "code",
265 | "colab": {
266 | "base_uri": "https://localhost:8080/",
267 | "height": 35
268 | },
269 | "outputId": "8d6b2a04-bf63-4c11-fa7a-ea14f842dc11"
270 | },
271 | "source": [
272 | "# 建立mask\n",
273 | "def attention_masks(input_ids):\n",
274 | " atten_masks = []\n",
275 | " for seq in input_ids: # [10000, 128]\n",
276 | " seq_mask = [float(i > 0) for i in seq] # PAD: 0; 否则: 1\n",
277 | " atten_masks.append(seq_mask)\n",
278 | " return atten_masks\n",
279 | "\n",
280 | "atten_masks = attention_masks(input_ids)\n",
281 | "attention_tokens = torch.tensor(atten_masks)\n",
282 | "print(attention_tokens.shape)"
283 | ],
284 | "execution_count": 31,
285 | "outputs": [
286 | {
287 | "output_type": "stream",
288 | "text": [
289 | "torch.Size([10000, 128])\n"
290 | ],
291 | "name": "stdout"
292 | }
293 | ]
294 | },
295 | {
296 | "cell_type": "markdown",
297 | "metadata": {
298 | "id": "-yc1nwTdXC50",
299 | "colab_type": "text"
300 | },
301 | "source": [
302 | "- 构造input_ids和atten_masks的目的和前面一节中提到的.encode_plus函数返回的input_ids和attention_mask一样\n",
303 | "\n",
304 | "- token_type_ids和本次任务无关,它用于每条训练样本包含两个句子的任务(如问答任务)。"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "metadata": {
310 | "id": "ottvMKol2RLE",
311 | "colab_type": "code",
312 | "colab": {
313 | "base_uri": "https://localhost:8080/",
314 | "height": 431
315 | },
316 | "outputId": "fcec2cae-960b-4b63-d8fa-3720bf5ac75d"
317 | },
318 | "source": [
319 | "from sklearn.model_selection import train_test_split\n",
320 | "\n",
321 | "train_inputs, test_inputs, train_labels, test_labels = train_test_split(input_tokens, total_targets, \n",
322 | " random_state=666, test_size=0.2)\n",
323 | "train_masks, test_masks, _, _ = train_test_split(attention_tokens, input_tokens, \n",
324 | " random_state=666, test_size=0.2)\n",
325 | "print(train_inputs.shape, test_inputs.shape) # torch.Size([8000, 128]) torch.Size([2000, 128])\n",
326 | "print(train_masks.shape) # torch.Size([8000, 128]) 和 train_inputs形状一样\n",
327 | "\n",
328 | "print(train_inputs[0])\n",
329 | "print(train_masks[0])"
330 | ],
331 | "execution_count": 32,
332 | "outputs": [
333 | {
334 | "output_type": "stream",
335 | "text": [
336 | "torch.Size([8000, 128]) torch.Size([2000, 128])\n",
337 | "torch.Size([8000, 128])\n",
338 | "tensor([ 101, 2769, 6370, 4638, 3221, 10189, 1039, 4638, 117, 852,\n",
339 | " 2769, 6230, 2533, 8821, 1039, 4638, 7599, 3419, 3291, 1962,\n",
340 | " 671, 763, 117, 3300, 671, 2476, 1377, 809, 1288, 1309,\n",
341 | " 4638, 3763, 1355, 119, 2456, 6379, 1920, 2157, 6370, 3249,\n",
342 | " 6858, 7313, 106, 102, 0, 0, 0, 0, 0, 0,\n",
343 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
344 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
345 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
346 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
347 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
348 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
349 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
350 | " 0, 0, 0, 0, 0, 0, 0, 0])\n",
351 | "tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
352 | " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
353 | " 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
354 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
355 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
356 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
357 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
358 | " 0., 0.])\n"
359 | ],
360 | "name": "stdout"
361 | }
362 | ]
363 | },
364 | {
365 | "cell_type": "markdown",
366 | "metadata": {
367 | "id": "A4w6AQZvgzce",
368 | "colab_type": "text"
369 | },
370 | "source": [
371 | "## 1.4 创建DataLoader,用来取出一个batch的数据\n",
372 | "\n",
373 | "TensorDataset 可以用来对 tensor 进行打包,就好像 python 中的 zip 功能。\n",
374 | "\n",
375 | "该类通过每一个 tensor 的第一个维度进行索引,所以该类中的 tensor 第一维度必须相等,且TensorDataset 中的参数必须是 tensor类型。\n",
376 | "\n",
377 | "RandomSampler对数据集随机采样。\n",
378 | "SequentialSampler按顺序对数据集采样。"
379 | ]
380 | },
381 | {
382 | "cell_type": "code",
383 | "metadata": {
384 | "id": "VBiQ9Bcg2RO4",
385 | "colab_type": "code",
386 | "colab": {}
387 | },
388 | "source": [
389 | "train_data = TensorDataset(train_inputs, train_masks, train_labels)\n",
390 | "train_sampler = RandomSampler(train_data)\n",
391 | "train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)\n",
392 | "\n",
393 | "test_data = TensorDataset(test_inputs, test_masks, test_labels)\n",
394 | "test_sampler = RandomSampler(test_data)\n",
395 | "test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE)"
396 | ],
397 | "execution_count": 33,
398 | "outputs": []
399 | },
400 | {
401 | "cell_type": "markdown",
402 | "metadata": {
403 | "id": "hJ1tYqiKnqGj",
404 | "colab_type": "text"
405 | },
406 | "source": [
407 | "查看一下train_dataloader的内容:"
408 | ]
409 | },
410 | {
411 | "cell_type": "code",
412 | "metadata": {
413 | "id": "FGWOL5ED2RI-",
414 | "colab_type": "code",
415 | "colab": {
416 | "base_uri": "https://localhost:8080/",
417 | "height": 53
418 | },
419 | "outputId": "b14082c0-69f7-4f97-9074-bcdda3d34ab3"
420 | },
421 | "source": [
422 | "for i, (train, mask, label) in enumerate(train_dataloader): # torch.Size([16, 128]) torch.Size([16, 128]) torch.Size([16, 1])\n",
423 | " print(train.shape, mask.shape, label.shape)\n",
424 | " break\n",
425 | "\n",
426 | "print('len(train_dataloader) = ', len(train_dataloader)) # 500"
427 | ],
428 | "execution_count": 34,
429 | "outputs": [
430 | {
431 | "output_type": "stream",
432 | "text": [
433 | "torch.Size([16, 128]) torch.Size([16, 128]) torch.Size([16, 1])\n",
434 | "len(train_dataloader) = 500\n"
435 | ],
436 | "name": "stdout"
437 | }
438 | ]
439 | },
440 | {
441 | "cell_type": "markdown",
442 | "metadata": {
443 | "id": "Z6PXyoEvofak",
444 | "colab_type": "text"
445 | },
446 | "source": [
447 | "# 3. 创建模型、优化器\n",
448 | "\n",
449 | "## 3.1 创建模型"
450 | ]
451 | },
452 | {
453 | "cell_type": "code",
454 | "metadata": {
455 | "id": "JfcaltZp2RCg",
456 | "colab_type": "code",
457 | "colab": {
458 | "base_uri": "https://localhost:8080/",
459 | "height": 1000
460 | },
461 | "outputId": "1601dfa1-2f0b-41bb-c488-b19f050194b1"
462 | },
463 | "source": [
464 | "model = BertForSequenceClassification.from_pretrained(model_name, num_labels = 2) # num_labels表示2个分类,好评和差评\n",
465 | "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
466 | "model.to(device)"
467 | ],
468 | "execution_count": 35,
469 | "outputs": [
470 | {
471 | "output_type": "stream",
472 | "text": [
473 | "Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']\n",
474 | "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).\n",
475 | "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
476 | "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
477 | "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
478 | ],
479 | "name": "stderr"
480 | },
481 | {
482 | "output_type": "execute_result",
483 | "data": {
484 | "text/plain": [
485 | "BertForSequenceClassification(\n",
486 | " (bert): BertModel(\n",
487 | " (embeddings): BertEmbeddings(\n",
488 | " (word_embeddings): Embedding(21128, 768, padding_idx=0)\n",
489 | " (position_embeddings): Embedding(512, 768)\n",
490 | " (token_type_embeddings): Embedding(2, 768)\n",
491 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
492 | " (dropout): Dropout(p=0.1, inplace=False)\n",
493 | " )\n",
494 | " (encoder): BertEncoder(\n",
495 | " (layer): ModuleList(\n",
496 | " (0): BertLayer(\n",
497 | " (attention): BertAttention(\n",
498 | " (self): BertSelfAttention(\n",
499 | " (query): Linear(in_features=768, out_features=768, bias=True)\n",
500 | " (key): Linear(in_features=768, out_features=768, bias=True)\n",
501 | " (value): Linear(in_features=768, out_features=768, bias=True)\n",
502 | " (dropout): Dropout(p=0.1, inplace=False)\n",
503 | " )\n",
504 | " (output): BertSelfOutput(\n",
505 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
506 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
507 | " (dropout): Dropout(p=0.1, inplace=False)\n",
508 | " )\n",
509 | " )\n",
510 | " (intermediate): BertIntermediate(\n",
511 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
512 | " )\n",
513 | " (output): BertOutput(\n",
514 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
515 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
516 | " (dropout): Dropout(p=0.1, inplace=False)\n",
517 | " )\n",
518 | " )\n",
519 | " (1): BertLayer(\n",
520 | " (attention): BertAttention(\n",
521 | " (self): BertSelfAttention(\n",
522 | " (query): Linear(in_features=768, out_features=768, bias=True)\n",
523 | " (key): Linear(in_features=768, out_features=768, bias=True)\n",
524 | " (value): Linear(in_features=768, out_features=768, bias=True)\n",
525 | " (dropout): Dropout(p=0.1, inplace=False)\n",
526 | " )\n",
527 | " (output): BertSelfOutput(\n",
528 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
529 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
530 | " (dropout): Dropout(p=0.1, inplace=False)\n",
531 | " )\n",
532 | " )\n",
533 | " (intermediate): BertIntermediate(\n",
534 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
535 | " )\n",
536 | " (output): BertOutput(\n",
537 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
538 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
539 | " (dropout): Dropout(p=0.1, inplace=False)\n",
540 | " )\n",
541 | " )\n",
542 | " (2): BertLayer(\n",
543 | " (attention): BertAttention(\n",
544 | " (self): BertSelfAttention(\n",
545 | " (query): Linear(in_features=768, out_features=768, bias=True)\n",
546 | " (key): Linear(in_features=768, out_features=768, bias=True)\n",
547 | " (value): Linear(in_features=768, out_features=768, bias=True)\n",
548 | " (dropout): Dropout(p=0.1, inplace=False)\n",
549 | " )\n",
550 | " (output): BertSelfOutput(\n",
551 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
552 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
553 | " (dropout): Dropout(p=0.1, inplace=False)\n",
554 | " )\n",
555 | " )\n",
556 | " (intermediate): BertIntermediate(\n",
557 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
558 | " )\n",
559 | " (output): BertOutput(\n",
560 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
561 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
562 | " (dropout): Dropout(p=0.1, inplace=False)\n",
563 | " )\n",
564 | " )\n",
565 | " (3): BertLayer(\n",
566 | " (attention): BertAttention(\n",
567 | " (self): BertSelfAttention(\n",
568 | " (query): Linear(in_features=768, out_features=768, bias=True)\n",
569 | " (key): Linear(in_features=768, out_features=768, bias=True)\n",
570 | " (value): Linear(in_features=768, out_features=768, bias=True)\n",
571 | " (dropout): Dropout(p=0.1, inplace=False)\n",
572 | " )\n",
573 | " (output): BertSelfOutput(\n",
574 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
575 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
576 | " (dropout): Dropout(p=0.1, inplace=False)\n",
577 | " )\n",
578 | " )\n",
579 | " (intermediate): BertIntermediate(\n",
580 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
581 | " )\n",
582 | " (output): BertOutput(\n",
583 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
584 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
585 | " (dropout): Dropout(p=0.1, inplace=False)\n",
586 | " )\n",
587 | " )\n",
588 | " (4): BertLayer(\n",
589 | " (attention): BertAttention(\n",
590 | " (self): BertSelfAttention(\n",
591 | " (query): Linear(in_features=768, out_features=768, bias=True)\n",
592 | " (key): Linear(in_features=768, out_features=768, bias=True)\n",
593 | " (value): Linear(in_features=768, out_features=768, bias=True)\n",
594 | " (dropout): Dropout(p=0.1, inplace=False)\n",
595 | " )\n",
596 | " (output): BertSelfOutput(\n",
597 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
598 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
599 | " (dropout): Dropout(p=0.1, inplace=False)\n",
600 | " )\n",
601 | " )\n",
602 | " (intermediate): BertIntermediate(\n",
603 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
604 | " )\n",
605 | " (output): BertOutput(\n",
606 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
607 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
608 | " (dropout): Dropout(p=0.1, inplace=False)\n",
609 | " )\n",
610 | " )\n",
611 | " (5): BertLayer(\n",
612 | " (attention): BertAttention(\n",
613 | " (self): BertSelfAttention(\n",
614 | " (query): Linear(in_features=768, out_features=768, bias=True)\n",
615 | " (key): Linear(in_features=768, out_features=768, bias=True)\n",
616 | " (value): Linear(in_features=768, out_features=768, bias=True)\n",
617 | " (dropout): Dropout(p=0.1, inplace=False)\n",
618 | " )\n",
619 | " (output): BertSelfOutput(\n",
620 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
621 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
622 | " (dropout): Dropout(p=0.1, inplace=False)\n",
623 | " )\n",
624 | " )\n",
625 | " (intermediate): BertIntermediate(\n",
626 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
627 | " )\n",
628 | " (output): BertOutput(\n",
629 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
630 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
631 | " (dropout): Dropout(p=0.1, inplace=False)\n",
632 | " )\n",
633 | " )\n",
634 | " (6): BertLayer(\n",
635 | " (attention): BertAttention(\n",
636 | " (self): BertSelfAttention(\n",
637 | " (query): Linear(in_features=768, out_features=768, bias=True)\n",
638 | " (key): Linear(in_features=768, out_features=768, bias=True)\n",
639 | " (value): Linear(in_features=768, out_features=768, bias=True)\n",
640 | " (dropout): Dropout(p=0.1, inplace=False)\n",
641 | " )\n",
642 | " (output): BertSelfOutput(\n",
643 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
644 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
645 | " (dropout): Dropout(p=0.1, inplace=False)\n",
646 | " )\n",
647 | " )\n",
648 | " (intermediate): BertIntermediate(\n",
649 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
650 | " )\n",
651 | " (output): BertOutput(\n",
652 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
653 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
654 | " (dropout): Dropout(p=0.1, inplace=False)\n",
655 | " )\n",
656 | " )\n",
657 | " (7): BertLayer(\n",
658 | " (attention): BertAttention(\n",
659 | " (self): BertSelfAttention(\n",
660 | " (query): Linear(in_features=768, out_features=768, bias=True)\n",
661 | " (key): Linear(in_features=768, out_features=768, bias=True)\n",
662 | " (value): Linear(in_features=768, out_features=768, bias=True)\n",
663 | " (dropout): Dropout(p=0.1, inplace=False)\n",
664 | " )\n",
665 | " (output): BertSelfOutput(\n",
666 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
667 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
668 | " (dropout): Dropout(p=0.1, inplace=False)\n",
669 | " )\n",
670 | " )\n",
671 | " (intermediate): BertIntermediate(\n",
672 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
673 | " )\n",
674 | " (output): BertOutput(\n",
675 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
676 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
677 | " (dropout): Dropout(p=0.1, inplace=False)\n",
678 | " )\n",
679 | " )\n",
680 | " (8): BertLayer(\n",
681 | " (attention): BertAttention(\n",
682 | " (self): BertSelfAttention(\n",
683 | " (query): Linear(in_features=768, out_features=768, bias=True)\n",
684 | " (key): Linear(in_features=768, out_features=768, bias=True)\n",
685 | " (value): Linear(in_features=768, out_features=768, bias=True)\n",
686 | " (dropout): Dropout(p=0.1, inplace=False)\n",
687 | " )\n",
688 | " (output): BertSelfOutput(\n",
689 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
690 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
691 | " (dropout): Dropout(p=0.1, inplace=False)\n",
692 | " )\n",
693 | " )\n",
694 | " (intermediate): BertIntermediate(\n",
695 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
696 | " )\n",
697 | " (output): BertOutput(\n",
698 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
699 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
700 | " (dropout): Dropout(p=0.1, inplace=False)\n",
701 | " )\n",
702 | " )\n",
703 | " (9): BertLayer(\n",
704 | " (attention): BertAttention(\n",
705 | " (self): BertSelfAttention(\n",
706 | " (query): Linear(in_features=768, out_features=768, bias=True)\n",
707 | " (key): Linear(in_features=768, out_features=768, bias=True)\n",
708 | " (value): Linear(in_features=768, out_features=768, bias=True)\n",
709 | " (dropout): Dropout(p=0.1, inplace=False)\n",
710 | " )\n",
711 | " (output): BertSelfOutput(\n",
712 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
713 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
714 | " (dropout): Dropout(p=0.1, inplace=False)\n",
715 | " )\n",
716 | " )\n",
717 | " (intermediate): BertIntermediate(\n",
718 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
719 | " )\n",
720 | " (output): BertOutput(\n",
721 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
722 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
723 | " (dropout): Dropout(p=0.1, inplace=False)\n",
724 | " )\n",
725 | " )\n",
726 | " (10): BertLayer(\n",
727 | " (attention): BertAttention(\n",
728 | " (self): BertSelfAttention(\n",
729 | " (query): Linear(in_features=768, out_features=768, bias=True)\n",
730 | " (key): Linear(in_features=768, out_features=768, bias=True)\n",
731 | " (value): Linear(in_features=768, out_features=768, bias=True)\n",
732 | " (dropout): Dropout(p=0.1, inplace=False)\n",
733 | " )\n",
734 | " (output): BertSelfOutput(\n",
735 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
736 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
737 | " (dropout): Dropout(p=0.1, inplace=False)\n",
738 | " )\n",
739 | " )\n",
740 | " (intermediate): BertIntermediate(\n",
741 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
742 | " )\n",
743 | " (output): BertOutput(\n",
744 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
745 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
746 | " (dropout): Dropout(p=0.1, inplace=False)\n",
747 | " )\n",
748 | " )\n",
749 | " (11): BertLayer(\n",
750 | " (attention): BertAttention(\n",
751 | " (self): BertSelfAttention(\n",
752 | " (query): Linear(in_features=768, out_features=768, bias=True)\n",
753 | " (key): Linear(in_features=768, out_features=768, bias=True)\n",
754 | " (value): Linear(in_features=768, out_features=768, bias=True)\n",
755 | " (dropout): Dropout(p=0.1, inplace=False)\n",
756 | " )\n",
757 | " (output): BertSelfOutput(\n",
758 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
759 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
760 | " (dropout): Dropout(p=0.1, inplace=False)\n",
761 | " )\n",
762 | " )\n",
763 | " (intermediate): BertIntermediate(\n",
764 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
765 | " )\n",
766 | " (output): BertOutput(\n",
767 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
768 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
769 | " (dropout): Dropout(p=0.1, inplace=False)\n",
770 | " )\n",
771 | " )\n",
772 | " )\n",
773 | " )\n",
774 | " (pooler): BertPooler(\n",
775 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
776 | " (activation): Tanh()\n",
777 | " )\n",
778 | " )\n",
779 | " (dropout): Dropout(p=0.1, inplace=False)\n",
780 | " (classifier): Linear(in_features=768, out_features=2, bias=True)\n",
781 | ")"
782 | ]
783 | },
784 | "metadata": {
785 | "tags": []
786 | },
787 | "execution_count": 35
788 | }
789 | ]
790 | },
791 | {
792 | "cell_type": "markdown",
793 | "metadata": {
794 | "id": "V6q1Bw6CqZk8",
795 | "colab_type": "text"
796 | },
797 | "source": [
798 | "## 3.2 定义优化器\n",
799 | "\n",
800 | "参数eps是为了提高数值稳定性而添加到分母的一个项(默认: 1e-8)。\n"
801 | ]
802 | },
803 | {
804 | "cell_type": "code",
805 | "metadata": {
806 | "id": "KTVRhp7ipUfe",
807 | "colab_type": "code",
808 | "colab": {}
809 | },
810 | "source": [
811 | "optimizer = AdamW(model.parameters(), lr = learning_rate, eps = epsilon)"
812 | ],
813 | "execution_count": 36,
814 | "outputs": []
815 | },
816 | {
817 | "cell_type": "markdown",
818 | "metadata": {
819 | "id": "xiOZPF6P4pMP",
820 | "colab_type": "text"
821 | },
822 | "source": [
823 | "更通用的写法:bias和LayerNorm.weight不使用权重衰减"
824 | ]
825 | },
826 | {
827 | "cell_type": "code",
828 | "metadata": {
829 | "id": "2lW1VZyWqzYW",
830 | "colab_type": "code",
831 | "colab": {}
832 | },
833 | "source": [
834 | "no_decay = ['bias', 'LayerNorm.weight']\n",
835 | "optimizer_grouped_parameters = [\n",
836 | " {'params' : [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],\n",
837 | " 'weight_decay' : weight_decay},\n",
838 | " {'params' : [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],\n",
839 | " 'weight_decay' : 0.0}\n",
840 | "]\n",
841 | "\n",
842 | "optimizer = AdamW(optimizer_grouped_parameters, lr = learning_rate, eps = epsilon)"
843 | ],
844 | "execution_count": 42,
845 | "outputs": []
846 | },
847 | {
848 | "cell_type": "markdown",
849 | "metadata": {
850 | "id": "VXe2LYE04Ri8",
851 | "colab_type": "text"
852 | },
853 | "source": [
854 | " ## 3.3 学习率预热,训练时先从小的学习率开始训练"
855 | ]
856 | },
857 | {
858 | "cell_type": "code",
859 | "metadata": {
860 | "id": "XhYuopvX2yhB",
861 | "colab_type": "code",
862 | "colab": {}
863 | },
864 | "source": [
865 | "epochs = 2\n",
866 | "# training steps 的数量: [number of batches] x [number of epochs].\n",
867 | "total_steps = len(train_dataloader) * epochs\n",
868 | "\n",
869 | "# 设计 learning rate scheduler.\n",
870 | "scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, \n",
871 | " num_training_steps = total_steps)"
872 | ],
873 | "execution_count": 44,
874 | "outputs": []
875 | },
876 | {
877 | "cell_type": "markdown",
878 | "metadata": {
879 | "id": "c2-LLDoW6cZ7",
880 | "colab_type": "text"
881 | },
882 | "source": [
883 | "# 4.训练、评估模型 \n",
884 | "\n",
885 | "## 4.1 模型准确率\n"
886 | ]
887 | },
888 | {
889 | "cell_type": "code",
890 | "metadata": {
891 | "id": "WFl32Ko724PI",
892 | "colab_type": "code",
893 | "colab": {}
894 | },
895 | "source": [
896 | "def binary_acc(preds, labels):\n",
897 | " correct = torch.eq(torch.max(preds, dim=1)[1], labels.flatten()).float() # eq里面的两个参数的shape=torch.Size([16])\n",
898 | " acc = correct.sum().item() / len(correct)\n",
899 | " return acc"
900 | ],
901 | "execution_count": 45,
902 | "outputs": []
903 | },
904 | {
905 | "cell_type": "markdown",
906 | "metadata": {
907 | "id": "1i32sHMh_GqR",
908 | "colab_type": "text"
909 | },
910 | "source": [
911 | "## 4.2 计算模型运行时间"
912 | ]
913 | },
914 | {
915 | "cell_type": "code",
916 | "metadata": {
917 | "id": "L2CJDlV4-T2z",
918 | "colab_type": "code",
919 | "colab": {}
920 | },
921 | "source": [
922 | "import time\n",
923 | "import datetime\n",
924 | "\n",
925 | "def format_time(elapsed):\n",
926 | " elapsed_rounded = int(round(elapsed))\n",
927 | " return str(datetime.timedelta(seconds = elapsed_rounded)) # 返回 hh:mm:ss 形式的时间"
928 | ],
929 | "execution_count": 46,
930 | "outputs": []
931 | },
932 | {
933 | "cell_type": "markdown",
934 | "metadata": {
935 | "id": "92O44C1-_0G-",
936 | "colab_type": "text"
937 | },
938 | "source": [
939 | "## 4.3 训练模型\n",
940 | "\n",
941 | "- 传入model的参数必须是tensor类型的;\n",
942 | "\n",
943 | "- nn.utils.clip_grad_norm_(parameters, max_norm, norm_type=2)是梯度裁剪方法,用于防止训练时梯度爆炸:当所有参数梯度的总范数超过max_norm时,按比例缩小梯度;\n",
944 | "\n",
945 | "输入是(NN参数,最大梯度范数,范数类型=2) 一般默认为L2 范数;\n",
946 | "\n",
947 | "Tip: 注意这个方法只在训练的时候使用,在测试的时候不用;"
948 | ]
949 | },
950 | {
951 | "cell_type": "code",
952 | "metadata": {
953 | "id": "iGUJfpT6_rjg",
954 | "colab_type": "code",
955 | "colab": {}
956 | },
957 | "source": [
958 | "def train(model, optimizer):\n",
959 | " t0 = time.time()\n",
960 | " avg_loss, avg_acc = [],[]\n",
961 | "\n",
962 | " model.train()\n",
963 | " for step, batch in enumerate(train_dataloader):\n",
964 | "\n",
965 | " # 每隔40个batch 输出一下所用时间.\n",
966 | " if step % 40 == 0 and not step == 0:\n",
967 | " elapsed = format_time(time.time() - t0)\n",
968 | " print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))\n",
969 | "\n",
970 | " b_input_ids, b_input_mask, b_labels = batch[0].long().to(device), batch[1].long().to(device), batch[2].long().to(device)\n",
971 | "\n",
972 | " output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)\n",
973 | " loss, logits = output[0], output[1]\n",
974 | "\n",
975 | " avg_loss.append(loss.item())\n",
976 | "\n",
977 | " acc = binary_acc(logits, b_labels)\n",
978 | " avg_acc.append(acc)\n",
979 | "\n",
980 | " optimizer.zero_grad()\n",
981 | " loss.backward()\n",
982 | " clip_grad_norm_(model.parameters(), 1.0) #大于1的梯度将其设为1.0, 以防梯度爆炸\n",
983 | " optimizer.step() #更新模型参数\n",
984 | " scheduler.step() #更新learning rate\n",
985 | "\n",
986 | " avg_acc = np.array(avg_acc).mean()\n",
987 | " avg_loss = np.array(avg_loss).mean()\n",
988 | " return avg_loss, avg_acc"
989 | ],
990 | "execution_count": 47,
991 | "outputs": []
992 | },
993 | {
994 | "cell_type": "markdown",
995 | "metadata": {
996 | "id": "hGIZrUVhBVDR",
997 | "colab_type": "text"
998 | },
999 | "source": [
1000 | "此处output的形式为(元组类型,第0个元素是loss值,第1个元素是该batch中每个样本的logits,即未经softmax归一化的得分,第0列对应差评,第1列对应好评):"
1001 | ]
1002 | },
1003 | {
1004 | "cell_type": "markdown",
1005 | "metadata": {
1006 | "id": "qDLQ9RobB9lt",
1007 | "colab_type": "text"
1008 | },
1009 | "source": [
1010 | "```\n",
1011 | "(tensor(0.0210, device='cuda:0', grad_fn=<NllLossBackward>), \n",
1012 | "tensor([[-2.9815, 2.6931],\n",
1013 | " [-3.2380, 3.1935],\n",
1014 | " [-3.0775, 3.0713],\n",
1015 | " [ 3.0191, -2.3689],\n",
1016 | " [ 3.1146, -2.7957],\n",
1017 | " [ 3.7798, -2.7410],\n",
1018 | " [-0.3273, 0.8227],\n",
1019 | " [ 2.5012, -1.5535],\n",
1020 | " [-3.0231, 3.0162],\n",
1021 | " [ 3.4146, -2.5582],\n",
1022 | " [ 3.3104, -2.2134],\n",
1023 | " [ 3.3776, -2.5190],\n",
1024 | " [-2.6513, 2.5108],\n",
1025 | " [-3.3691, 2.9516],\n",
1026 | " [ 3.2397, -2.0473],\n",
1027 | " [-2.8622, 2.7395]], device='cuda:0', grad_fn=<AddmmBackward>))\n",
1028 | "```\n"
1029 | ]
1030 | },
1031 | {
1032 | "cell_type": "markdown",
1033 | "metadata": {
1034 | "id": "RW6qjSeBBfhk",
1035 | "colab_type": "text"
1036 | },
1037 | "source": [
1038 | "## 4.4 评估模型\n",
1039 | "\n",
1040 | "评估时调用model不传入labels,因此返回的元组中不包含loss,第0个元素就是logits。"
1041 | ]
1042 | },
1043 | {
1044 | "cell_type": "code",
1045 | "metadata": {
1046 | "id": "O5ELFI4bBQei",
1047 | "colab_type": "code",
1048 | "colab": {}
1049 | },
1050 | "source": [
1051 | "def evaluate(model):\n",
1052 | " avg_acc = []\n",
1053 | " model.eval() #表示进入测试模式\n",
1054 | "\n",
1055 | " with torch.no_grad():\n",
1056 | " for batch in test_dataloader:\n",
1057 | " b_input_ids, b_input_mask, b_labels = batch[0].long().to(device), batch[1].long().to(device), batch[2].long().to(device)\n",
1058 | "\n",
1059 | " output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)\n",
1060 | "\n",
1061 | " acc = binary_acc(output[0], b_labels)\n",
1062 | " avg_acc.append(acc)\n",
1063 | " avg_acc = np.array(avg_acc).mean()\n",
1064 | " return avg_acc"
1065 | ],
1066 | "execution_count": 49,
1067 | "outputs": []
1068 | },
1069 | {
1070 | "cell_type": "markdown",
1071 | "metadata": {
1072 | "id": "-rZvcWp4BytL",
1073 | "colab_type": "text"
1074 | },
1075 | "source": [
1076 | "此处output的形式为(元组类型,第0个元素是该batch中每个样本的logits,即未经softmax归一化的得分,第0列对应差评,第1列对应好评):"
1077 | ]
1078 | },
1079 | {
1080 | "cell_type": "markdown",
1081 | "metadata": {
1082 | "id": "i5ZiiaTjB0iu",
1083 | "colab_type": "text"
1084 | },
1085 | "source": [
1086 | "```\n",
1087 | "(tensor([[ 3.8217, -2.7516],\n",
1088 | " [ 2.7585, -2.0853],\n",
1089 | " [-2.9317, 2.9092],\n",
1090 | " [-3.3724, 3.2597],\n",
1091 | " [-2.8692, 2.6741],\n",
1092 | " [-3.2784, 2.9276],\n",
1093 | " [ 3.4946, -2.8895],\n",
1094 | " [ 3.7855, -2.8623],\n",
1095 | " [-2.2249, 2.4336],\n",
1096 | " [-2.4257, 2.4606],\n",
1097 | " [ 3.3996, -2.5760],\n",
1098 | " [-3.1986, 3.0841],\n",
1099 | " [ 3.6883, -2.9492],\n",
1100 | " [ 3.2883, -2.3600],\n",
1101 | " [ 2.6723, -2.0778],\n",
1102 | " [-3.1868, 3.1106]], device='cuda:0'),)\n",
1103 | "```"
1104 | ]
1105 | },
1106 | {
1107 | "cell_type": "markdown",
1108 | "metadata": {
1109 | "id": "5EkPkI5eCHRo",
1110 | "colab_type": "text"
1111 | },
1112 | "source": [
1113 | "## 4.5 运行训练模型和评估模型"
1114 | ]
1115 | },
1116 | {
1117 | "cell_type": "code",
1118 | "metadata": {
1119 | "id": "1jVbe00RBSLf",
1120 | "colab_type": "code",
1121 | "colab": {
1122 | "base_uri": "https://localhost:8080/",
1123 | "height": 521
1124 | },
1125 | "outputId": "cee6bed2-7f35-4b36-ca5e-9be3f0e1501c"
1126 | },
1127 | "source": [
1128 | "for epoch in range(epochs):\n",
1129 | "\n",
1130 | " train_loss, train_acc = train(model, optimizer)\n",
1131 | " print('epoch={},训练准确率={},损失={}'.format(epoch, train_acc, train_loss))\n",
1132 | " test_acc = evaluate(model)\n",
1133 | " print(\"epoch={},测试准确率={}\".format(epoch, test_acc))"
1134 | ],
1135 | "execution_count": 50,
1136 | "outputs": [
1137 | {
1138 | "output_type": "stream",
1139 | "text": [
1140 | " Batch 40 of 500. Elapsed: 0:00:27.\n",
1141 | " Batch 80 of 500. Elapsed: 0:00:53.\n",
1142 | " Batch 120 of 500. Elapsed: 0:01:20.\n",
1143 | " Batch 160 of 500. Elapsed: 0:01:47.\n",
1144 | " Batch 200 of 500. Elapsed: 0:02:14.\n",
1145 | " Batch 240 of 500. Elapsed: 0:02:40.\n",
1146 | " Batch 280 of 500. Elapsed: 0:03:07.\n",
1147 | " Batch 320 of 500. Elapsed: 0:03:34.\n",
1148 | " Batch 360 of 500. Elapsed: 0:04:01.\n",
1149 | " Batch 400 of 500. Elapsed: 0:04:28.\n",
1150 | " Batch 440 of 500. Elapsed: 0:04:55.\n",
1151 | " Batch 480 of 500. Elapsed: 0:05:22.\n",
1152 | "epoch=0,训练准确率=0.90275,损失=0.2619755164962262\n",
1153 | "epoch=0,测试准确率=0.9325\n",
1154 | " Batch 40 of 500. Elapsed: 0:00:27.\n",
1155 | " Batch 80 of 500. Elapsed: 0:00:53.\n",
1156 | " Batch 120 of 500. Elapsed: 0:01:20.\n",
1157 | " Batch 160 of 500. Elapsed: 0:01:47.\n",
1158 | " Batch 200 of 500. Elapsed: 0:02:14.\n",
1159 | " Batch 240 of 500. Elapsed: 0:02:41.\n",
1160 | " Batch 280 of 500. Elapsed: 0:03:08.\n",
1161 | " Batch 320 of 500. Elapsed: 0:03:35.\n",
1162 | " Batch 360 of 500. Elapsed: 0:04:02.\n",
1163 | " Batch 400 of 500. Elapsed: 0:04:28.\n",
1164 | " Batch 440 of 500. Elapsed: 0:04:55.\n",
1165 | " Batch 480 of 500. Elapsed: 0:05:22.\n",
1166 | "epoch=1,训练准确率=0.953375,损失=0.15345162890665234\n",
1167 | "epoch=1,测试准确率=0.9435\n"
1168 | ],
1169 | "name": "stdout"
1170 | }
1171 | ]
1172 | },
1173 | {
1174 | "cell_type": "markdown",
1175 | "metadata": {
1176 | "id": "QchQuTWDCM6M",
1177 | "colab_type": "text"
1178 | },
1179 | "source": [
1180 | "# 5. 预测"
1181 | ]
1182 | },
1183 | {
1184 | "cell_type": "code",
1185 | "metadata": {
1186 | "id": "XpM5amxqCKbk",
1187 | "colab_type": "code",
1188 | "colab": {
1189 | "base_uri": "https://localhost:8080/",
1190 | "height": 125
1191 | },
1192 | "outputId": "ff51d8f9-7797-480a-d326-f54c1c0eebc9"
1193 | },
1194 | "source": [
1195 | "def predict(sen):\n",
1196 | "\n",
1197 | " input_id = convert_text_to_token(tokenizer, sen)\n",
1198 | " input_token = torch.tensor(input_id).long().to(device) #torch.Size([128])\n",
1199 | "\n",
1200 | " atten_mask = [float(i>0) for i in input_id]\n",
1201 | " attention_token = torch.tensor(atten_mask).long().to(device) #torch.Size([128])\n",
1202 | "\n",
1203 | " output = model(input_token.view(1, -1), token_type_ids=None, attention_mask=attention_token.view(1, -1)) #torch.Size([128])->torch.Size([1, 128])否则会报错\n",
1204 | " print(output[0])\n",
1205 | "\n",
1206 | " return torch.max(output[0], dim=1)[1]\n",
1207 | "\n",
1208 | "label = predict('酒店位置难找,环境不太好,隔音差,下次不会再来的。')\n",
1209 | "print('好评' if label==1 else '差评')\n",
1210 | "\n",
1211 | "label = predict('酒店还可以,接待人员很热情,卫生合格,空间也比较大,不足的地方就是没有窗户')\n",
1212 | "print('好评' if label==1 else '差评')\n",
1213 | "\n",
1214 | "label = predict('\"服务各方面没有不周到的地方, 各方面没有没想到的细节\"')\n",
1215 | "print('好评' if label==1 else '差评')"
1216 | ],
1217 | "execution_count": 51,
1218 | "outputs": [
1219 | {
1220 | "output_type": "stream",
1221 | "text": [
1222 | "tensor([[ 2.3040, -4.0122]], device='cuda:0', grad_fn=)\n",
1223 | "差评\n",
1224 | "tensor([[-1.5570, 2.5071]], device='cuda:0', grad_fn=)\n",
1225 | "好评\n",
1226 | "tensor([[ 0.3791, -1.3262]], device='cuda:0', grad_fn=)\n",
1227 | "差评\n"
1228 | ],
1229 | "name": "stdout"
1230 | }
1231 | ]
1232 | },
1233 | {
1234 | "cell_type": "code",
1235 | "metadata": {
1236 | "id": "QsTJVQSrD9XR",
1237 | "colab_type": "code",
1238 | "colab": {}
1239 | },
1240 | "source": [
1241 | ""
1242 | ],
1243 | "execution_count": null,
1244 | "outputs": []
1245 | }
1246 | ]
1247 | }
--------------------------------------------------------------------------------
/Bert手写版本+MLM+NSP.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "kernelspec": {
6 | "display_name": "Python 3",
7 | "language": "python",
8 | "name": "python3"
9 | },
10 | "language_info": {
11 | "codemirror_mode": {
12 | "name": "ipython",
13 | "version": 3
14 | },
15 | "file_extension": ".py",
16 | "mimetype": "text/x-python",
17 | "name": "python",
18 | "nbconvert_exporter": "python",
19 | "pygments_lexer": "ipython3",
20 | "version": "3.7.9"
21 | },
22 | "nbTranslate": {
23 | "displayLangs": [
24 | "*"
25 | ],
26 | "hotkey": "alt-t",
27 | "langInMainMenu": true,
28 | "sourceLang": "en",
29 | "targetLang": "cn",
30 | "useGoogleTranslate": true
31 | },
32 | "toc": {
33 | "base_numbering": 1,
34 | "nav_menu": {
35 | "height": "168.991px",
36 | "width": "201.634px"
37 | },
38 | "number_sections": true,
39 | "sideBar": true,
40 | "skip_h1_title": false,
41 | "title_cell": "Table of Contents",
42 | "title_sidebar": "Contents",
43 | "toc_cell": false,
44 | "toc_position": {
45 | "height": "calc(100% - 180px)",
46 | "left": "10px",
47 | "top": "150px",
48 | "width": "191.094px"
49 | },
50 | "toc_section_display": true,
51 | "toc_window_display": true
52 | },
53 | "varInspector": {
54 | "cols": {
55 | "lenName": 16,
56 | "lenType": 16,
57 | "lenVar": 40
58 | },
59 | "kernels_config": {
60 | "python": {
61 | "delete_cmd_postfix": "",
62 | "delete_cmd_prefix": "del ",
63 | "library": "var_list.py",
64 | "varRefreshCmd": "print(var_dic_list())"
65 | },
66 | "r": {
67 | "delete_cmd_postfix": ") ",
68 | "delete_cmd_prefix": "rm(",
69 | "library": "var_list.r",
70 | "varRefreshCmd": "cat(var_dic_list()) "
71 | }
72 | },
73 | "position": {
74 | "height": "398px",
75 | "left": "840px",
76 | "right": "20px",
77 | "top": "77px",
78 | "width": "505px"
79 | },
80 | "types_to_exclude": [
81 | "module",
82 | "function",
83 | "builtin_function_or_method",
84 | "instance",
85 | "_Feature"
86 | ],
87 | "window_display": false
88 | },
89 | "colab": {
90 | "name": "Bert手写版本+MLM+NSP.ipynb",
91 | "provenance": [],
92 | "include_colab_link": true
93 | }
94 | },
95 | "cells": [
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {
99 | "id": "view-in-github",
100 | "colab_type": "text"
101 | },
102 | "source": [
103 | "
"
104 | ]
105 | },
106 | {
107 | "cell_type": "markdown",
108 | "metadata": {
109 | "id": "RUG6EgDHHOWm"
110 | },
111 | "source": [
112 | "# Bert手写版本+MLM+NSP"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "metadata": {
118 | "ExecuteTime": {
119 | "end_time": "2020-10-11T14:16:40.637738Z",
120 | "start_time": "2020-10-11T14:16:39.974576Z"
121 | },
122 | "id": "1xxvnLCcHOWq"
123 | },
124 | "source": [
125 | "import re\n",
126 | "import math\n",
127 | "import torch\n",
128 | "import numpy as np\n",
129 | "from random import *\n",
130 | "import torch.nn as nn\n",
131 | "import torch.nn.functional as F \n",
132 | "import torch.optim as optim\n",
133 | "import torch.utils.data as Data"
134 | ],
135 | "execution_count": null,
136 | "outputs": []
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "metadata": {
141 | "id": "7ZA_5gbPHOWz"
142 | },
143 | "source": [
144 | "# 数据预处理\n",
145 | "\n",
146 | "## 构造单词表和映射"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "metadata": {
152 | "ExecuteTime": {
153 | "end_time": "2020-10-11T14:16:40.677834Z",
154 | "start_time": "2020-10-11T14:16:40.670965Z"
155 | },
156 | "code_folding": [],
157 | "id": "xGnYUQDkHOW1"
158 | },
159 | "source": [
160 | "text = (\n",
161 | " 'Hello, how are you? I am Romeo.\\n' # R\n",
162 | " 'Hello, Romeo My name is Juliet. Nice to meet you.\\n' # J\n",
163 | " 'Nice to meet you too. How are you today?\\n' # R\n",
164 | " 'Great. My baseball team won the competition.\\n' # J\n",
165 | " 'Oh Congratulations, Juliet\\n' # R\n",
166 | " 'Thank you Romeo\\n' # J\n",
167 | " 'Where are you going today?\\n' # R\n",
168 | " 'I am going shopping. What about you?\\n' # J\n",
169 | " 'I am going to visit my grandmother. she is not very well' # R\n",
170 | ")\n",
171 | "sentences = re.sub(\"[.,!?\\\\-]\", '', text.lower()).split('\\n') # filter '.', ',', '?', '!'\n",
172 | "\n",
173 | "# 所有句子的单词list\n",
174 | "word_list = list(set(\" \".join(sentences).split())) # ['hello', 'how', 'are', 'you',...]\n",
175 | "\n",
176 | "# 给单词表中所有单词设置序号\n",
177 | "word2idx = {'[PAD]' : 0, '[CLS]' : 1, '[SEP]' : 2, '[MASK]' : 3}\n",
178 | "for i, w in enumerate(word_list):\n",
179 | " word2idx[w] = i + 4\n",
180 | "\n",
181 | "# 用于 idx 映射回 word\n",
182 | "idx2word = {i: w for i, w in enumerate(word2idx)}\n",
183 | "vocab_size = len(word2idx) # 40\n",
184 | "\n",
185 | "# token: 就是每个单词在词表中的index\n",
186 | "token_list = list() # token_list存储了每一句的token\n",
187 | "for sentence in sentences:\n",
188 | " arr = [word2idx[s] for s in sentence.split()]\n",
189 | " token_list.append(arr)"
190 | ],
191 | "execution_count": null,
192 | "outputs": []
193 | },
194 | {
195 | "cell_type": "code",
196 | "metadata": {
197 | "ExecuteTime": {
198 | "end_time": "2020-10-11T14:16:40.718513Z",
199 | "start_time": "2020-10-11T14:16:40.712136Z"
200 | },
201 | "lang": "en",
202 | "id": "5SfmKM2YHOXA",
203 | "outputId": "266df375-d9d6-4feb-fee3-64cb3b301564"
204 | },
205 | "source": [
206 | "print(sentences[1]) # hello romeo my name is juliet nice to meet you\n",
207 | "print(token_list[1]) # [14, 31, 35, 33, 27, 11, 8, 16, 5, 34]"
208 | ],
209 | "execution_count": null,
210 | "outputs": [
211 | {
212 | "output_type": "stream",
213 | "text": [
214 | "hello romeo my name is juliet nice to meet you\n",
215 | "[38, 14, 23, 15, 24, 30, 5, 13, 39, 19]\n"
216 | ],
217 | "name": "stdout"
218 | }
219 | ]
220 | },
221 | {
222 | "cell_type": "markdown",
223 | "metadata": {
224 | "id": "YRXMsY82HOXM"
225 | },
226 | "source": [
227 | "## 设置超参数"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "metadata": {
233 | "ExecuteTime": {
234 | "end_time": "2020-10-11T14:16:41.118824Z",
235 | "start_time": "2020-10-11T14:16:41.113460Z"
236 | },
237 | "id": "XAGo9iSDHOXN"
238 | },
239 | "source": [
240 | "maxlen = 30 # 句子pad到的最大长度,即下面句子中的seq_len\n",
241 | "batch_size = 6 \n",
242 | "\n",
243 | "max_pred = 5 # max tokens of prediction\n",
244 | "n_layers = 6 # Bert中Transformer的层数\n",
245 | "n_heads = 12 # Multi-head的数量\n",
246 | "d_model = 768 # 即embedding_dim\n",
247 | "d_ff = 768*4 # 4*d_model, FeedForward dimension\n",
248 | "d_k = d_v = 64 # dimension of K(=Q), V,是d_model分割成n_heads之后的长度, 768 // 12 = 64\n",
249 | "\n",
250 | "n_segments = 2 # 分隔句子数"
251 | ],
252 | "execution_count": null,
253 | "outputs": []
254 | },
255 | {
256 | "cell_type": "markdown",
257 | "metadata": {
258 | "id": "xYnGPYp2HOXU"
259 | },
260 | "source": [
261 | "# 实现Dataloader\n",
262 | "\n",
263 | "## 生成data\n",
264 | "\n",
265 | "- 选中语料中所有词的**15%**进行随机mask\n",
266 | "\n",
267 | "- 在确定要Mask掉的单词之后:\n",
268 | "\n",
269 | " - 选中的单词,在80%的概率下被用 [MASK] 来代替\n",
270 | " \n",
271 | " - 选中的单词,在10%的概率下不做mask,用任意非标记词代替\n",
272 | " \n",
273 | " - 选中的单词,在10%的概率下不做mask,仍然保留原来真实的词"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "metadata": {
279 | "ExecuteTime": {
280 | "end_time": "2020-10-11T14:16:41.796706Z",
281 | "start_time": "2020-10-11T14:16:41.777687Z"
282 | },
283 | "id": "iI0sULeoHOXW"
284 | },
285 | "source": [
286 | "# sample IsNext and NotNext to be same in small batch size\n",
287 | "def make_data():\n",
288 | " batch = []\n",
289 | " positive = negative = 0\n",
290 | " while (positive != batch_size / 2) or (negative != batch_size / 2):\n",
291 | " # ==========================BERT 的 input 表示================================\n",
292 | " # 随机取两个句子的index\n",
293 | " tokens_a_index, tokens_b_index = randrange(len(sentences)), randrange(len(sentences)) # sample random index in sentences\n",
294 | " # 随机取两个句子\n",
295 | " tokens_a, tokens_b = token_list[tokens_a_index], token_list[tokens_b_index]\n",
296 | " # Token (没有使用word piece): 单词在词典中的编码 \n",
297 | " input_ids = [word2idx['[CLS]']] + tokens_a + [word2idx['[SEP]']] + tokens_b + [word2idx['[SEP]']]\n",
298 | " # Segment: 区分两个句子的编码(上句全为0 (CLS~SEP),下句全为1)\n",
299 | " segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)\n",
300 | " \n",
301 | " # ========================== MASK LM ==========================================\n",
302 | " n_pred = min(max_pred, max(1, int(len(input_ids) * 0.15))) # 15 % of tokens in one sentence\n",
303 | " # token在 input_ids 中的下标(不包括[CLS], [SEP])\n",
304 | " cand_maked_pos = [i for i, token in enumerate(input_ids) \n",
305 | " if token != word2idx['[CLS]'] and token != word2idx['[SEP]']] # candidate masked position\n",
306 | " shuffle(cand_maked_pos)\n",
307 | " \n",
308 | " masked_tokens, masked_pos = [], [] # 被mask的tokens,被mask的tokens的索引号\n",
309 | " for pos in cand_maked_pos[:n_pred]: # 随机mask 15% 的tokens\n",
310 | " masked_pos.append(pos)\n",
311 | " masked_tokens.append(input_ids[pos])\n",
312 | " # 选定要mask的词\n",
313 | " if random() < 0.8: # 80%:被真实mask\n",
314 | " input_ids[pos] = word2idx['[MASK]']\n",
315 | " elif random() > 0.9: # 10%\n",
316 | " index = randint(0, vocab_size - 1) # random index in vocabulary\n",
317 | " while index < 4: # 不能是 [PAD], [CLS], [SEP], [MASK]\n",
318 | " index = randint(0, vocab_size - 1)\n",
319 | " input_ids[pos] = index # 10%:不做mask,用任意非标记词代替\n",
320 | " # 其余情况:不做mask,保留原词。注意:上面的elif重新调用了一次random(),实际比例约为80%/2%/18%,而不是80%/10%/10%\n",
321 | " \n",
322 | " # =========================== Paddings ========================================\n",
323 | " # input_ids全部padding到相同的长度\n",
324 | " n_pad = maxlen - len(input_ids)\n",
325 | " input_ids.extend([word2idx['[PAD]']] * n_pad)\n",
326 | " segment_ids.extend([word2idx['[PAD]']] * n_pad)\n",
327 | " \n",
328 | " # zero padding (100% - 15%) tokens\n",
329 | " if max_pred > n_pred:\n",
330 | " n_pad = max_pred - n_pred\n",
331 | " masked_tokens.extend([0] * n_pad)\n",
332 | " masked_pos.extend([0] * n_pad)\n",
333 | " \n",
334 | " # =====================batch添加数据, 让正例 和 负例 数量相同=======================\n",
335 | " if tokens_a_index + 1 == tokens_b_index and positive < batch_size / 2:\n",
336 | " batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True]) # IsNext\n",
337 | " positive += 1\n",
338 | " elif tokens_a_index + 1 != tokens_b_index and negative < batch_size / 2:\n",
339 | " batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False]) # NotNext\n",
340 | " negative += 1\n",
341 | " \n",
342 | " return batch"
343 | ],
344 | "execution_count": null,
345 | "outputs": []
346 | },
347 | {
348 | "cell_type": "markdown",
349 | "metadata": {
350 | "ExecuteTime": {
351 | "end_time": "2020-10-11T07:12:34.813381Z",
352 | "start_time": "2020-10-11T07:12:34.807625Z"
353 | },
354 | "id": "8TdXloLHHOXd"
355 | },
356 | "source": [
357 | "调用上面函数:"
358 | ]
359 | },
360 | {
361 | "cell_type": "code",
362 | "metadata": {
363 | "ExecuteTime": {
364 | "end_time": "2020-10-11T14:16:41.987896Z",
365 | "start_time": "2020-10-11T14:16:41.980913Z"
366 | },
367 | "id": "bptQwzl7HOXh",
368 | "outputId": "fd09cb98-b360-454c-9a2d-45922f91a25f"
369 | },
370 | "source": [
371 | "batch = make_data()\n",
372 | "\n",
373 | "input_ids, segment_ids, masked_tokens, masked_pos, isNext = zip(*batch) \n",
374 | "print(len(isNext))\n",
375 | "# # 全部要转成LongTensor类型\n",
376 | "# input_ids, segment_ids, masked_tokens, masked_pos, isNext = \\\n",
377 | "# torch.LongTensor(input_ids), torch.LongTensor(segment_ids), torch.LongTensor(masked_tokens), \\\n",
378 | "# torch.LongTensor(masked_pos), torch.LongTensor(isNext)"
379 | ],
380 | "execution_count": null,
381 | "outputs": [
382 | {
383 | "output_type": "stream",
384 | "text": [
385 | "6\n"
386 | ],
387 | "name": "stdout"
388 | }
389 | ]
390 | },
391 | {
392 | "cell_type": "markdown",
393 | "metadata": {
394 | "id": "2nZBtL7dHOXx"
395 | },
396 | "source": [
397 | "## 生成DataLoader\n",
398 | "\n",
399 | "- 为了使用dataloader,我们需要定义以下两个function:\n",
400 | "\n",
401 | " - `__len__` function:需要返回整个数据集中有多少个item\n",
402 | " \n",
403 | " - `__getitem__`:根据给定的index返回一个item\n",
404 | " \n",
405 | "有了dataloader之后,我们可以轻松随机打乱整个数据集,拿到一个batch的数据等等。"
406 | ]
407 | },
408 | {
409 | "cell_type": "code",
410 | "metadata": {
411 | "ExecuteTime": {
412 | "end_time": "2020-10-11T14:16:42.699706Z",
413 | "start_time": "2020-10-11T14:16:42.689736Z"
414 | },
415 | "id": "SrcLTns2HOX2"
416 | },
417 | "source": [
418 | "class MyDataSet(Data.Dataset):\n",
419 | " def __init__(self, input_ids, segment_ids, masked_tokens, masked_pos, isNext):\n",
420 | " # 全部要转成LongTensor类型\n",
421 | " self.input_ids = torch.LongTensor(input_ids)\n",
422 | " self.segment_ids = torch.LongTensor(segment_ids)\n",
423 | " self.masked_tokens = torch.LongTensor(masked_tokens) \n",
424 | " self.masked_pos = torch.LongTensor(masked_pos) \n",
425 | " self.isNext = torch.LongTensor(isNext)\n",
426 | " \n",
427 | " def __len__(self):\n",
428 | " return len(self.input_ids)\n",
429 | " \n",
430 | " def __getitem__(self, idx):\n",
431 | " return self.input_ids[idx], self.segment_ids[idx], self.masked_tokens[idx], self.masked_pos[idx], self.isNext[idx]\n",
432 | " \n",
433 | "dataset = MyDataSet(input_ids, segment_ids, masked_tokens, masked_pos, isNext)\n",
434 | "dataloader = Data.DataLoader(dataset, batch_size=batch_size, shuffle=True)"
435 | ],
436 | "execution_count": null,
437 | "outputs": []
438 | },
439 | {
440 | "cell_type": "code",
441 | "metadata": {
442 | "ExecuteTime": {
443 | "end_time": "2020-10-11T14:16:42.946111Z",
444 | "start_time": "2020-10-11T14:16:42.932823Z"
445 | },
446 | "scrolled": false,
447 | "id": "4OpB-jmyHOYC",
448 | "outputId": "82008c10-e44d-4636-c45e-08870a72ff5d"
449 | },
450 | "source": [
451 | "print(next(iter(dataloader)))\n",
452 | "print(len(dataloader)) # 就一个batch"
453 | ],
454 | "execution_count": null,
455 | "outputs": [
456 | {
457 | "output_type": "stream",
458 | "text": [
459 | "[tensor([[ 1, 36, 23, 9, 16, 33, 3, 18, 2, 31, 21, 30, 2, 0, 0, 0, 0, 0,\n",
460 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
461 | " [ 1, 36, 23, 9, 16, 33, 3, 18, 2, 22, 8, 6, 13, 28, 23, 34, 3, 24,\n",
462 | " 11, 27, 37, 2, 0, 0, 0, 0, 0, 0, 0, 0],\n",
463 | " [ 1, 22, 8, 6, 3, 35, 12, 19, 2, 5, 13, 39, 19, 10, 25, 26, 19, 17,\n",
464 | " 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
465 | " [ 1, 38, 14, 23, 15, 24, 30, 5, 13, 39, 19, 2, 38, 14, 23, 15, 24, 30,\n",
466 | " 5, 13, 3, 19, 2, 0, 0, 0, 0, 0, 0, 0],\n",
467 | " [ 1, 29, 26, 19, 6, 3, 2, 22, 8, 6, 32, 35, 3, 19, 2, 0, 0, 0,\n",
468 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
469 | " [ 1, 38, 14, 23, 15, 24, 30, 5, 13, 39, 19, 2, 5, 13, 39, 19, 10, 25,\n",
470 | " 3, 19, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0]]), tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
471 | " 0, 0, 0, 0, 0, 0],\n",
472 | " [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,\n",
473 | " 0, 0, 0, 0, 0, 0],\n",
474 | " [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,\n",
475 | " 0, 0, 0, 0, 0, 0],\n",
476 | " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,\n",
477 | " 0, 0, 0, 0, 0, 0],\n",
478 | " [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
479 | " 0, 0, 0, 0, 0, 0],\n",
480 | " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,\n",
481 | " 0, 0, 0, 0, 0, 0]]), tensor([[20, 0, 0, 0, 0],\n",
482 | " [ 4, 20, 23, 0, 0],\n",
483 | " [12, 32, 0, 0, 0],\n",
484 | " [14, 14, 39, 0, 0],\n",
485 | " [17, 12, 0, 0, 0],\n",
486 | " [17, 39, 26, 0, 0]]), tensor([[ 6, 0, 0, 0, 0],\n",
487 | " [16, 6, 14, 0, 0],\n",
488 | " [ 6, 4, 0, 0, 0],\n",
489 | " [ 2, 13, 20, 0, 0],\n",
490 | " [ 5, 12, 0, 0, 0],\n",
491 | " [20, 9, 18, 0, 0]]), tensor([1, 0, 0, 0, 1, 1])]\n",
492 | "1\n"
493 | ],
494 | "name": "stdout"
495 | }
496 | ]
497 | },
498 | {
499 | "cell_type": "markdown",
500 | "metadata": {
501 | "ExecuteTime": {
502 | "end_time": "2020-10-11T11:40:34.174618Z",
503 | "start_time": "2020-10-11T11:40:34.168686Z"
504 | },
505 | "id": "vci2w21zHOYO"
506 | },
507 | "source": [
508 | "# Bert模型\n",
509 | "\n",
510 | "## Embedding"
511 | ]
512 | },
513 | {
514 | "cell_type": "code",
515 | "metadata": {
516 | "ExecuteTime": {
517 | "end_time": "2020-10-11T14:16:43.711528Z",
518 | "start_time": "2020-10-11T14:16:43.699319Z"
519 | },
520 | "id": "zLE6v5d5HOYT"
521 | },
522 | "source": [
523 | "class BertEmbedding(nn.Module):\n",
524 | " def __init__(self):\n",
525 | " super(BertEmbedding, self).__init__()\n",
526 | " # d_model:即embedding_dim\n",
527 | " # token embedding\n",
528 | " self.tok_embed = nn.Embedding(vocab_size, d_model) \n",
529 | "\n",
530 | " # position embedding: 这里使用sin/cos固定位置编码;BERT原实现使用可学习的位置embedding(即下一行被注释掉的nn.Embedding写法)\n",
531 | "# self.pos_embed = nn.Embedding(maxlen, d_model) \n",
532 | " self.pos_embed = torch.tensor(\n",
533 | " [[pos / (10000.0 ** (i // 2 * 2.0 / d_model)) for i in range(d_model)] for pos in range(maxlen)]\n",
534 | " )\n",
535 | " self.pos_embed[:, 0::2] = torch.sin(self.pos_embed[:, 0::2])\n",
536 | " self.pos_embed[:, 1::2] = torch.cos(self.pos_embed[:, 1::2])\n",
537 | " \n",
538 | " # segment embedding\n",
539 | " self.seg_embed = nn.Embedding(n_segments, d_model) # segment(token type) embedding\n",
540 | "\n",
541 | " # LayerNorm\n",
542 | " self.norm = nn.LayerNorm(d_model)\n",
543 | " \n",
544 | " def forward(self, x, seq): # x 和 pos的shape 都是[batch_size, seq_len]\n",
545 | "\n",
546 | "# seq_len = x.size(1) \n",
547 | "# pos = torch.arange(seq_len, dtype=torch.long)\n",
548 | " # unsqueeze(0): 在索引0处,增加维度--> [1, seq_len]\n",
549 | " # expand: 某个 size=1 的维度上扩展到size\n",
550 | " # expand_as: 把一个tensor变成和函数括号内一样形状的tensor\n",
551 | "# pos = pos.unsqueeze(0).expand_as(x) # [seq_len] -> [batch_size, seq_len]\n",
552 | " \n",
553 | " # 三个embedding相加\n",
554 | " input_embedding = self.tok_embed(x) + nn.Parameter(self.pos_embed, requires_grad=False) + self.seg_embed(seq)\n",
555 | " \n",
556 | " return self.norm(input_embedding)"
557 | ],
558 | "execution_count": null,
559 | "outputs": []
560 | },
561 | {
562 | "cell_type": "markdown",
563 | "metadata": {
564 | "ExecuteTime": {
565 | "end_time": "2020-10-11T12:53:38.450646Z",
566 | "start_time": "2020-10-11T12:53:38.427512Z"
567 | },
568 | "id": "mMIuvYhCHOYh"
569 | },
570 | "source": [
571 | "## 生成mask"
572 | ]
573 | },
574 | {
575 | "cell_type": "code",
576 | "metadata": {
577 | "ExecuteTime": {
578 | "end_time": "2020-10-11T14:16:44.304800Z",
579 | "start_time": "2020-10-11T14:16:44.299614Z"
580 | },
581 | "id": "saCDe7HPHOYw"
582 | },
583 | "source": [
584 | "# Padding的部分不应该计算概率,所以需要在相应位置设置mask\n",
585 | "# PAD位置(token id为0)的score填充-1e9,使得计算softmax时概率接近0\n",
586 | "# 在计算attention时,使用\n",
587 | "def get_attn_pad_mask(seq_q, seq_k): # seq_q 和 seq_k 的 shape 都是 [batch_size, seq_len]\n",
588 | " batch_size, seq_len = seq_q.size()\n",
589 | " # eq(zero) is PAD token\n",
590 | " pad_attn_mask = seq_q.data.eq(0).unsqueeze(1) # [batcb_size, 1, seq_len]\n",
591 | " return pad_attn_mask.expand(batch_size, seq_len, seq_len) # [batch_size, seq_len, seq_len]"
592 | ],
593 | "execution_count": null,
594 | "outputs": []
595 | },
596 | {
597 | "cell_type": "markdown",
598 | "metadata": {
599 | "id": "DaYxEVEoHOY8"
600 | },
601 | "source": [
602 | "## 构建激活函数"
603 | ]
604 | },
605 | {
606 | "cell_type": "code",
607 | "metadata": {
608 | "ExecuteTime": {
609 | "end_time": "2020-10-11T14:16:45.236139Z",
610 | "start_time": "2020-10-11T14:16:45.231847Z"
611 | },
612 | "id": "lButqpZMHOY-"
613 | },
614 | "source": [
615 | "def gelu(x):\n",
616 | " return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))"
617 | ],
618 | "execution_count": null,
619 | "outputs": []
620 | },
621 | {
622 | "cell_type": "markdown",
623 | "metadata": {
624 | "id": "1lNlLYXHHOZF"
625 | },
626 | "source": [
627 | "## 缩放点乘注意力计算\n",
628 | "\n",
629 | "- $\\text{self-att}(Q,K,V) = \\text{softmax}\\left(\\frac{Q \\cdot K^T}{\\sqrt{d_k}}\\right) \\cdot V$"
630 | ]
631 | },
632 | {
633 | "cell_type": "code",
634 | "metadata": {
635 | "ExecuteTime": {
636 | "end_time": "2020-10-11T14:16:46.062685Z",
637 | "start_time": "2020-10-11T14:16:46.053866Z"
638 | },
639 | "id": "zRBsXE3vHOZH"
640 | },
641 | "source": [
642 | " class ScaledDotProductAttention(nn.Module): \n",
643 | " \"\"\"\n",
644 | " Scaled Dot-Product Attention\n",
645 | " \"\"\"\n",
646 | " def __init__(self):\n",
647 | " super(ScaledDotProductAttention, self).__init__()\n",
648 | " \n",
649 | " def forward(self, Q, K, V, attn_mask):\n",
650 | " \"\"\"\n",
651 | " Args:\n",
652 | " Q: [batch_size, n_heads, seq_len, d_k]\n",
653 | " K: [batch_size, n_heads, seq_len, d_k]\n",
654 | " V: [batch_size, n_heads, seq_len, d_k]\n",
655 | " Return:\n",
656 | " self-attention后的张量,以及attention张量\n",
657 | " \"\"\"\n",
658 | " # [batch_size, n_heads, seq_len, d_k] * [batch_size, n_heads, d_k, seq_len] = [batch_size, n_heads, seq_len, seq_len]\n",
659 | " score = torch.matmul(Q, K.transpose(-2, -1)) / np.sqrt(d_k)\n",
660 | " \n",
661 | " # attn_mask为True的位置对应PAD token\n",
662 | " # BERT只有编码器,这里的mask只用于屏蔽PAD:把softmax输入中这些位置的值设置为很大的负数(近似-∞)\n",
663 | " score = score.masked_fill_(attn_mask, -1e9) # mask==0的内容填充-1e9,使得计算softmax时概率接近0\n",
664 | " \n",
665 | " attention = F.softmax(score, dim = -1) # [bz, n_hs, seq_len, seq_len]\n",
666 | " context = torch.matmul(attention, V) # [batch_size, n_heads, seq_len, d_k]\n",
667 | " \n",
668 | " return context"
669 | ],
670 | "execution_count": null,
671 | "outputs": []
672 | },
673 | {
674 | "cell_type": "markdown",
675 | "metadata": {
676 | "id": "JlRcTyfzHOZQ"
677 | },
678 | "source": [
679 | "## Multi-Head Attention"
680 | ]
681 | },
682 | {
683 | "cell_type": "code",
684 | "metadata": {
685 | "ExecuteTime": {
686 | "end_time": "2020-10-11T14:16:47.009304Z",
687 | "start_time": "2020-10-11T14:16:46.994583Z"
688 | },
689 | "id": "-8xKRvdBHOZS"
690 | },
691 | "source": [
692 | "class MultiHeadAttention(nn.Module):\n",
693 | " def __init__(self):\n",
694 | " super(MultiHeadAttention, self).__init__()\n",
695 | " self.W_Q = nn.Linear(d_model, d_k * n_heads) # 其实就是[d_model, d_model]\n",
696 | " self.W_K = nn.Linear(d_model, d_k * n_heads)\n",
697 | " self.W_V = nn.Linear(d_model, d_v * n_heads)\n",
698 | "\n",
699 | " def forward(self, Q, K, V, attn_mask): # Q和K: [batch_size, seq_len, d_model], V: [batch_size, seq_len, d_model], attn_mask: [batch_size, seq_len, seq_len]\n",
700 | " residual, batch_size = Q, Q.size(0)\n",
701 | " # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)\n",
702 | " q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2) # q_s: [batch_size, n_heads, seq_len, d_k]\n",
703 | " k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2) # k_s: [batch_size, n_heads, seq_len, d_k]\n",
704 | " v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2) # v_s: [batch_size, n_heads, seq_len, d_v]\n",
705 | "\n",
706 | " attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) # attn_mask : [batch_size, n_heads, seq_len, seq_len]\n",
707 | "\n",
708 | " # context: [batch_size, n_heads, seq_len, d_v], attn_mask: [batch_size, n_heads, seq_len, seq_len]\n",
709 | " context = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)\n",
710 | " context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) # context: [batch_size, seq_len, n_heads, d_v]\n",
711 | " \n",
712 | " output = nn.Linear(n_heads * d_v, d_model)(context)\n",
713 | " \n",
714 | " return nn.LayerNorm(d_model)(output + residual) # output: [batch_size, seq_len, d_model]"
715 | ],
716 | "execution_count": null,
717 | "outputs": []
718 | },
719 | {
720 | "cell_type": "markdown",
721 | "metadata": {
722 | "id": "ppWRazRTHOZY"
723 | },
724 | "source": [
725 | "## 前馈网络\n",
726 | "\n",
727 | "- Position_wise_Feed_Forward"
728 | ]
729 | },
730 | {
731 | "cell_type": "code",
732 | "metadata": {
733 | "ExecuteTime": {
734 | "end_time": "2020-10-11T14:16:47.874045Z",
735 | "start_time": "2020-10-11T14:16:47.867372Z"
736 | },
737 | "id": "35A60BrcHOZa"
738 | },
739 | "source": [
740 | "class PoswiseFeedForwardNet(nn.Module): # 前向传播,线性激活再线性\n",
741 | " def __init__(self):\n",
742 | " super(PoswiseFeedForwardNet, self).__init__()\n",
743 | " self.fc1 = nn.Linear(d_model, d_ff)\n",
744 | " self.fc2 = nn.Linear(d_ff, d_model)\n",
745 | "\n",
746 | " def forward(self, x):\n",
747 | " # [batch_size, seq_len, d_model] -> [batch_size, seq_len, d_ff] -> [batch_size, seq_len, d_model]\n",
748 | " return self.fc2(gelu(self.fc1(x)))"
749 | ],
750 | "execution_count": null,
751 | "outputs": []
752 | },
753 | {
754 | "cell_type": "markdown",
755 | "metadata": {
756 | "id": "ZcC2rYvmHOZj"
757 | },
758 | "source": [
759 | "## 编码层EncoderLayer\n",
760 | "\n",
761 | "源码中 `Bidirectional Encoder = Transformer (self-attention)`\n",
762 | "\n",
763 | "`Transformer = MultiHead_Attention + Feed_Forward with sublayer connection`,下面代码省去了sublayer。"
764 | ]
765 | },
766 | {
767 | "cell_type": "code",
768 | "metadata": {
769 | "ExecuteTime": {
770 | "end_time": "2020-10-11T14:16:48.545383Z",
771 | "start_time": "2020-10-11T14:16:48.538822Z"
772 | },
773 | "id": "Ut0dr87YHOZl"
774 | },
775 | "source": [
776 | "class EncoderLayer(nn.Module): #多头注意力和前向传播的组合\n",
777 | " def __init__(self):\n",
778 | " super(EncoderLayer, self).__init__()\n",
779 | " self.enc_self_attn = MultiHeadAttention()\n",
780 | " self.pos_ffn = PoswiseFeedForwardNet()\n",
781 | "\n",
782 | " def forward(self, enc_inputs, enc_self_attn_mask):\n",
783 | " enc_outputs = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask) # enc_inputs to same Q,K,V\n",
784 | " enc_outputs = self.pos_ffn(enc_outputs) # enc_outputs: [batch_size, seq_len, d_model]\n",
785 | " return enc_outputs"
786 | ],
787 | "execution_count": null,
788 | "outputs": []
789 | },
790 | {
791 | "cell_type": "markdown",
792 | "metadata": {
793 | "id": "fisjyyYVHOZx"
794 | },
795 | "source": [
796 | "## BERT模型"
797 | ]
798 | },
799 | {
800 | "cell_type": "code",
801 | "metadata": {
802 | "ExecuteTime": {
803 | "end_time": "2020-10-11T14:16:49.113277Z",
804 | "start_time": "2020-10-11T14:16:49.098177Z"
805 | },
806 | "id": "zEmwiYC0HOZz"
807 | },
808 | "source": [
809 | "class BERT(nn.Module):\n",
810 | " def __init__(self):\n",
811 | " super(BERT, self).__init__()\n",
812 | " self.embedding = BertEmbedding()\n",
813 | " self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])\n",
814 | " self.fc = nn.Sequential(\n",
815 | " nn.Linear(d_model, d_model),\n",
816 | " nn.Dropout(0.5),\n",
817 | " nn.Tanh(),\n",
818 | " )\n",
819 | " self.classifier = nn.Linear(d_model, 2)\n",
820 | " self.linear = nn.Linear(d_model, d_model)\n",
821 | " self.activ2 = gelu\n",
822 | " # fc2 is shared with embedding layer\n",
823 | " embed_weight = self.embedding.tok_embed.weight \n",
824 | " self.fc2 = nn.Linear(d_model, vocab_size, bias=False)\n",
825 | " self.fc2.weight = embed_weight\n",
826 | "\n",
827 | " # input_ids和segment_ids的shape[batch_size, seq_len],masked_pos的shape是[batch_size, max_pred]\n",
828 | " def forward(self, input_ids, segment_ids, masked_pos): \n",
829 | " output = self.embedding(input_ids, segment_ids) # [bach_size, seq_len, d_model]\n",
830 | "\n",
831 | " enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids)# [batch_size, seq_len, seq_len]\n",
832 | " for layer in self.layers: # 这里对layers遍历,相当于源码中多个transformer_blocks\n",
833 | " output = layer(output, enc_self_attn_mask) # output: [batch_size, seq_len, d_model]\n",
834 | "\n",
835 | " # it will be decided by first token(CLS)\n",
836 | " h_pooled = self.fc(output[:, 0]) # [batch_size, d_model]\n",
837 | " logits_clsf = self.classifier(h_pooled) # [batch_size, 2] predict isNext\n",
838 | "\n",
839 | " masked_pos = masked_pos[:, :, None].expand(-1, -1, d_model) # [batch_size, max_pred, d_model]\n",
840 | " h_masked = torch.gather(output, 1, masked_pos) # masking position [batch_size, max_pred, d_model]\n",
841 | " h_masked = self.activ2(self.linear(h_masked)) # [batch_size, max_pred, d_model]\n",
842 | " logits_lm = self.fc2(h_masked) # [batch_size, max_pred, vocab_size]\n",
843 | " \n",
844 | " # logits_lm: [batch_size, max_pred, vocab_size], logits_clsf: [batch_size, 2]\n",
845 | " return logits_lm, logits_clsf "
846 | ],
847 | "execution_count": null,
848 | "outputs": []
849 | },
850 | {
851 | "cell_type": "markdown",
852 | "metadata": {
853 | "id": "WU01ritBHOZ7"
854 | },
855 | "source": [
856 | "## 定义模型"
857 | ]
858 | },
859 | {
860 | "cell_type": "code",
861 | "metadata": {
862 | "ExecuteTime": {
863 | "end_time": "2020-10-11T14:16:50.299667Z",
864 | "start_time": "2020-10-11T14:16:49.733700Z"
865 | },
866 | "id": "T08whxgSHOZ8"
867 | },
868 | "source": [
869 | "model = BERT()\n",
870 | "criterion = nn.CrossEntropyLoss()\n",
871 | "optimizer = optim.Adadelta(model.parameters(), lr=0.001)"
872 | ],
873 | "execution_count": null,
874 | "outputs": []
875 | },
876 | {
877 | "cell_type": "markdown",
878 | "metadata": {
879 | "id": "psNxVxzQHOaF"
880 | },
881 | "source": [
882 | "# 训练模型"
883 | ]
884 | },
885 | {
886 | "cell_type": "code",
887 | "metadata": {
888 | "ExecuteTime": {
889 | "end_time": "2020-10-11T14:17:14.367211Z",
890 | "start_time": "2020-10-11T14:16:50.501600Z"
891 | },
892 | "id": "jIeAVzETHOaG",
893 | "outputId": "8de8420e-1b36-43d1-f976-25cb1f9f35de"
894 | },
895 | "source": [
896 | "for epoch in range(50):\n",
897 | " for input_ids, segment_ids, masked_tokens, masked_pos, isNext in dataloader:\n",
898 | " \n",
899 | " # logits_lm: [batch_size, max_pred, vocab_size]\n",
900 | " # logits_clsf: [batch_size, 2]\n",
901 | " logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos) \n",
902 | " \n",
903 | " loss_lm = criterion(logits_lm.view(-1, vocab_size), masked_tokens.view(-1)) # for masked LM\n",
904 | " loss_lm = (loss_lm.float()).mean()\n",
905 | " \n",
906 | " loss_clsf = criterion(logits_clsf, isNext) # for sentence classification\n",
907 | " loss = loss_lm + loss_clsf\n",
908 | " \n",
909 | " if (epoch + 1) % 10 == 0:\n",
910 | " print('Epoch:', '%04d' % (epoch + 1), 'loss =', '{:.6f}'.format(loss))\n",
911 | " \n",
912 | " optimizer.zero_grad()\n",
913 | " loss.backward()\n",
914 | " optimizer.step()"
915 | ],
916 | "execution_count": null,
917 | "outputs": [
918 | {
919 | "output_type": "stream",
920 | "text": [
921 | "Epoch: 0010 loss = 1.908749\n",
922 | "Epoch: 0020 loss = 1.354349\n",
923 | "Epoch: 0030 loss = 1.131212\n",
924 | "Epoch: 0040 loss = 1.091269\n",
925 | "Epoch: 0050 loss = 0.891469\n"
926 | ],
927 | "name": "stdout"
928 | }
929 | ]
930 | },
931 | {
932 | "cell_type": "markdown",
933 | "metadata": {
934 | "id": "a-veXTwwHOaM"
935 | },
936 | "source": [
937 | "# 预测"
938 | ]
939 | },
940 | {
941 | "cell_type": "code",
942 | "metadata": {
943 | "ExecuteTime": {
944 | "end_time": "2020-10-11T14:19:14.300430Z",
945 | "start_time": "2020-10-11T14:19:14.294011Z"
946 | },
947 | "id": "K8LHxRe6HOaN",
948 | "outputId": "090a7028-9df7-4b05-d18a-7013c47700d0"
949 | },
950 | "source": [
951 | "input_ids, segment_ids, masked_tokens, masked_pos, isNext = batch[1]\n",
952 | "print(text)\n",
953 | "print('================================')\n",
954 | "print([idx2word[w] for w in input_ids if idx2word[w] != '[PAD]'])"
955 | ],
956 | "execution_count": null,
957 | "outputs": [
958 | {
959 | "output_type": "stream",
960 | "text": [
961 | "Hello, how are you? I am Romeo.\n",
962 | "Hello, Romeo My name is Juliet. Nice to meet you.\n",
963 | "Nice to meet you too. How are you today?\n",
964 | "Great. My baseball team won the competition.\n",
965 | "Oh Congratulations, Juliet\n",
966 | "Thank you Romeo\n",
967 | "Where are you going today?\n",
968 | "I am going shopping. What about you?\n",
969 | "I am going to visit my grandmother. she is not very well\n",
970 | "================================\n",
971 | "['[CLS]', 'great', 'my', 'baseball', 'team', 'won', '[MASK]', 'competition', '[SEP]', 'i', 'am', 'going', 'to', 'visit', 'my', 'grandmother', '[MASK]', 'is', 'not', 'very', 'well', '[SEP]']\n"
972 | ],
973 | "name": "stdout"
974 | }
975 | ]
976 | },
977 | {
978 | "cell_type": "code",
979 | "metadata": {
980 | "ExecuteTime": {
981 | "end_time": "2020-10-11T14:36:53.929639Z",
982 | "start_time": "2020-10-11T14:36:53.831314Z"
983 | },
984 | "id": "p5hYK0CAHOaW",
985 | "outputId": "debd5cb6-9c45-4e93-8dda-1a3d11e66d03"
986 | },
987 | "source": [
988 | "logits_lm, logits_clsf = model(torch.LongTensor([input_ids]), torch.LongTensor([segment_ids]), \n",
989 | " torch.LongTensor([masked_pos])) # batch=1\n",
990 | "# vocab_size维上求max, 输出最大值的索引,第一个batch的max_pred个输出\n",
991 | "logits_lm = logits_lm.data.max(2)[1][0].data.numpy()\n",
992 | "print('masked tokens list: ', [pos for pos in masked_tokens if pos != 0])\n",
993 | "print('predict masked tokens list: ', [pos for pos in logits_lm if pos != 0])"
994 | ],
995 | "execution_count": null,
996 | "outputs": [
997 | {
998 | "output_type": "stream",
999 | "text": [
1000 | "masked tokens list: [4, 20, 23]\n",
1001 | "predict masked tokens list: [26, 20, 23]\n"
1002 | ],
1003 | "name": "stdout"
1004 | }
1005 | ]
1006 | },
1007 | {
1008 | "cell_type": "code",
1009 | "metadata": {
1010 | "ExecuteTime": {
1011 | "end_time": "2020-10-11T14:37:00.270089Z",
1012 | "start_time": "2020-10-11T14:37:00.262266Z"
1013 | },
1014 | "id": "crYtY_iiHOah",
1015 | "outputId": "e329e8ca-4248-459d-90df-ea168743835d"
1016 | },
1017 | "source": [
1018 | "pred = logits_clsf.data.max(1)[1].data.numpy()[0]\n",
1019 | "print('isNext : ', True if isNext else False)\n",
1020 | "print('predict isNext :', True if pred else False)"
1021 | ],
1022 | "execution_count": null,
1023 | "outputs": [
1024 | {
1025 | "output_type": "stream",
1026 | "text": [
1027 | "isNext : False\n",
1028 | "predict isNext : False\n"
1029 | ],
1030 | "name": "stdout"
1031 | }
1032 | ]
1033 | },
1034 | {
1035 | "cell_type": "code",
1036 | "metadata": {
1037 | "id": "hw3ZYr8PHOas"
1038 | },
1039 | "source": [
1040 | ""
1041 | ],
1042 | "execution_count": null,
1043 | "outputs": []
1044 | },
1045 | {
1046 | "cell_type": "code",
1047 | "metadata": {
1048 | "id": "gTDAED7mHOa2"
1049 | },
1050 | "source": [
1051 | ""
1052 | ],
1053 | "execution_count": null,
1054 | "outputs": []
1055 | },
1056 | {
1057 | "cell_type": "code",
1058 | "metadata": {
1059 | "id": "FBbBrn4wHObD"
1060 | },
1061 | "source": [
1062 | ""
1063 | ],
1064 | "execution_count": null,
1065 | "outputs": []
1066 | },
1067 | {
1068 | "cell_type": "code",
1069 | "metadata": {
1070 | "id": "ZcWaUF_jHObM"
1071 | },
1072 | "source": [
1073 | ""
1074 | ],
1075 | "execution_count": null,
1076 | "outputs": []
1077 | },
1078 | {
1079 | "cell_type": "code",
1080 | "metadata": {
1081 | "id": "67nTNfB1HObV"
1082 | },
1083 | "source": [
1084 | ""
1085 | ],
1086 | "execution_count": null,
1087 | "outputs": []
1088 | }
1089 | ]
1090 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # NLP-Project
2 |
3 | - lstm 情感分析 pytorch
4 |
--------------------------------------------------------------------------------
/nmt/en-cn/dev_mini.txt:
--------------------------------------------------------------------------------
1 | She put the magazine on the table. 她把雜誌放在桌上。
2 | Hey, what are you doing here? 嘿,你在這做什麼?
3 | Please keep this secret. 請保守這個秘密。
4 | How could things get worse? 事情怎麼變糟的?
5 | Kyoto and Boston are sister cities. 京都和波士顿是姐妹城市。
6 | Tom mostly kept to himself. 汤姆大多一个人独处。
7 | Do you like music? 你爱音乐吗?
8 | Tell me the reason why she got angry. 告诉我她为什么生气。
9 | When does the game begin? 游戏几点开始?
10 | I passed by her house yesterday. 我昨天經過她家。
11 | Are you all ready? 你們都準備好了嗎?
12 | He's not very strict about this. 他在这方面不是很严格。
13 | She is fond of singing old songs. 她喜歡唱老歌。
14 | I wish to climb Mt. Fuji again. 我希望再爬一次上富士山。
15 | More than twenty boys went there. 超過二十個男孩去了那裡。
16 | Any emotion, if it is sincere, is involuntary. 任何情绪,只要它是真诚的,就说明它是发自内心的自然流露。
17 | How long did you live there? 你住在那裡多久了?
18 | I don't understand German. 我不懂德语。
19 | I don't like her face. 我不喜歡她的臉。
20 | He substituted for his father. 子承父业。
21 | She plays the piano very well. 她鋼琴彈得很好。
22 | You are no better at remembering things than I am. 你记事情的能力并不比我好多少。
23 | Would you lend me some money? 你可以借我一些錢嗎?
24 | She hit the ball hard. 她用力地拍了球。
25 | Please serve him his meal first. 請先為他上菜。
26 | He had kept the secret to himself. 他保守著這個秘密。
27 | He graduated from Harvard University with honors. 他光榮地從哈佛大學畢業了。
28 | Tom started dating Mary three months ago. 汤姆三个月前开始和玛丽约会。
29 | All my homework is done. 我做完了所有的回家作業。
30 | My uncle never writes letters. 我的叔叔從來不寫信。
31 | It's too late to apologize. 现在道歉也迟了。
32 | I gave you a book. 我给了你一本书。
33 | Could you teach me how to play the piano? 您能教我弹钢琴吗?
34 | I only spent three dollars. 我只花了三美元。
35 | I'm sleepy. 我困了。
36 | You're really beautiful. 你真的很漂亮。
37 | Whether you succeed or not depends on your own efforts. 你成功与否取决于你自身的努力。
38 | That's what Tom requested. 那就是汤姆想要的。
39 | Have you been to London before? 你以前去過倫敦嗎?
40 | He doesn't speak our language. 他不會說我們的語言。
41 | I have a computer. 我有一台电脑。
42 | Turn right at the next corner. 在下一個轉角右轉。
43 | I ate caviar. 我吃了魚子醬。
44 | She works as a massage therapist. 她是按摩师。
45 | What do you usually eat for lunch? 你一般中午饭吃什么?
46 | That's a bad day for me. 那天我不行。
47 | I like sports. 我喜歡運動。
48 | She may come. 她可以來。
49 | We expect a lot from him. 我們對他期望很多。
50 | I took a picture of my family. 我為我的家人拍了照片。
51 | She greeted me with a smile. 她用一个微笑迎接了我。
52 | Every time I went to his place, he was studying. 每次我去他住處,他都在讀書。
53 | Only four horses were in the race. 只有四匹馬參加了比賽。
54 | That is your book. 那是你的書。
55 | Tom had a heart attack last year. 汤姆去年得了心脏病。
56 | Any comments are welcome. 欢迎作任何评论。
57 | She spread the butter on the bread. 她把奶油塗在麵包上。
58 | Don't make fun of people. 不要取笑人。
59 | Karuizawa is famous as a summer resort. 輕井澤是著名的避暑勝地。
60 | What's your favorite home-cooked food? 你最喜歡的家常菜是什麼?
61 | It makes me feel sad. 这让我感到沮丧。
62 | This car handles very easily. 这车容易开。
63 | I've never underestimated Tom. 我从没低估汤姆。
64 | Would you please have a look at these papers? 請你看看這些文件。
65 | His life after retirement was unhappy. 他退休後的生活不快樂。
66 | The plane made a perfect landing. 這架飛機完美的著陸了。
67 | It all depends on the weather. 一切都取決於天氣。
68 | We're very different. 我们很不一样。
69 | It is definite that he will go to America. 他肯定要去美国。
70 | I threw away my shoes. 我把自己的鞋子扔掉了。
71 | I'm so excited. 我很激动。
72 | Had he known what was about to happen, he would have changed his plan. 要是他知道会发生什么,他就会改变计划。
73 | You have nice skin. 你的皮膚真好。
74 | My heart was filled with happiness. 我心里充满着快乐。
75 | I didn't see him. 我没见到他。
76 | My parents usually speak to each other in French, even though my mother is a native English speaker. 我父母通常用法语对话,即使我母亲的母语是英语。
77 | Which do you like better, spring or autumn? 春天和秋天,你更喜欢哪个?
78 | You've been late for school more often than before. 你比以前更容易上課遲到了。
79 | You're partially correct. 你部分正确。
80 | I will give you whatever you want. 我會給你任何你想要的東西。
81 | Women tend to live longer than men. 女人往往比男人活得更長。
82 | They don't seem to be Americans. 他們似乎不是美國人。
83 | She laid the work on him. 她派他去工作了。
84 | They were listening to the radio. 他們在聽收音機。
85 | I always thought that Tom was a bit different. 我以前总认为汤姆是有一些与众不同的。
86 | He can speak French, and obviously English. 他能说法语,很明显还有英语。
87 | Everyone admired his courage. 每個人都佩服他的勇氣。
88 | He did not think he needed their protection. 他认为他不需要他们的保护。
89 | What are you staring at? 你在看什麼?
90 | Give him an inch and he'll take a yard. 得寸进尺。
91 | The cough syrup has a licorice flavoring. 咳嗽糖浆有股甘草的味道。
92 | The meat has gone bad. 這肉已經壞了。
93 | Six divided by two is three. 六除以二得三。
94 | Ask him if he can speak Japanese. 问问他会不会说日语。
95 | If you see a mistake, then please correct it. 如果你发现错误,那就请你纠正它。
96 | What time is it? 几点了?
97 | Come here by ten at the latest. 最晚十點前來這裡。
98 | Why did you show me this? 你为什么让我看这个?
99 | I'm free tonight. 我今晚有空。
100 | I want to be the one who decides. 我想成为决策的人。
101 | Tom has been struck by lightning three times. 湯姆被閃電擊中過三次。
102 | Here is your book. 這是你的書。
103 | Tom, I want to have a chat with you. Tom,我想和你談談。
104 | Tom wrote his name in the sand with a stick. 汤姆用棍子在沙上写了他的名字。
105 | It's as cold as ice. 它冷得像冰一樣。
106 | My brother gave me a cute doll. 我哥哥給了我一個可愛的娃娃。
107 | I was a student at that time. 我当时是学生。
108 | It was not long before we met again by chance. 没多久,我们又碰巧遇到了。
109 | Everybody will die. 人固有一死。
110 | She had the nerve to speak out. 她竟敢說出來。
111 | She was holding a small parasol in her hand. 她手里握着一把小阳伞。
112 | He passed the entrance examination. 他通過了入學考試。
113 | A friend of mine is studying abroad. 我有一位朋友在國外留學。
114 | I get along with my younger brother. 我與我的弟弟相處融洽。
115 | His house is somewhere about here. 他家在这儿某处。
116 | This ticket is good for three days. 這張票的有效期是三天。
117 | Do you have any smaller sizes? 你有任何比較小的尺寸嗎?
118 | I gave careful consideration to the problem. 我仔細地考慮了這個問題。
119 | Answer the question. 回答問題。
120 | I must help my mother. 我必須幫忙我母親。
121 | What's your home address? 你家的地址是什麼?
122 | The prince fell in love with a woodcutter's daughter. 王子愛上了一個樵夫的女兒。
123 | I think you're a really nice guy. 我認為你真的是一個好人。
124 | A burglar broke into his house. 一個竊賊闖進了他的房子。
125 | Jealousy was the motive for the murder. 嫉妒是謀殺的動機。
126 | My sister became a college student. 我妹妹成為了一個大學生。
127 | They are at lunch. 他们在吃午饭。
128 | You should know that's impossible. 你应该知道这是不可能的。
129 | I forgot it in the garage. 我把它忘在车库里了。
130 | I do not allow sleeping in class. 我不允许有人在课上睡觉。
131 | Tom has a lot of experience in computers. 汤姆对电脑有很多经验。
132 | It cost me ten thousand yen to have my television set repaired. 把我的電視機修好花了我一萬日元。
133 | I made some mistakes on the test. 我在考试时犯了些错。
134 | I have already finished the job. 我已經完成了這項工作。
135 | Do you have something to say? 您有什么事要说吗?
136 | They sank ten enemy ships. 他们使10艘敌船沉了
137 | Everything went according to plan. 一切按計劃進行。
138 | Is one thousand yen enough? 1000日元够不够?
139 | Move the chair nearer to the desk. 把椅子挪一挪靠近桌子。
140 | He had no luck in finding work. 他不幸找不到工作。
141 | This chair is ugly. 這把椅子很醜。
142 | He doesn't realise that he's tone deaf. 他不知道他自己五音不全。
143 | He barely passed the examination. 他勉強地通過了考試。
144 | I don't want any excuses. 我不想听解释。
145 | Don't waste your time. 别浪费时间。
146 | He looks a bit tired, doesn't he? 他看起來有點累,不是嗎?
147 | My brother is a high school student. 我哥哥是個高中生。
148 | It's pretty heavy. 它真重。
149 | Tom didn't write back to Mary. 汤姆没给玛丽写回复。
150 | The meaning of this sentence is obscure. 这句句子意思模糊。
151 | We'll be in Boston for another three weeks.	我們還會在波士頓待三個星期。
152 | It would take forever for me to explain everything. 要都解释的话,需要一辈子的时间。
153 | It would take me too much time to explain to you why it's not going to work. 给你解释这为什么行不通要花很多时间。
154 | I try not to think about it. 我試著不去想了。
155 | I don't worry about the risk. 我不担心风险。
156 | Tom was there physically, but not mentally. 汤姆人在心不在。
157 | Is that better? 那更好吗?
158 | The United States borders Canada. 美国与加拿大相邻。
159 | Let's take a ten-minute break. 讓我們休息10分鐘。
160 | I'm as hungry as a horse. 我餓得像匹馬。
161 | So you give up, right? 所以你放弃了,是吗?
162 | I'd like a glass of water, please. 請給我一杯水。
163 | I hear they're pretty good. 我听说他们挺好。
164 | I would like fruit juice. 我想要果汁。
165 | He's accustomed to traveling. 他習慣了旅行。
166 | I started thinking about Tom. 我开始想起汤姆。
167 | Many attended his funeral. 很多人都参加了他的葬礼。
168 | My baby began crying, asking for milk. 我的宝宝开始哭了,他想要吃奶。
169 | Don't walk alone after dark. 不要一個人在黑暗中走。
170 | He ate all of the apple. 他吃了所有的蘋果。
171 | We'll save a seat for you. 我们会给你留个位置。
172 | Training will be provided. 会有训练。
173 | It has been raining since early morning.	從清晨開始一直下雨。
174 | My parents' generation went through the war. 我父母那一代经历过战争。
175 | Please be quiet, everybody. 请大家都保持安静。
176 | His little brother is a famous soccer player. 他弟弟是个有名的足球选手。
177 | Let's go by bus. 讓我們坐公共汽車去。
178 | What's that bird called? 那隻鳥叫什麼名字?
179 | Would you like to come? 你愿意来吗?
180 | We want to hear it. 我们想听听。
181 | I'm planning to study tonight. 我打算今天晚上讀書。
182 | There was scarcely any money left. 几乎没剩下钱。
183 | You can drive a car, can't you? 你會開車,不是嗎?
184 | Tears rolled down my cheeks. 泪水沿着我的面颊流了下来。
185 | He suddenly became very happy. 他突然變得非常開心。
186 | I am sixteen years old. 我16岁了。
187 | I had a nice chat with her. 我和她聊得很愉快。
188 | I tried to convince Tom to come home. 我试着说服汤姆回家。
189 | I don't know how to handle children. 我不知道如何對待孩子。
190 | He has been to many places. 他去过很多地方。
191 | I put the money into the safe. 我把錢放入保險櫃裡。
192 | Someone is watching you. 有人在看著你。
193 | Nobody had ever heard of it. 以前從來沒有人聽說過。
194 | "He'd like to have a coffee after work." "I would too." "他想在下班後喝杯咖啡。" "我也想。"
195 | There are not many books on these shelves. 这些书架上没有很多书。
196 | Spring is my favorite season. 春天是我最喜爱的季节。
197 | You don't go to school on Sunday, do you? 你週日不上學, 對嗎?
198 | They referred to Chaucer as the father of English poetry. 他們視喬叟為英詩之父。
199 | He said nothing, which made her angry. 他什么也没说,这让她很生气。
200 | I slapped his face. 我摑了他的臉。
201 |
--------------------------------------------------------------------------------
/nmt/en-cn/test_mini.txt:
--------------------------------------------------------------------------------
1 | He knows better than to marry her. 他聰明到不會娶她。
2 | He had hoped to succeed, but he didn't. 他本希望可以成功,但是他没有。
3 | This is the worst movie I have ever seen. 这是我看过的最差劲的电影了。
4 | She's in the bath. 她在浴室。
5 | Man is the only animal that can make use of fire. 人是唯一会使用火的动物。
6 | Every one of her songs was a hit. 她的每首歌都长期备受欢迎。
7 | He said that it would probably rain. 他說很可能會下雨。
8 | I wonder if I hurt Tom's feelings. 我不知道我是不是伤害了汤姆的感情。
9 | You chose this job yourself, right? 你自己选择了这份工作,是吗?
10 | Nobody is too old to learn. 沒有人會因為太老而不能學習。
11 | He drove the car, listening to music on the radio. 他一面開車一面聽收音機的音樂。
12 | I saw it with my own eyes. 我親眼看見了。
13 | What should I bring? 我该带些什么?
14 | He is no fool. 他没疯。
15 | She went there yesterday. 她昨天去那裡。
16 | What's the forecast for tomorrow? 明天的天气预报怎么说?
17 | She felt blue. 她感到闷闷不乐的。
18 | He climbed the stairs. 他爬上了樓梯。
19 | I told Tom I was OK. 我对汤姆说我很好。
20 | Were you in America last month? 你上個月在美國嗎?
21 | Let's synchronize our watches. 讓我們校對一下錶吧。
22 | How much money do I owe you? 我欠你多少錢?
23 | I am in the habit of taking some exercise before breakfast. 我在吃早餐前有運動的習慣。
24 | We enjoyed ourselves at the party. 我們在派對上玩得很開心。
25 | I just heard something. 我只是听到了一些消息。
26 | Where did you get your camera repaired? 你在哪儿修的照相机?
27 | I am from Shikoku. 我來自四國。
28 | You broke the rules. 你触犯了规则。
29 | He's my new friend. 他是我新交的朋友。
30 | Don't throw out this magazine. I haven't read it yet. 这本杂志不要扔。我还没看呢。
31 | I'd like to work at the cafeteria. 我想在餐廳工作。
32 | The building is seven stories high. 這棟建築物有七層樓高。
33 | She lives in New York. 她住在纽约。
34 | Don't be shy. 不要害羞。
35 | This is the best restaurant that I know. 这是我所知道的最好的餐厅。
36 | Mother made us cheese sandwiches for lunch. 媽媽為我們做了乳酪三明治當午餐。
37 | I am a stranger here. 我是這裡的外地人。
38 | Where's the nearest train station? 最近的火车站在哪里?
39 | How are you? 你們好嗎?
40 | How about going out for lunch? 出去吃午飯怎樣?
41 | I'm doubling my prices. 我正在把价格涨一倍。
42 | She kept me waiting for 30 minutes. 她让我等了30分钟。
43 | Has she finished the book yet? 她讀完這本書了嗎?
44 | I called her up. 我叫她起来。
45 | I'm still angry about that. 我还是为那生气。
46 | Tom picked up the knife. 汤姆拿起了刀。
47 | He must be about forty. 他应该40岁左右吧。
48 | We want to clear up this problem.	我们想澄清这个问题
49 | Many Asians have English as a common language. 許多亞洲人以英語作為共用的語言。
50 | More often than not, she had to go in person. 很多時候她要親自去。
51 | He couldn't hold his temper any longer. 他再也忍不住气了。
52 | I want him to read this. 想让他读这个。
53 | Tom knew how to do that. 汤姆知道怎么做那事。
54 | I want to go wherever you're going. 我想去你要去的地方。
55 | Clean your room. 清掃你的房間。
56 | He grew up to be an engineer. 他長大後成為了一名工程師。
57 | She came from Canada to see me. 她從加拿大來看我。
58 | I was struck by lightning. 我被雷劈了。
59 | Please drive the car more slowly. 請開車開慢一點。
60 | Let's not argue. 我們別吵了。
61 | Let me think. 讓我想一想。
62 | Please feed the dog every day. 请每天喂一下狗。
63 | Tom's eyes were glued to the screen. Tom的眼睛被荧幕吸引住了。
64 | Do you eat it in the classroom? 您在教室裡吃它嗎?
65 | He cleared his throat. 他清了清喉嚨。
66 | The tiger cub looked like a large kitten. 小老虎看起來像隻大貓。
67 | This letter is wrongly addressed. 这封信寄错地方了。
68 | What's wrong, honey? 出什么事了,宝贝?
69 | It's your turn. 輪到你了。
70 | Those are their books.	那些是他们的书。
71 | Try it. 试试吧。
72 | Don't change your mind. 不要改變你的心意。
73 | I just cut my finger. 我剛剛切到手指了。
74 | I am getting off at the next station. 我下一站下车。
75 | He got up at five as usual. 他像往常一樣五點鐘起床。
76 | These three pretty girls are all nieces of mine. 这三个漂亮的女孩都是我的侄女。
77 | I am glad that you have succeeded. 我很高兴你们成功了。
78 | He lived abroad for many years. 他居住在國外多年。
79 | Another bottle of wine, please. 麻煩再一瓶葡萄酒。
80 | He asked me two questions. 他問了我兩個問題。
81 | I have finished cleaning my room. 我已經打掃完我的房間了。
82 | Don't let him bite you. 別讓他咬你。
83 | They made fun of Mary. 他們取笑瑪麗。
84 | I think it's time for you to grow up. 我觉得你是时候该懂事了。
85 | I asked him what his name was. 我問了他叫什麼名字。
86 | Feel free to ask any questions. 隨時問任何問題都可以。
87 | Speaking English is useful. 说英语很有用。
88 | She didn't like her husband. 她不喜歡她的丈夫。
89 | He complained about the noise. 他抱怨這個噪音。
90 | You'll never be alone. 你们永远不会一个人的。
91 | If you hurry, you will catch up with him. 如果你快一点,还能赶上他。
92 | It was dark when I reached the hotel. 我到旅馆的时候,天已经黑了。
93 | All of my friends got asked to dance except me. 除了我,其他人都被邀请去跳舞了。
94 | Where are my books? 我的書在哪?
95 | She's collecting material for a book. 她为一本书收集材料。
96 | He belongs to the camera club. 他參加攝影社。
97 | He took off his overcoat. 他脫掉了大衣。
98 | I swear I didn't do anything. 我发誓我什么也没做。
99 | Cut the chit-chat and get to work. 別閒聊了,開始工作。
100 | He works from nine to five-thirty. 他從早上九點工作到下午五點半。
101 | I think you should take the test again. 我的意见是,你该再次参加测试。
102 | We have to put off the game till next Sunday. 我们不得不把游戏搁到下周日了。
103 | Eat your soup while it is hot. 趁热喝你的汤。
104 | There are almost no books. 幾乎沒有任何書籍。
105 | I have a big dog. 我有条大狗。
106 | Have you finished breakfast yet? 你吃完早飯了嗎?
107 | The phone rang. 電話正在響。
108 | My uncle had me act as a translator. 我叔叔讓我擔任翻譯。
109 | I don't know where he lives. 我不知道他住在哪裡。
110 | I'm a big football fan. 我是足球的忠实粉丝。
111 | There is little water in the pond. 這個池塘裡的水很少。
112 | When did you begin studying English? 您什么时候开始学英语的?
113 | That architect builds very modern houses. 那个建筑师创建了非常现代的房屋。
114 | Can anyone drive? 有人会开车吗?
115 | He saluted the lady. 他向那位女士问好。
116 | He does speak well. 他真的說得很好。
117 | I have no particular reason to do so. 我没有特别的理由去做。
118 | Could you do me a favor? 請你幫我一個忙好嗎?
119 | The space race was an exciting time in history.	太空竞赛是历史上激动人心的时刻。
120 | We are at school together. 我们一起在学校。
121 | They called off their engagement. 他們解除了婚約。
122 | Tom is a true man. 汤姆是个真男人。
123 | English is a universal language and is used all over the world. 英語是一種世界性的語言,用於世界各地。
124 | Tom watched TV yesterday. Tom昨天看了电视。
125 | It's not practical. 这不实用。
126 | We have a great team. 我们有个好极了的团队。
127 | No less than three hundred dollars was needed for the work. 这个工作需要不低于300美金。
128 | Beware of the dog! 小心狗!
129 | This is a strange sentence. 這是一個奇怪的句子。
130 | Do you study every day? 你每天都学习吗?
131 | Furniture made of good materials sells well. 用优质材料做的家具卖得很好。
132 | I'm not accustomed to getting up early. 我不習慣早起。
133 | Do you speak Chinese? 你會說中文嗎?
134 | School starts in September in Europe. 歐洲的學校在九月開學。
135 | My grandson is still a baby. 我的孫子還是個嬰兒。
136 | The button came off. 這顆鈕扣脫落了。
137 | He had his hair cut short. 他把头发剪短了。
138 | I'll see you next Wednesday. 我下星期三见你。
139 | He treats me like his slave. 他對待我就像他的奴隸。
140 | She doesn't live there any more. 她不再住在那裡了。
141 | You may choose any of them. 你可以選擇他們之中的任何一個。
142 | She's not as old as Mary. 她沒有瑪麗年紀大。
143 | If you pass this test, you could graduate next month. 如果你通過這個考試,你下個月就可以畢業了。
144 | Neptune is the eighth planet of the solar system. 海王星是太阳系第八个行星。
145 | May I put it down here? 我可以把它放在这儿吗?
146 | Jesus loves you. 耶穌愛你。
147 | That's the point. 这正是问题的关键。
148 | My father died four years ago. 我的父親四年前去世了。
149 | A dolphin is a mammal. 海豚是哺乳動物。
150 | I don't like kids. 我不喜欢小孩。
151 | I recognized your voice right away. 我立刻認出了你的聲音。
152 | Tom used to work here. 汤姆过去在这里工作。
153 | You must clear the table. 你必须把桌子清理干净。
154 | The school needed a new teacher. 學校需要一個新的老師。
155 | Can you do that by yourself? 你自己一個人能做嗎?
156 | I'm really scared of spiders. 我真怕蜘蛛。
157 | I know that she is Spanish. 我知道她是西班牙人。
158 | I suppose I'd better phone Tom. 我想我给汤姆打电话比较好。
159 | He paid 1,000 yen for this book. 他花了1000日元買這本書。
160 | The fire alarm rang. 火警警报响了。
161 | You have already eaten the cake. 你已经把蛋糕吃了。
162 | Which is your book? 哪本是你們的書?
163 | I did not expect it to be that big. 我没想到它有那么大。
164 | I have ten pens more than you do. 我比你多10支钢笔。
165 | Let's get together again! 讓我們再聚在一起!
166 | According to newspaper reports, there was an airplane accident last evening. 根據報載,有一架飛機昨天晚上發生了意外。
167 | The police assembled a lot of evidence against him. 警察收集了很多对他不利的证据。
168 | Has something good happened? 發生了什麼好事嗎?
169 | A stitch in time saves nine. 小洞及時補,免遭大洞苦。
170 | Tom doesn't have any shoes on. 汤姆没穿鞋。
171 | I'm not sure. 我不确定。
172 | I want to buy this dictionary. 我想買這本字典。
173 | I've gotten better. 我已經變得好多了。
174 | The job is half done. 這項工作已經完成了一半。
175 | Tom told me that he doesn't like carbonated drinks. 汤姆告诉我他不喜欢碳酸饮料。
176 | Tom was unsatisfied with the results. 汤姆对结果不满。
177 | He died soon after the accident. 他在事故後不久就去世了。
178 | Would you please open the door? 請你開門好嗎?
179 | What I want is some peace and quiet. 我所想要的是一点平和和安静。
180 | What time did he say he'd come? 他說他幾點鐘會來?
181 | This meal is adequate for three. 这饭足够三个人吃。
182 | A devastating earthquake hit the state capital. 一場毀滅性的地震襲擊了這個州的首府。
183 | You shouldn't talk back to your parents like that. 你不應該對你父母那樣頂嘴。
184 | I need to know what you think. 我需要知道你怎么想。
185 | Facebook is blocked in China. Facebook在中國是被封鎖的。
186 | Pretend you're me. 假装你是我。
187 | I haven't finished this. 我做不了這個。
188 | Tom came to see if Mary needed any help. 汤姆过来看看玛丽有没有什么需要帮忙的。
189 | You lied to me, didn't you? 你對我說了謊, 沒有嗎?
190 | I'm always forgetting people's names. 我總是忘記別人的名字。
191 | Isn't that mine? 那是我的吗?
192 | "The good die young" is an old saying which may or may not be true. “好人不长命”是句或真或假的老话。
193 | Don't underestimate your own strength. 不要低估自己的实力。
194 | He kept a diary during the trip. 他旅行期间,写了旅游日记。
195 | A true scientist would not approach the question this way. 真正的科学家不会这样去思考。
196 | I'll see to it. 由我来做.
197 | Tom nodded approval. 汤姆点头同意。
198 | Tom denied having stolen the money. Tom否认偷了钱。
199 | Are you going to visit any other countries? 你会去访问其他国家吗?
200 | He wrote to me yesterday. 昨天他寫信給我。
201 |
--------------------------------------------------------------------------------
/nmt/en-cn/train_mini.txt:
--------------------------------------------------------------------------------
1 | Anyone can do that. 任何人都可以做到。
2 | How about another piece of cake? 要不要再來一塊蛋糕?
3 | She married him. 她嫁给了他。
4 | I don't like learning irregular verbs. 我不喜欢学习不规则动词。
5 | It's a whole new ball game for me. 這對我來說是個全新的球類遊戲。
6 | He's sleeping like a baby. 他正睡着,像个婴儿一样。
7 | He can play both tennis and baseball. 他既会打网球,又会打棒球。
8 | We should cancel the hike. 我們應該取消這次遠足。
9 | He is good at dealing with children. 他擅長應付小孩子。
10 | She will do her best to be here on time. 她会尽量按时赶来的。
11 | Why are you so good at cooking? 为什么你做饭那么拿手呢?
12 | He has recovered from his bad cold. 他从重感冒中恢复了过来。
13 | It's a dead end. 这是个死胡同。
14 | I rejected the offer. 我拒绝了报价。
15 | He often quotes Milton. 他常引用米爾頓。
16 | Mommy, may I go swimming? 媽咪,我可以去游泳嗎?
17 | Miyazaki is not what it used to be. 宮崎不是它往日的樣子了。
18 | People must love one another. 人要爱他人。
19 | Where do you have pain? 你哪裡痛?
20 | Keep oil away from the fire. 讓油遠離火。
21 | This is never going to end. 这将永远继续下去。
22 | He had a hungry look. 他面有飢色
23 | I don't know which button to push. 我不知道要按哪個按鈕。
24 | He heard the news on the radio. 他從收音機聽到了這個消息。
25 | Don't leave the water running. 不要讓水一直流。
26 | I'm opposed to any type of war. 我反對任何形式的戰爭。
27 | I had a hard day. 我过了难挨的一天。
28 | He will succeed to the throne. 他会继承王位。
29 | Please give me something hot to drink. 請給我一些熱的東西喝。
30 | Have fun. 玩得開心。
31 | The library is on the second floor. 圖書館在二樓。
32 | Look at me with your books closed. 把你的書閤起來看著我。
33 | I still love him. 我依旧爱着他。
34 | His low salary prevents him from buying the house. 他的低薪水让他买不了房。
35 | Hang your coat on the hook. 把你的外套掛在鉤子上。
36 | I can't understand his feelings. 我不明白他的感受。
37 | She didn't try to hide the truth. 她没有试图掩盖真相。
38 | Dad bought me a camera. 爸爸给我买了一个照相机。
39 | I love her and she loves me. 我愛她,她也愛我。
40 | The boy has learned to read. 男孩學會了閱讀。
41 | You are irresistible. 你是不可抗拒的。
42 | It was heartless of him to say such a thing to the sick man. 他对一个生病的男人说这种事真是没良心。
43 | He calls her up every night. 他每天晚上打電話給她。
44 | They left one after another. 他们一个接着一个地离开了。
45 | Father always has the tailor make his suits. 父親總是讓這位裁縫師為他做西裝。
46 | We got him to carry our bag. 我們讓他幫我們拿袋子。
47 | He drove the truck to Dallas. 他開卡車到達拉斯。
48 | The baby smiled at me. 宝宝对我笑了。
49 | Did he propose to you? 他向你求婚了嗎?
50 | Cheese is made from milk. 奶酪是用奶做成的。
51 | I help my mother with the housework every day. 我每天都帮我妈做家务。
52 | I spoke with him about the matter. 我跟他談過這個問題。
53 | The friend who I thought would pass the exam failed it. 那个我认为会通过考试的朋友失败了。
54 | They are very big apples. 他們是非常大的蘋果。
55 | How dare you! 你敢!
56 | He must have taken the wrong train. 他一定是搭錯火車了。
57 | I understand how to solve the problem. 我明白怎么解决问题。
58 | How many children do you have? 你有幾個小孩?
59 | He no longer lives here. 他不再住在這裡了。
60 | He is the same age as me. 他和我同岁。
61 | Industry as we know it today didn't exist in those days. 我们今天所认识的工业在那个时代不存在。
62 | That boy used to drop in on me. 那男孩儿来访过我家。
63 | I think you'll have very little difficulty in getting a driver's license. 我想你要拿到驾照根本不难。
64 | Run. 你用跑的。
65 | He told me to do it, so I did it. 他让我做,我就做了。
66 | That is not my line. 这不是我拿手的。
67 | She lives in a large house. 她住在一棟大房子裡。
68 | A doctor told me that eating eggs was bad for me. 一位医生告诉过我,吃鸡蛋对我的健康有害。
69 | She asked him to open the window. 她請他打開窗口。
70 | She has a flower in her hand. 她手上有一朵花。
71 | I have just finished my homework. 我剛剛完成我的作業。
72 | You look pale today. 你今天看上去很苍白。
73 | I'm counting how many people there are. 我正在算有多少人在那裡。
74 | I felt that my honor was at stake. 我覺得我的名譽受到了威脅。
75 | Money isn't the only thing that matters. 不只是钱关系重大。
76 | Tom just never should've done that. 汤姆就不该做那事。
77 | I don't want there to be any misunderstanding. 我不想有任何误会。
78 | He may not be happy. 他可能不高兴。
79 | The hunter shot the fox dead. 獵人射殺了狐狸。
80 | He goes to school to study every day. 他每天去学校学习。
81 | I spent last Sunday reading a novel. 我上週日花時間看了一本小說。
82 | Mary closed the door quietly. 瑪麗悄悄地關上了門。
83 | You smoke far too much. You should cut back. 你吸太多煙了。你應該少抽一點。
84 | I wish I could have spoken Spanish. 要是我會說西班牙語就好了。
85 | There's nothing I wouldn't do for Tom. 没有我不会给汤姆做的事。
86 | Tom was making French fries. 汤姆想做炸薯条。
87 | I study for 3 hours every day. 我每天讀書三個小時。
88 | He was home alone at the time. 他当时一个人在家。
89 | I'm tired of listening to his boasts. 我厭倦了聽他吹噓。
90 | Because of heavy snow, the plane from Beijing arrived 20 minutes late. 由於下大雪,從北京来的班機延遲二十分鐘。
91 | You may use my new car. 你可以使用我的新車。
92 | Where is the vodka? 伏特加在哪里?
93 | If a person has not had a chance to acquire his target language by the time he's an adult, he's unlikely to be able to reach native speaker level in that language. 如果一個人在成人前沒有機會習得目標語言,他對該語言的認識達到母語者程度的機會是相當小的。
94 | He hurried so he wouldn't miss the train.	他加紧步伐以不错过火车。
95 | Don't go out after dark. 天黑以後不要出門。
96 | They arrived here safely yesterday. 他們昨天平安抵達這裡。
97 | None of those books are useful. 這些書裡沒有一本是有用的。
98 | I just said something very stupid. 我刚说了很蠢的话。
99 | If I were you, I'd want to know what Tom is doing right now.	如果我是你,我会想知道Tom现在正在做什么。
100 | I often went fishing with him. 我經常和他去釣魚。
101 | I hope we find Tom. 我希望我们能找到汤姆。
102 | You used to be able to see the church from here. 你以前可以從這裡看到教堂。
103 | Can we do it? 我们能做到吗?
104 | I've never told anyone about this. 我没跟任何人说过这个。
105 | One thousand dollars will cover all the expenses for the party. 1000美元将负担聚会的全部费用。
106 | Fire is always dangerous. 火總是危險的。
107 | The meeting was called off. 会议取消了。
108 | They welcomed me warmly, so I felt at home. 他们这么热情的欢迎我,让我感觉家人一样。
109 | Tom pulled Mary out of the water. Tom 把Mary拉出水
110 | I have class tomorrow. 我明天有课。
111 | Tom fell asleep in class. 汤姆在课堂上睡着了。
112 | Tom can't get his car started. 汤姆没法发动他的车。
113 | This is a very beautiful flower. 這是一朵非常美麗的花。
114 | Best wishes from all of us. 我们所有人都祝福你。
115 | My aunt gave me a camera. 我的阿姨給了我一台攝影機。
116 | I don't understand music. 我不懂音乐。
117 | I wish I was young again. 我希望我再年輕一次。
118 | I'm not guilty of anything. 我没有对任何事感到有罪。
119 | A bird can fly. 鸟会飞。
120 | His office is near the train station. 他的辦公室離車站很近。
121 | A swarm of hornets attacked the children. 一窩黃蜂襲擊了孩子們。
122 | She stirred her coffee with a teaspoon. 她用茶匙搅她的咖啡。
123 | When are you going to quit smoking? 你何時要戒煙?
124 | He ran into the room. 他跑进房间内。
125 | What are we having for dinner? 我们晚饭吃什么?
126 | You'll get lost. 你会迷路的。
127 | My cell phone has a built-in digital camera. 我的手機有內建的數位相機。
128 | Hold your horses, young man. 別那麼猴急,年輕人。
129 | Six divided by two equals three. 六除以二得三。
130 | I am able to drive a car. 我會開車。
131 | I could answer all the questions. 我可以回答所有问题。
132 | Can you do bookkeeping? 你會記帳嗎?
133 | They're all the same size. 他們都是一樣的大小。
134 | What do you two do for fun? 你们两个做了什么有趣的事情呢?
135 | No one lives in this building. 没有人住在这栋楼里。
136 | I'm sick and tired of hamburgers. 我對漢堡感到厭煩了。
137 | They married when they were young. 他們在還很年輕的時候就結婚了。
138 | I would have liked to come with you, but I didn't have time. 我想和你一起去,但是我没有时间。
139 | Academic fraud is more common than you might think. 学术造假比你想象的普遍。
140 | That's exactly what I wanted to see happen. 这刚好是我想看到它发生的。
141 | I'm afraid there isn't any coffee left. 恐怕已經沒有咖啡了。
142 | While I was reading in bed last night, I fell asleep with the light on. 我昨晚在床上看书的时候点着灯就睡了。
143 | He'll be back home soon. 他很快就會回家。
144 | We sometimes swim in the lake. 我們偶爾在湖裡游泳。
145 | Charles Lindbergh made the first solo flight across the Atlantic Ocean in 1927. Charles Lindbergh於1927年成功完成了第一次獨自飛越大西洋。
146 | His shirt was stained with sauce. 他的衬衫被酱汁弄脏了。
147 | I am just going for a walk. 我只是去散散步。
148 | We found the front door locked. 我們發現前門被鎖上了。
149 | He is so heartless. 他是这么的无情。
150 | I'm playing a TV game. 我在玩電視遊樂器。
151 | Let's go now. 我们现在去吧。
152 | Control is everything. 控制就是一切。
153 | Swimming is easy. 游泳很容易。
154 | He held a package under his arm. 他挾著一個包裹。
155 | She looks pretty in that dress. 她穿上那件衣服看起來很漂亮。
156 | Why are you here? 你為什麼在這?
157 | She asked us to leave her alone. 她要求我們別吵她。
158 | Thanks very much for having me to dinner the other night. 谢谢那天晚上请我吃了饭。
159 | After they argued, they didn't speak to each other for a week. 他们争吵后,一周都没有再说话。
160 | Tom slipped out of the classroom. 湯姆溜出了教室。
161 | You ought to ask for your teacher's permission. 你應該請求你的老師允許。
162 | The clock has stopped. 時鐘已經停止了。
163 | When did your friend leave for America? 你朋友是什么时候出发去美国的?
164 | They grow flowers in the garden. 他們在花園裡種花。
165 | She told me an interesting story. 她给我讲了一个有趣的故事。
166 | I received her letter yesterday. 昨天我收到了她的信。
167 | I slept on the bus. 我在公交车上睡觉了。
168 | Stand aside. 一边站着。
169 | By the year 2020, the population of our city will have doubled. 在2020年以前,我們的城市的人口將增加一倍。
170 | Tom has been really busy recently. 汤姆最近相当忙。
171 | The swimmers were numb with cold. 游泳選手們凍僵了。
172 | Many moons orbit around Saturn. 許多衛星繞著土星運行。
173 | I spent the whole day in reading the novel. 我一整天都在看这本小说。
174 | Volcanic ash disrupted air travel. 火山灰阻礙航運。
175 | I'm surprised that you don't know about their marriage. 我很惊讶,你竟然不知道他们结婚。
176 | Mom, I'm hungry. 妈妈,我肚子饿了。
177 | He earned his living as a teacher. 他以當老師為生。
178 | She loves chocolate, too. 她也喜欢巧克力。
179 | It has been cold since yesterday. 從昨天開始變冷了。
180 | She must have been sick. 她一定是生病了。
181 | I'm not very good at swimming. 我不是很擅長游泳。
182 | Switch on the light. I can't see anything. 把灯打开。我什么都看不见了。
183 | Do you drink coffee? 你喝咖啡嗎?
184 | I promise I'll do that tomorrow. 我保證明天就做
185 | Some people never grow up. 有些人永远也长不大。
186 | I will write to you soon. 我會盡快寫信給你。
187 | I'm starving! 我饿死了!
188 | This place has a mysterious atmosphere. 这个地方有一种神秘的气氛。
189 | It takes 165 years for Neptune to orbit around the sun. 海王星繞行太陽一周要花一百六十五年。
190 | The curtain fell. 谢幕了。
191 | I expect you to be punctual. 我期待你能準時。
192 | While reading a book, I fell asleep. 我看书的时候睡着了。
193 | You shouldn't have eaten so much ice cream. 你不應該吃這麼多冰淇淋。
194 | You have a point there. 喔!你提到一個重點了。
195 | Quit gambling. 戒掉赌博吧。
196 | Cows provide us with milk. 奶牛为我们提供牛奶。
197 | She enjoyed herself at the concert. 她在音樂會上玩得很開心。
198 | She gave me a strange look. 她奇怪地看了我一眼。
199 | I am an optimist by nature. 我天生是一個樂觀主義者。
200 | He erased his speech from the tape. 他把他的那段话从磁带里删除了。
201 | Do you have a larger size? 你有比較大的尺寸嗎?
202 | Do they love each other? 他們彼此相愛嗎?
203 | Halley's Comet will come back in 2061. 哈雷彗星將在2061回來。
204 | I'm not living with him anymore. 我不再跟他一起生活了。
205 | My aunt brought me some flowers. 我阿姨給我帶來了一些花。
206 | I don't have a computer at home. 我家没有电脑。
207 | I returned the book to the library. 我把書還給圖書館。
208 | Who can tell the difference? 谁能说清不同点?
209 | I'll eat here. 我會在這裡吃飯。
210 | Tom needed treatment. 汤姆需要接受治疗。
211 | She must still be in her twenties. 她一定還只是二十幾歲。
212 | I wish Tom were my younger brother. 但願湯姆是我的弟弟。
213 | Do you eat out often? 你常常外食嗎?
214 | I can do it alone. 我可以獨自做。
215 | He is very stingy with his money. 他非常吝啬。
216 | Give me a bottle of wine. 給我一瓶葡萄酒。
217 | I really want to speak English fluently. 我很想流利地说英语。
218 | The Oscar ceremonies are Hollywood's biggest extravaganza. 奥斯卡颁奖典礼,是好莱坞最盛大的活动。
219 | Do you read French every day? 你每天读法语吗?
220 | How much is the rent per month? 一個月的租金多少?
221 | The clock stopped. 鐘停了。
222 | Who is to blame for the accident? 誰該為這次事故負責?
223 | I will give you a bike for your birthday. 你生日的时候,我送你一辆自行车。
224 | Deal with them. 解决他们。
225 | She squeezed the juice from many oranges. 她用了許多柳橙來榨汁。
226 | I've never seen that guy before. 我從沒看過那個人。
227 | It's obvious why his stomach hurts. 他為什麼會胃痛的原因很明顯。
228 | I wish I had a better memory. 但願我有好一點的記憶力。
229 | I don't feel like studying English today. 我今天不想学习英语。
230 | Sorry to have kept you waiting. 對不起讓你一直等。
231 | I don't think that she will come. 我不認為她會來。
232 | No student went to the party. 没有学生去参加派对。
233 | The roof was damaged by the storm. 屋頂被暴風雨損壞了。
234 | Today our artificial satellites are revolving around the earth. 今天我们的人造卫星正在环绕地球运转。
235 | He teaches English to his friends. 他對他的朋友教英語。
236 | Have you turned in your report? 你交報告了嗎?
237 | He wants a book to read. 他想找本書來讀。
238 | Sorry it took me so long to write to you. 對不起, 我過了這麼長的時間才回你的信。
239 | They forgot to lock the door. 他们忘了锁门。
240 | Have you answered that letter yet? 你回信了嗎?
241 | Tom told me he had a right to see it. 汤姆告诉我他有权看它。
242 | She really does like animals. 她确实喜欢动物。
243 | What's the climate there like? 那里的气候怎么样?
244 | I like children. That's why I became a teacher. 我喜欢孩子。这就是为什么我成为了教师。
245 | Here are some pictures. 這裡有一些圖片。
246 | May I have a glass of milk, please? 请问能给我一杯牛奶吗?
247 | I'll be at home in the morning. 我早上會在家。
248 | Will you show me your passport, please? 能否请您给我看一下您的护照?
249 | My father died of lung cancer. 我的父親死於肺癌。
250 | How did you enjoy the concert? 你有多喜歡這場音樂會?
251 | This is exactly what I wanted. 我想要的就是这个。
252 | That's my problem. 這是我的問題。
253 | Please let me go. 請允許我去。
254 | I've missed another chance. 我又失去了一次机会。
255 | I have a cough and a little fever. 我咳嗽,还有点发烧。
256 | I managed to get there in time. 我設法及時到那裡。
257 | The lock is broken. 锁坏了。
258 | Half of these apples are rotten. 這些蘋果的其中一半都爛了。
259 | I changed trains at Tokyo Station. 我在東京站換火車。
260 | She's in the garden planting roses. 她在花园里种玫瑰。
261 | Tom can't tell Mary his real feelings. 汤姆不能把他的真实感受告诉玛丽。
262 | The guests are in the kitchen. 客人們在廚房裏。
263 | Would you like some of those pictures? 你想要那些照片中的一些嗎?
264 | I appreciate your cooperation. 我感謝您的合作。
265 | I like swimming, but I don't like to swim here. 我喜欢游泳,但我不想在这里游。
266 | I don't mind if it's a little cold. 稍微冷一点没关系。
267 | The weatherman says there is a storm on the way. 气象学家说会有暴风雨。
268 | How about a drink after the game? 比賽結束後喝一杯怎麼樣?
269 | After you have read it, give the book back to me. 在你讀完後,把書還給我。
270 | I have a new car. 我有辆新车。
271 | I graduated from high school last year. 去年我從高中畢業了。
272 | Keep going straight. 繼續直行。
273 | Smoking is prohibited. 禁止吸烟。
274 | Get them before they get us. 先下手为强,后下手遭殃。
275 | Tom went into the kitchen and poured himself a cup of coffee. 汤姆进了厨房,给自己倒了杯咖啡。
276 | I think that's awful. 我看那糟透了。
277 | Orange juice, please. 柳橙汁,麻煩你。
278 | I continued singing. 我繼續唱歌。
279 | They rescued the boy from drowning. 他們救了這個落水的男孩。
280 | Tom is on the go day and night. 汤姆日夜兼程。
281 | You must be careful of the traffic when you cross the street. 横穿马路时你要留心。
282 | He plays golf two or three times a month. 他一个月玩两到三次高尔夫。
283 | Tom slept until noon. 汤姆睡到中午。
284 | He doesn't know how to swim. 他不会游泳。
285 | He'll do whatever you ask him to. 他會做任何你要求他做的事。
286 | Whether you like it or not doesn't matter. 你喜不喜欢没关系。
287 | You have no right to interfere in other people's affairs. 你沒有干涉他人事務的權力。
288 | I'll alert Tom. 我会警告汤姆。
289 | No problem. 没关系。
290 | You may take either the big box or the small one. 你可以拿大的盒子或是小的盒子。
291 | Do you know where Tom is waiting for us? 你知道汤姆在哪里等我们吗?
292 | I'm doing my homework. 我正在做我的作业。
293 | Nature is full of mysteries. 自然充满了神秘。
294 | You're barking up the wrong tree. 你白費力氣了。
295 | I'm not letting Tom do this by himself. 我不会让汤姆独自去做。
296 | She would often go to the theater when she was in London. 她在伦敦的时候,曾经常去剧院。
297 | We'll need a head hunting agency to find the right man for this executive position. 我們需要人力仲介公司幫我們找到合適的人來擔任這個管理職位。
298 | Can anyone believe you? 誰會相信你?
299 | He is making great progress in English. 他的英語有很大的進步。
300 | My father has been engaged in foreign trade for many years. 我父亲经营外贸多年。
301 | She has lost weight. 她的体重减轻了。
302 | I'm tired of your complaints. 我已經厭倦了你的投訴。
303 | I don't want to have an operation. 我不想接受手術。
304 | I like this color as well. 我也喜歡這顏色。
305 | I need to try. 我需要尝试。
306 | He left his umbrella on the bus. 他把雨伞忘在公交车上了。
307 | My aunt made me a new skirt. 我阿姨做了一條新裙子給我。
308 | She used to go to the movies on Sundays. 她從前會在星期天去看電影。
309 | Everyone knew Tom was the one who did it. 大家都知道是汤姆做的
310 | You might not like this beer at first. It's an acquired taste. 剛開始的時候你可能不會喜歡這個啤酒。這是需要多次品嚐去習慣它的口味。
311 | I am interested in music. 我對音樂有興趣。
312 | I don't get what you mean. 我不明白你的意思。
313 | Where in Turkey do you live? 你在土耳其哪儿生活?
314 | She showed us a beautiful hat. 她給我們看了一頂漂亮的帽子。
315 | He didn't come to the last meeting. 他最後一場會議沒來。
316 | It can be dangerous. 它有危险。
317 | I have no intention of asking him. 我不想問他。
318 | I agreed to the proposal. 我同意這項建議。
319 | It is like looking for a needle in a haystack. 这好比大海捞针。
320 | Summer has ended. 夏天已經結束。
321 | It's a very sad story. 这是一个非常悲伤的故事。
322 | We're out of stock now. 我们现在缺乏库存。
323 | Turn off the television. I can't concentrate. 把電視關掉。我無法專心。
324 | We need some more coffee. 我們需要多一點咖啡。
325 | Most boys like baseball. 大部分男生喜欢棒球。
326 | You should have told me a long time ago. 很久以前你就應該告訴我的。
327 | Swimming is good for your health. 游泳對你的健康很好。
328 | This is Uncle Tom's farm. 這是湯姆叔叔的農場。
329 | "Pass me the salt, please." "Here you are." “请把盐递给我。”“拿着。”
330 | You are a student. 你是學生。
331 | May I ask how old you are? 请问您老人家高寿?
332 | I've finished watering the flowers. 我已經澆完花了。
333 | Tom wanted Mary to say that she loved him. 汤姆想让玛丽说爱他。
334 | Please answer this question for me. 請回答我這個問題。
335 | Does she know your telephone number? 她知道您的電話號碼嗎?
336 | The picture looks better at a distance. 这幅画远看更好。
337 | I invited them to the party. 我邀請了他們參加派對。
338 | He returned from China. 他从中国回来了。
339 | The accident seemed to have something to do with the heavy snow. 事故似乎和厚厚的积雪有关。
340 | Do we have anything I can snack on? 我們有任何我可以當零食吃的東西嗎?
341 | I think his opinion is very important. 我認為他的意見非常重要。
342 | I am looking forward to Christmas. 我期待聖誕節的到來。
343 | Do you like to cook Japanese foods? 你喜歡煮日本料理嗎?
344 | Why should it be different? 为什么它应该要变得不一样?
345 | Computers are certainly playing an important role in our life, whether we like it or not. 无论我们是否喜欢电脑,它在我们的生活中始终起着重要的作用。
346 | Tom was all worn out. 湯姆完全筋疲力盡了。
347 | All the boys in class worked hard. 課堂上所有的男生都很用功。
348 | Studying abroad is very common now. 現在出國留學是很常見的了。
349 | Tom burst into tears. 汤姆泪流满面。
350 | Why do I have to do that? 我为什么一定要那么做?
351 | That bicycle is too small for you. 那輛腳踏車對你來說太小了。
352 | Do you know how to recover a deleted file? 你知道怎么恢复已删除的文件吗?
353 | Keep the dog out. 别让狗进来。
354 | I don't like novels without heroes. 我不喜欢没有英雄的小说。
355 | Is there a zoo in the park? 公园里有动物园吗?
356 | Tom suggested that I change the lock on my door. 汤姆建议我换我的门锁。
357 | I am very, very sorry. 我非常,非常抱歉。
358 | The best hairdressers are gay. 最好的理发师是同性恋。
359 | I love elderberry juice. 我愛接骨木果汁。
360 | She studied English in the morning. 她上午學習英語。
361 | The death penalty had been done away with in many states in the USA. 死刑在美国的很多州都被废除了。
362 | I understand perfectly. 我完全明白。
363 | What do you think of him? 你覺得他怎麼樣?
364 | Tom is an extraordinary man. 汤姆是个不寻常的人。
365 | The phone is ringing. 電話正在響。
366 | Could you check the tire pressure? 你能檢查一下這個輪胎的氣壓嗎?
367 | The charge for a front row seats is 5 dollars. 第一排的座位5美元。
368 | Obey your teachers. 要听老师的话。
369 | No words can express her deep sorrow. 她的悲伤无法言喻。
370 | His house was small and old. 他的房子又小又旧。
371 | You must get up a little earlier. 你该早一点起床。
372 | Please show your ticket. 请出示您的票子。
373 | He took a week off. 他休了一周的假。
374 | Could you please not smoke in this room? 请问你能不在房间里吸烟吗?
375 | I'm sorry for the late response. 我很抱歉回复晚了。
376 | My brother and I are in the same class. 我的兄弟和我在一个班级。
377 | There are about forty students in her class. 她班上大约有40个学生。
378 | I really want to know what's going on. 我真想知道发生了什么。
379 | The shell of an egg is easily broken. 蛋壳容易破碎。
380 | You've got a lot of guts. 你膽子很大。
381 | It's said that she loves him. 據說她愛他。
382 | I didn't like it. 我没有喜欢过。
383 | What do you have the first period? 你第一節課上什麼?
384 | He is now either in Rome or in Paris. 他不是在羅馬,就是在巴黎。
385 | He is a doctor by profession. 他的职业是医生。
386 | I'd like to be a guitarist. 我想要成为吉他手。
387 | Starting tomorrow, it's going to snow for a week. 雪从明天开始下,一直持续一个星期。
388 | My bicycle has a flat tire. 我的腳踏車輪胎沒氣了。
389 | Of course she can speak English. 她當然會講英語。
390 | Take things a little more seriously. 對事情比較正經嚴肅。
391 | Could I have a pillow and blanket? 給我一個枕頭和毛毯好嗎?
392 | Get out of here. 離開這裡。
393 | Stir the soup. 搅一下汤。
394 | We're almost broke. 我們快破產了。
395 | How many apples do you want? 你要多少個蘋果?
396 | There's a telephone in my room. 我的房間裡有一支電話。
397 | What prevented you from coming earlier? 為什麼你不能早點來?
398 | I don't need your help. 我不需要你的幫助。
399 | I'm at the airport now. 我现在在机场。
400 | Haven't we met before? 我们以前没见过吗?
401 | That's my final answer. 这是我的最终回答。
402 | I'm tired of her complaints. 我厭倦了她的抱怨。
403 | You must keep quiet for a few days. 你该安静几天。
404 | He asked her where her mother was. 他問她她的母親在哪裡。
405 | This law is applicable to all cases. 此法適用於所有情況。
406 | I want to go with you. 我想和你一起去。
407 | It's healthy to breathe deeply. 深呼吸有益健康。
408 | When she saw that they had no schools, she started one. 当她发现他们还没有学校,她就办了一个。
409 | I'm sure you'll love what we have on the menu tonight. 我肯定你会喜欢我们今晚的菜肴。
410 | Have you taken your medicine yet? 您已经吃过药了吗?
411 | The couple is walking hand in hand. 這對夫妻手牽手走路。
412 | There was a car accident yesterday. 昨天發生了一場車禍。
413 | Life is like a box of chocolates. 生活就像一盒巧克力。
414 | If she was displeased, she never showed it. 如果她不高興, 她從來不表現出來。
415 | I saw him running. 我看見了他跑步。
416 | Stick a stamp on the envelope. 把郵票貼在信封上。
417 | Do you want to know why I quit? 您想知道为什么我要离开吗?
418 | Who was in charge of today's party? 誰負責今天的派對?
419 | Is there an app for that? 有它的应用吗?
420 | If a sick person folds one thousand paper cranes, her wish will come true. 如果一個病人折一千隻紙鶴, 她的願望就會成真。
421 | Never forget to put out the fire. 永遠不要忘記關火。
422 | Do you plan to stay long? 你打算長時間停留嗎?
423 | I haven't seen Tom since 1988. 我從1988年起就沒有看過湯姆了。
424 | We made it out of there. 我們從那裡逃了出來。
425 | My father works at the factory. 我父亲在工厂工作。
426 | I usually go to bed before ten. 我通常在十點前上床睡覺。
427 | What is my room number? 我的房间号是多少?
428 | The man got away from the city. 這名男子逃離了這個城市。
429 | I don't blame you for doing that. 我不怪你那樣做。
430 | I missed you. 我想你。
431 | I'll give you anything that you want. 我會給你任何你想要的東西。
432 | I know the girl playing tennis. 我認識這個打網球的女孩。
433 | Thank you for inviting me to dinner. 谢谢你邀请我吃饭。
434 | She greeted us with a smile. 她面帶微笑向我們打招呼。
435 | Entering the house, I tripped over the mat. 进家门后,我被垫子绊倒了。
436 | If we knew what we were doing, it wouldn't be called research, would it? 如果我们知道我们在做什么,那么这不能称之为研究,是吗?
437 | The bookcase is level with the table. 书架和桌子齐平。
438 | There's too much salt in this soup. 这汤里盐放多了。
439 | You're an optimist. 你是个乐观主义者。
440 | The box is too heavy to carry. 這個箱子太重了無法攜帶。
441 | Tom was there this morning. 汤姆早上去过那里。
442 | I don't understand what he said. 我不明白他说的话。
443 | I treated her as my own daughter. 我把她當成是我自己的女兒一樣對待。
444 | We haven't finished eating the watermelon yet. 我們還沒吃完這個西瓜。
445 | He did not know what to say. 他不知道说什么好。
446 | Have you heard from him? 你收到他的音訊了嗎?
447 | I'll give you a little tip. 我会给你些提示。
448 | The oranges in this bag are rotten. 這個袋子裡的柳橙都爛了。
449 | There is no point in pretending to be sick. 装病是没用的。
450 | He wrote a book about a jungle adventure. 他写了一本关于丛林冒险的书。
451 | Both you and I are men. 你和我都是男人。
452 | The boy made fun of the girl. 這個男孩取笑了這個女孩。
453 | Second semester has ended. 第二學期結束了。
454 | Maybe you could draw me a picture. 你可以给我画张画。
455 | She cut the apple with a knife. 她用刀子切蘋果。
456 | This house belongs to my uncle. 這棟房子屬於我的叔叔。
457 | That car belongs in a museum. 那辆车属于一家博物馆。
458 | You can borrow my car anytime. 你隨時可以借用我的車。
459 | He is one of my neighbors. 他是我的一個鄰居。
460 | I carried the box on my shoulder. 我把盒子扛在肩上。
461 | Is this snake safe to touch? 摸這條蛇安全嗎?
462 | They told me it was your fault. 他们告诉我这是你的错。
463 | Everybody knows that he is honest. 大家都知道他是誠實的。
464 | I just don't want to lose you. 我只是不想失去你。
465 | I told them to send me another ticket. 我請他們再寄給我一張票。
466 | I saw it in the newspaper. 我在报纸上看到了它。
467 | Tom finally understood it. 汤姆最终明白了。
468 | The teacher pointed her finger at me and asked me to come with her. 教师用手指指着我,要我跟她走。
469 | Have you ever broken your glasses? 你摔坏过你的眼镜吗?
470 | Don't shout at me. 別對著我吼。
471 | News of the recent blast is all over the radio. 收音機廣播充斥著有關最近爆炸的新聞。
472 | Don't you miss anything? 难道你不想念什么吗?
473 | They were not listening to music. 他們沒在聽音樂。
474 | I do not drink coffee. 我不喝咖啡。
475 | Everybody wants to sit beside her. 大家都想坐在她旁边。
476 | You may think those shoes are in fashion, but they aren't. 你可能認為那些鞋子很時髦, 但是他們不是。
477 | He picked flowers for her. 他为她摘了些花。
478 | I can do it in a week. 我可以在一週內做。
479 | You're not satisfied, are you? 你并不满意,对吧?
480 | She weeded the garden. 她给花园除了草。
481 | Afraid of hurting his feelings, I didn't tell him the truth. 怕傷害了他的感情,我沒有告訴他真相。
482 | Tom is a hick. 汤姆是个乡巴佬。
483 | I don't want to miss my train. 我不想错过我的火车。
484 | I want to go to Seattle. 我想去西雅图。
485 | Tom has forgotten how to do that. 汤姆忘了怎么做。
486 | The army forced him to resign. 军队强迫他辞职。
487 | How about going to see a movie tonight? 今晚看电影怎么样?
488 | I need it ASAP. 我尽快需要。
489 | I'm in a bad mood today. 我今天的心情不好。
490 | In the winter, many older people slip on ice and fall down. 在冬天,很多老人在冰上滑倒。
491 | Something might have happened to her. 她可能出什么事了。
492 | Can I see that one? 我能看那個嗎?
493 | There's enough time for a quick snack. 有足夠的時間很快地吃一下點心。
494 | I stayed at home last night. 我昨晚待在家裡。
495 | I don't have the address now. 我沒有現在的地址。
496 | Do you know me? 你还认识我吗?
497 | I remember the first time. 我记得第一次。
498 | I don't think he'll say yes. 我不認為他會說好。
499 | Tom doesn't really love Mary. 汤姆不是真的爱玛丽。
500 | Money talks. 金钱万能。
501 | He hid himself behind the door. 他把自己藏在門後面。
502 | We imported meat from Argentina. 我们从阿根廷进口了肉类。
503 | Christmas is soon. 聖誕節快到了。
504 | I'm in love with her. 我愛上她了。
505 | I ran away in a hurry. 我趕快跑走了。
506 | With the weather getting worse, the departure was put off. 由于天气变差,出发延迟了。
507 | Tom's house has a nice garden. 湯姆的住宅有一個不錯的花園。
508 | I was invited to their wedding. 我被邀請參加他們的婚禮。
509 | I lost. 我迷失了。
510 | I quit smoking two years ago. 我兩年前戒菸了。
511 | Apart from some fruit, he hasn't eaten anything. 除了水果,他什么都没吃。
512 | Why do you need this money? 你為什麼需要這筆錢?
513 | These fireworks are spectacular! 这些焰火真壮观!
514 | He wants more. 他想要更多。
515 | Passengers should board the train now. 乘客应该现在下火车了。
516 | When was this university founded? 这所大学是什么时候建的?
517 | She dived into the swimming pool. 她跳入了游泳池。
518 | You should bring your passport to the bank. 你应该带护照去银行。
519 | I like oranges better than apples. 我喜歡橘子勝過蘋果。
520 | I got a farewell present from everyone. 每個人都送了我一份歡送禮物。
521 | Which tooth hurts? 哪顆牙痛?
522 | Tom didn't have enough money to pay the rent. 汤姆没有足够的钱付租金。
523 | I'm thirsty. 我渴了。
524 | I don't have a prejudice against foreign workers. 我对外籍员工没有偏见。
525 | I see her sweeping the room. 我看见她在打扫房间。
526 | Are you angry? 您生气了吗?
527 | Please pass me the butter. 请把黄油递给我。
528 | I can't do any more than this. 我無法再做下去了。
529 | Let's take the children to the zoo. 讓我們帶孩子們去動物園。
530 | I majored in chemistry at the university. 我在大學主修化學。
531 | He broke the window on purpose. 他故意打破了窗戶。
532 | I always study hard. 我總是用功讀書。
533 | The talks will last three days. 这场谈话将要持续三天。
534 | Please don't tell your parents this. 请不要告诉你父母。
535 | Dog is man's best friend. 狗是人类最好的朋友。
536 | We'll begin work soon. 我們立即開始施工。
537 | In a way you are right, but I still have doubts. 在某种程度上你是对的,但我还是有疑问。
538 | I wrote this book. 我写了这本书。
539 | He told the students to be quiet. 他告訴了學生要安靜。
540 | Didn't you lock up your car? 你没有把你的车上锁吗?
541 | What is the name of this river? 這條河叫什麼名字?
542 | She had died before I arrived. 她在我到達之前去世了。
543 | Take good care of yourself. 照顾好你自己。
544 | My underpants are wet. 我的內褲是濕的。
545 | I don't need you or anybody else. 我不需要你或别的人。
546 | He knows who they are. 他知道他们是谁。
547 | Tom can't afford this. 汤姆买不起这个东西。
548 | Tom didn't want to disappoint Mary. 汤姆不想让玛丽失望。
549 | It was very kind of you to lend me an umbrella. 你借给我伞真好。
550 | He put on the red jacket. 他穿上了紅色的外套。
551 | He offered his help to us. 他想我们提供了帮助。
552 | The moon is the earth's only satellite. 月球是地球唯一的卫星。
553 | April showers bring May flowers. 四月春雨,五月花。
554 | I went to sleep about 10 o'clock. 我在10點左右去睡覺。
555 | I went to a park this morning. 今天早上我去了公园。
556 | What's your name? 您叫什么名字?
557 | Are you afraid of Tom? 你會怕Tom嗎?
558 | We have less than five minutes to evacuate the whole building. 我们有不到五分钟来疏散整栋楼的人。
559 | She has a real knack for getting people to do what she wants. 她真的有本事让别人做她想做的事。
560 | Talking during a concert is rude. 在音樂會中說話是不禮貌的。
561 | You can hold my hand. 你能握我的手。
562 | He will arrive in Paris tomorrow. 他明天將抵達巴黎。
563 | This conference is very important. Don't miss it. 這場會議很重要,不要錯過了!
564 | Her success made her the target of jealousy. 她的成功让她成为嫉妒心的靶子。
565 | He plays the guitar very well. 他吉他彈得很好。
566 | I am disgusted with him. 我厌恶他。
567 | It is a pity that you cannot come to the party. 很遗憾您不能来派对。
568 | You should quit smoking. 你應該戒菸。
569 | What is learned in the cradle is carried to the tomb. 幼年時學的東西,一輩子不會忘記。
570 | He's eating an apple. 他正吃著一個蘋果。
571 | She is always losing her handkerchief. 她老是弄丢她的手帕。
572 | Please return the book by tomorrow. 請明天前還書。
573 | I'm proud of you. 我以你為榮。
574 | She spends her leisure time making dolls. 她利用空闲时间做布娃娃。
575 | Many kinds of birds live in Japan. 許多種鳥類住在日本。
576 | I wrote a letter to my mother. 我寫了一封信給我的母親。
577 | You and I have the same idea. 你和我有相同的想法。
578 | She should have been more careful. 她本來應該更小心的。
579 | I hope everything goes well. 我希望万事如意。
580 | I wish I could talk to you. 我希望能与你谈话。
581 | I would rather not go there alone. 我寧可不要單獨去那裡。
582 | You talk too much. 你說太多了。
583 | You must keep your eyes open. 你得留意一下。
584 | Nobody tried to help Tom. 没人试图帮汤姆。
585 | You two should get married. 你们两个应该结婚。
586 | Tom is probably pretty rich by now. 汤姆现在可能很有钱。
587 | How many does he want? 他要多少?
588 | Thank you for drawing a bird for me. 謝謝你為我畫鳥。
589 | I'm looking for a part-time job. 我正在找一份兼職的工作。
590 | I can read German, but I can't speak it. 我能看懂德语,但不会说。
591 | Something must be wrong with the machinery. 這個機器一定有什麼地方不對勁。
592 | I want beef, too. "我也要牛肉。"
593 | I'd rather stay than go. 我寧願待在這裡而不去。
594 | I had a tennis match with him. 我曾與他的比賽網球。
595 | He's a comedian. 他是喜剧演员。
596 | India is a developing country. 印度是发展中国家。
597 | He is old enough to drive. 他夠開車的年紀了。
598 | I arrived in London. 我到達倫敦了。
599 | I saw five men. 我看到了五個男人。
600 | I got up early in the morning. 我早上很早起床。
601 | I hate chemistry. 我讨厌化学。
602 | May I ask you some more questions? 我可以問你一些問題嗎?
603 | I don't understand. 不明白。
604 | May I say something? 我可以說些什麼嗎?
605 | What is this? 這是什麼啊?
606 | I am going to Hawaii next year. 我明年要去夏威夷。
607 | Don't throw rocks into the river. 不要往河裡扔石頭。
608 | I've had enough of your lying. 我听够你的谎话了。
609 | We had an early lunch at school. 我們在學校有一個早午餐。
610 | The road is too narrow for cars. 這條路太窄汽車無法通行。
611 | I do not agree with you at all. 我完全不赞成你的意见。
612 | Are you saying you don't want to go to Tom's party? 你的意思是你不想去汤姆的派对?
613 | Tom knew when Mary would be arriving. 汤姆知道玛丽什么时候来。
614 | Pride goes before a fall. 骄傲使人落后。
615 | Why did you turn down his offer? 你為什麼拒绝了他的提議?
616 | Japan is a rich country. 日本是个富有的国家。
617 | I can't stand that noise. 我不能忍受那個噪音。
618 | You'd better go. 你最好走。
619 | Turn off the TV. 关闭电视机
620 | I don't want to work. 他不爱劳动。
621 | I hope to see you. 我希望能见到你。
622 | Tom got a late start this morning. 汤姆今早出发晚了。
623 | He is not from Hokkaido. 他不是來自北海道。
624 | What subjects are you taking at school? 你在學校裡讀哪些科目?
625 | Get Tom. 找到汤姆。
626 | What is the total amount of money you spent? 你一共花了多少钱?
627 | He likes taking care of the garden. 他喜歡照顧花園。
628 | Her red dress made her stand out. 她的紅色禮服使她引人注目。
629 | At last, she solved the problem. 最後,她解決了這個問題。
630 | He made many excuses for being late. 他為遲到找了很多的藉口。
631 | Will you all be here tomorrow? 明天你一整天都會在這裡嗎?
632 | She gives me a nasty look every time she sees me. 她每次看見我都給露出厭惡的眼神。
633 | Close the door when you leave. 出去的时候把门关上。
634 | They passed by her house yesterday. 他們昨天路過她家。
635 | Excuse me, I'm lost. 不好意思,我迷路了。
636 | This is what I can do for you. 这是我能为您做的。
637 | We gladly accept your offer. 我们很高兴接受你的提议。
638 | I want to drink something cold. 我想喝冷飲。
639 | You look tired. 你看起來很疲倦。
640 | I'll go shopping tomorrow. 我明天要去购物。
641 | I would like to know her name. 我想知道她的名字。
642 | My father has five siblings. 我父親有五個兄弟姐妹。
643 | She glanced briefly at the newspaper. 她很快地瞟了一眼報紙。
644 | The parking lot is free of charge. 停车场是免费的。
645 | I will pick you up around six. 我會在六點鐘左右接你。
646 | We don't have any other choice. 我们别无选择。
647 | The sun rose over the horizon. 太陽升出了地平線。
648 | Do you feel tired? 你覺得累嗎?
649 | A green carpet will not go with this blue curtain. 绿色的毯子和这条蓝色的帘子不配。
650 | Maybe you'll succeed. 也許你會成功。
651 | I feel very sorry for your sister. 我對你姐姐感到非常遺憾。
652 | That red dress suited her. 那件紅色的洋裝適合她。
653 | You need a car if you live in the suburbs. 要住在郊区的话,汽车是必需的。
654 | How come you didn't say anything? 你為什麼都不說話?
655 | I meet him at the club. 我在俱樂部見到他。
656 | Today was fun. 今天很有趣。
657 | I don't know how to go there. 我不知道怎么去那儿。
658 | Give us two knives and four forks, please. 请给我们两把刀和四把叉。
659 | She made him a new suit. 她為他做了一套新衣服。
660 | I want to see your older sister. 我想見你姐姐。
661 | Her argument was not based on facts. 她的观点没有基于事实。
662 | The cost of living has risen. 生活費升高了。
663 | Don't show your face around here again. 你不要再出現在這裡了。
664 | I want you to stay here until I get back. 我想你待在这里直到我回来。
665 | Do you know what I mean? 你知道我的意思嗎?
666 | The accident's only survivor was a baby. 这场灾难的唯一幸存者是一个婴儿。
667 | It's just what I wanted. 我想要的就是这个。
668 | How about playing chess tonight? 今晚下棋怎麼樣?
669 | Actions speak louder than words. 行動勝於雄辯。
670 | Tom is waiting for everyone to leave. 汤姆等着每个人都离开。
671 | I don't mind hot weather. 我不在乎炎熱的天氣。
672 | He is always losing his umbrella. 他總是搞丟了他的傘。
673 | My birthday falls on Sunday. 我的生日在星期日。
674 | He plays baseball every day. 他每天都打垒球。
675 | Hokkaido is very far, isn't it? 北海道很遠,不是嗎?
676 | She's neither rich nor famous. 她既没钱,也不出名。
677 | I went skiing at Zao last winter. 我去年冬天去藏王滑雪。
678 | Please show me the menu. 請給我菜單。
679 | Give it to them. 把它给他们。
680 | I can't stand this heat. 我受不了這麼熱。
681 | Does this bus go to the beach? 這輛公車去海灘嗎?
682 | Please show me around. 請帶我到處看看。
683 | Take as many peaches as you like. 你想拿多少桃子就拿多少。
684 | I heard the song sung in French. 我听过这首歌的法语版。
685 | She's absent because she's sick. 她不在是因为病了。
686 | The problem was very difficult. 這個問題非常困難。
687 | I live in this neighborhood. 我住在這附近。
688 | You're a real friend. 你是個真正的朋友。
689 | The robot went out of control. 這個機器人失控了。
690 | If it happened to Tom, it could happen to you. 如果那發生在Tom身上,也可能會發生在你身上。
691 | He goes to school on foot. 他步行上学。
692 | I just talked to the person in charge. 我刚跟主管人谈过。
693 | He's an excellent brain surgeon. 他是一個優秀的腦外科醫生。
694 | Under no circumstances must you leave the room. 在任何情況下你都不能離開這個房間。
695 | Please call me up between seven and eight. 请在7点到8点钟之间打电话给我。
696 | They became friends in elementary school. 他們在小學時就是朋友了。
697 | They live across the river. 他們住在河對面。
698 | He is alone. 他獨自一人。
699 | He is always at home on Mondays. 他星期一總是在家。
700 | Give me some milk, too. 也給我一些牛奶。
701 | He made friends with her in America. 他與她在美國成為朋友。
702 | Are you planning to take part in the meeting? 你们准备参加会议吗?
703 | I'm sure that it'll be fun. 我确定它将会很有趣
704 | You should pay more attention to his warnings. 你應該多注意他的警告。
705 | His mother looks young for her age. 他媽媽看起來比實際年齡年輕。
706 | It happened at a quarter past eleven. 它發生在十一點一刻。
707 | I know that it is highly unlikely that anyone knows me. 我知道有人认识我的可能性微乎其微。
708 | We kept quiet. 我们保持了沉默。
709 | What's she doing? 她在做什麼?
710 | This is not a sentence. 這不是一個句子。
711 | Cover your head when you are in the sun. 當你在陽光下的時候,遮住你的頭。
712 | She believes that he is innocent. 她相信他是無辜的。
713 | The taxi has arrived. 出租车到了。
714 | Why did you paint the bench red? 为什么你把长凳漆成红色了?
715 | I'd like a window seat, please. 請給我靠窗口的位子。
716 | She always gets her own way. 她總是隨心所欲。
717 | He rushed out of the office. 他急忙出了办公室。
718 | Lansing is the state capital of Michigan. 蘭辛是密西根州的首府。
719 | He has no interest in politics. 他對政治沒有興趣。
720 | What I'm about to say is strictly between you and me. 我要说的只能是你知我知。
721 | He thanked me for coming. 他感謝我的到來。
722 | I threw a stone at the bird. 我扔了块石头打向鸟儿。
723 | I didn't get along with her. 我沒有和她相處過。
724 | We didn't break in. 我们没有打断。
725 | How did you draw this picture? 你怎么画这幅画?
726 | That's too bad. 那太糟糕了。
727 | He lives somewhere about here. 他住在這附近某個地方。
728 | The deadline is approaching. 期限近了。
729 | He was caught cheating in the exam. 他在考試中作弊時被抓到了。
730 | It is imperative for you to act at once. 您必须马上行动。
731 | I want to see him very much. 我非常想见到他。
732 | He likes to cook for his family. 他喜歡為家人做飯。
733 | I'm currently a teacher at this school. 我现在在这所学校任教。
734 | I was surprised at his strong resemblance to his father. 他像极了他父亲,这让我震惊。
735 | Her mother always accompanies her. 她母亲一直陪着她。
736 | We set a trap to catch a fox. 我们设了个陷阱来抓狐狸。
737 | Do I have to stay in the hospital? 我必須留在醫院嗎?
738 | Eighty percent of all information on computers around the world is in English. 全世界百分之八十電腦上的資訊都是用英語寫的。
739 | She really wants to go. 她特别想去。
740 | Can I speak to the head nurse? 我能跟護士長說話嗎?
741 | He didn't show up at the party. 他沒有在派對上出現。
742 | Please wrap it like a Christmas present. 請把它包裝得像一個聖誕禮物。
743 | Alcohol consumption is increasing every year. 酒的消费每年都在上升。
744 | Monopoly is a popular game for families to play. 大富翁是一個家庭玩的熱門遊戲。
745 | Don't beat around the bush. 不要拐彎抹角。
746 | I have eight brothers and sisters. 我有八個兄弟姐妹。
747 | Hurry, and you will catch the train. 快一点,你就能赶上火车了。
748 | No one voted against it. 没有人投反对票。
749 | She asked him questions. 她問了他問題。
750 | I am quite all right now. 我一切都很好。
751 | I can play Chopin. 我會彈蕭邦。
752 | He was wounded in the war. 他是在战争中受伤的。
753 | I don't like the taste of onions. 我不喜歡洋蔥的味道。
754 | He cannot afford to buy a car, much less a house. 他买不起一辆汽车,更不要说一套房子了。
755 | Don't hesitate to ask questions. 不要猶豫去問問題。
756 | Aren't you Tom? 你不是湯姆嗎?
757 | He's stronger than you. 他比你強壯。
758 | Your assistance is indispensable for us. 您的帮助对我们来说是必不可少的。
759 | Why am I still here? 为什么我还在这里?
760 | Watching TV is a passive activity. 看電視是一種被動的活動。
761 | What have you come here for? 你为什么来这儿?
762 | I want to ask you some questions about Tom. 我想问你一些关于汤姆的问题。
763 | Would you pass the salt, please? 請你把鹽遞過來好嗎?
764 | There is a television in the room. 房里有个电视机。
765 | Could you please speak a little bit more slowly? 能否請你說慢一點?
766 | I forgot the date of the meeting. 我忘了會議的日期。
767 | What is love? 愛是什麼?
768 | My legs still hurt. 我的腿还是很痛。
769 | Between you and me, I don't like our new team captain. 我就只告诉你,我不喜欢我们的新队长。
770 | Are you really going to London to study? 你要去伦敦读书是真的吗?
771 | He has already had lunch. 他已经吃过午饭了。
772 | I owe him 1,000 dollars. 我欠他1000美元。
773 | I don't have to clean my room. 我不用打扫房间。
774 | He is not an American. 他不是美国人。
775 | Turn off the gas. 把煤气关了!
776 | I have met him before. 我以前见过他。
777 | It's not that easy to learn a new language after fifty. 五十岁以后学一门新的语言不是那么容易。
778 | It's impossible to get there by noon. 中午到達那裡是不可能的。
779 | The teacher demonstrated the idea with an experiment. 这位老师用试验论证了这个想法。
780 | Everyone noticed. 所有人都注意到了。
781 | He plays tennis very well. 他打网球打得很好。
782 | That's good, isn't it? 那太好了,不是嗎?
783 | Get me a ticket, please. 請給我一張票。
784 | You only gave me fifty cents. 你只給了我五十美分。
785 | That's an interesting idea. 那是个有趣的主意。
786 | I clapped my hands. 我拍手。
787 | You could've answered that question. 你本可以回答那问题。
788 | What's your marital status? 能告诉我你的婚姻状况吗?
789 | I do not remember seeing the letter, but perhaps I read it. 我不記得我看過這封信, 但或許我讀過它。
790 | She might know that we are here. 她或许知道我们在这儿。
791 | I was ashamed of my behavior. 我对自己的行为感到羞愧。
792 | He hasn't answered my letter yet. 他还没回我的信。
793 | This writer is Russian. 这个作家是俄罗斯人。
794 | The cat got through the hedge. 貓從樹籬穿過去。
795 | Will it take long to recover? 復原需要花很長的時間嗎?
796 | I envied his new house. 我羨慕他的新房子。
797 | When does that start? 什么时候开始?
798 | She doesn't have any enemies. 她没有敌人。
799 | Winds from the sea are moist. 从海洋吹来的风感觉湿漉漉的。
800 | Do you have these shoes in my size? 你們這款鞋子有我的尺寸嗎?
801 | I wish you'd go. 我希望你去。
802 | I will stay until tomorrow. 我会待到明天。
803 | I'll prove it to you. 我会给你证明的。
804 | He kept his promise and helped his brothers. 他履行了他的承诺,并且帮助了他的兄弟。
805 | Where in Canada are you from? 你來自加拿大的哪裡?
806 | How about some more roast beef? 再多一些烤牛肉怎麼樣?
807 | Have you ever seen Tokyo Tower? 你曾看過東京鐵塔嗎?
808 | My mother is a good woman. 我媽媽是個好女人。
809 | I was unable to go to his birthday party. 我那时没法去他的生日派对。
810 | Are you planning to help them? 你打算幫助他們嗎?
811 | What time will you have breakfast? 你什麼時候吃早餐?
812 | Why me? 为什么是我?
813 | This book is beautifully illustrated. 这本书有精美的插图。
814 | The fox and the bear lived together. 這隻狐狸和這隻熊一起生活了。
815 | Do you think I'm crazy? 你认为我疯了吗?
816 | Read it once more, please. 請再讀一次。
817 | It looks like a duck. 它看起來像一隻鴨子。
818 | Are you looking for work? 你是在找工作吗?
819 | My father walks. 我爸爸走路。
820 | Hopefully, we'll enjoy our China trip. 希望我們會喜歡我們的中國之旅。
821 | Why don't you go home? 你为什么不回家呢?
822 | I'm going to check. 我正要去签到。
823 | Tom's uncle keeps a lot of sheep. 湯姆的叔叔養了很多羊。
824 | I'll treat you. 我請你。
825 | Could you take this, please? 請你拿這個好嗎?
826 | He arrived after I had left. 我走之后他到达了。
827 | I need someone to talk with. 我得找人商量一下。
828 | Skip it. 不管它。
829 | Would you like some salad? 你要來點兒沙拉嗎?
830 | I sometimes watch TV. 我有時看電視。
831 | I'm full. 我吃飽了。
832 | He is a learned man. 他是个有教养的人。
833 | I was very busy last week. 上星期我非常地忙。
834 | I will wait. 我會等。
835 | It's too dark to play baseball now. 現在太暗無法打棒球。
836 | Children like fruit juice. 孩子們喜歡果汁。
837 | She wouldn't let up until I agreed to go to the movies with her. 她不愿松开我,直到我同意和她去电影院。
838 | He asked me a question. 他問了我一個問題。
839 | Tom bought a mobile phone. 汤姆买了个手机。
840 | I hope people are satisfied. 我希望人们满意。
841 | My father retired at the age of 65. 我的父亲65岁的时候退休了。
842 | The poor people were at the mercy of the cruel dictator. 可憐的人民任憑殘暴的獨裁者處置。
843 | It's clear that there's a rather strong disagreement between the two. 很明顯的是這兩者之間有很強烈的分歧。
844 | That's no big deal. 那没什么大不了的。
845 | What she said did not make sense. 她說的話沒有道理。
846 | He is good at playing tennis. 他打网球打得很好。
847 | I recognized some of the tunes that Tom played. 我认出了些汤姆演奏的调子。
848 | I lived in Japan three years ago. 我三年前住在日本。
849 | Yesterday I spent the whole day working. 昨天我一整天都在工作。
850 | It's not something anyone can do. 这不是任何人都能做的事。
851 | Tom doesn't have a fever. 汤姆没有发烧。
852 | "Will he pass the examination?" "I am afraid not." "他會通過考試嗎?" "我怕是不會。"
853 | I'd like a glass of water, please. 我要一杯水,謝謝。
854 | They fell into the conversation immediately. 他们很快就聊起来了。
855 | What did you do last night? 你昨天晚上做什麼?
856 | He knows how to swim. 他会游泳。
857 | They got married and settled near Boston. 他們結了婚並定居在波士頓附近。
858 | I've heard that name somewhere before. 我在别处听过那个名字。
859 | I'll be in my office from ten tomorrow. 我明天十点起会在办公室里。
860 | Does he go to school on foot or by bicycle? 他走路去学校还是骑车去学校?
861 | I haven't started anything yet. 我还没开始做任何事。
862 | He often takes me for a drive. 他常常載我去兜風。
863 | How much money do you have? 你有多少錢?
864 | You were at my wedding. 你出席了我的婚礼。
865 | There is a good chance that gasoline prices will go up. 油价上涨的可能性很高。
866 | My plan is to buy a car. 我打算買輛車。
867 | He is a famous baseball player. 他是一位著名的棒球選手。
868 | He told me to wash my face. 他叫我洗脸。
869 | Thank you for your invitation. 感謝您的邀請。
870 | He went to New York on Monday. 他星期一去了紐約。
871 | Be quiet, or the baby will wake up. 安靜,否則嬰兒會醒來。
872 | English is easy to learn. 英語簡單易學。
873 | I just wish we could leave this horrible place. 我只是希望我们能离开这个可怕的地方。
874 | I have absolute trust in you. 我絕對信任你。
875 | She couldn't answer the question. 她無法回答這個問題。
876 | I sometimes still think about her. 有時候,我還是會想起她。
877 | He has been very busy this week. 他這個星期一直很忙碌。
878 | Everybody is immune to smallpox nowadays. 現今每個人都對天花免疫了。
879 | We should call the police. 我们该报警。
880 | Your proposal is worthy of being considered. 你的提议值得考虑。
881 | She looks unhappy. 她看起來不快樂。
882 | The taxi picked up two passengers. 這輛計程車載了兩名乘客。
883 | I've always wanted to climb Mt. Fuji. 我一直想登富士山。
884 | They greeted me with a smile. 他們面帶微笑向我打招呼。
885 | My father has a restaurant. 我父亲有家餐馆。
886 | I don't expect anything from you. 我不指望从你身上得到什么。
887 | He has been to Hokkaido. 他曾去過北海道。
888 | Tom gave me a pen. 湯姆給了我一枝筆。
889 | I go to church every day. 我每天上教堂。
890 | She's dieting. 她在节食中。
891 | He likes watching TV. 他喜歡看電視。
892 | Tom should've already discussed that with you. 汤姆应该已经跟你讨论过那事了。
893 | I've finished reading the book. 我看完了这本书。
894 | Our school's principal is very old. 我校校长很老了。
895 | You'll get used to it. 你会习惯的。
896 | Tom's house has three bedrooms. Tom的房子有三個房間。
897 | He's a tall boy. 他是一个高大的男孩。
898 | I can't believe you are eating something the doctor has told you repeatedly you shouldn't eat. 真的不敢相信你在吃医生叮嘱不要食用的东西。
899 | You don't need to work on Sundays. 星期天的時候,你不用工作。
900 | Don't throw a stone at the dog. 不要对狗丢石头。
901 | He makes the most of his opportunities. 他充分利用他的機會。
902 | Black suits you. 黑色很衬你。
903 | Give me something to write with. 給我些可以寫字的東西。
904 | They must have made a mistake. 他們一定是犯錯了。
905 | Would you like to drink anything? 你想喝點什麼嗎?
906 | How much sugar do you use? 你用多少糖?
907 | She showed the visitor her baby. 她给客人看了她的宝宝。
908 | You have to be patient. 你必須有耐心。
909 | Could I borrow a pencil? 我能借支铅笔吗?
910 | All plants need water and light. 所有的植物都需要陽光和水。
911 | One of your buttons has come off. 你的一個按鈕脫落了。
912 | You know too much. 你知道得太多了。
913 | We didn't know what to do next. 我們不知道下一步要做什麼。
914 | This is not important. 這個不重要。
915 | He died of lung cancer. 他死於肺癌。
916 | Have you had your eyesight checked recently? 你最近检查视力了吗?
917 | What's the weight of your suitcase? 你的行李多重?
918 | What happened to you last night? 昨晚你發生了什麼事?
919 | I don't smoke weed. 我不吸大麻。
920 | I asked her out on a date. 我請她出去約會。
921 | I don't know if I'll have time to do it. 我不知道我是否有时间做。
922 | Excuse me for interrupting you. 對不起,打擾你了。
923 | Do you want to be rich? 你想致富嗎?
924 | It's on the sofa. 它在沙發上。
925 | I knew what Tom was doing. 我知道汤姆在做什么。
926 | I respect the elderly. 我尊敬长辈。
927 | A bookstore in that location wouldn't make enough money to survive. 那個地點的書店無法賺足夠的錢生存下去。
928 | I know the problem. 我知道问题。
929 | Where did you see her? 你在哪儿看到了她?
930 | Will you switch seats with me? 您愿意跟我换座位吗?
931 | She has been absent since last Wednesday. 从上周三起,她一直缺席。
932 | Tom hugged Mary. 汤姆拥抱了玛丽。
933 | My father made me a delicious lunch. 我父親為我做了一頓美味的午餐。
934 | The wind calmed down. 风停了。
935 | Be careful not to drive the wrong way on a one-way street. 小心不要在单行道逆向行驶。
936 | A plastic dish will melt on the stove. 塑料盘子在烤箱里会化的。
937 | It cost him 50 dollars to rent a car in Hawaii. 他花了50美元在夏威夷租了一辆汽车。
938 | You work as hard as he did at your age. 你跟他在你這個年紀時一樣努力工作。
939 | The United States is abundant in natural resources. 美国的自然资源很丰富。
940 | Please write with a pen. 请用钢笔写。
941 | I want Tom to win. 我想让汤姆赢。
942 | Many of the immigrants changed their names. 许多移民改了名字。
943 | French is my mother tongue. 法语是我的母语。
944 | My brother takes care of our dog. 我弟弟照顧我們的狗。
945 | Give some meat to the dog. 給這隻狗一些肉。
946 | It's a pleasure to have you with us again. 真高兴你又跟我们在一起了。
947 | We generally drink tea after a meal. 我們通常飯後喝茶。
948 | I really need a drink now. 我现在真需要来杯喝的。
949 | Look at that tall building. 看那棟高樓。
950 | He and his sisters are currently living in Tokyo. 他和他的姐妹们目前都住在东京。
951 | I've lost my pen. 我弄丟了我的筆。
952 | Those are my trousers. 那些都是我的褲子。
953 | Don't be cruel to animals. 不要虐待动物。
954 | Mt. Fuji is Japan's tallest mountain. 富士山是日本最高的山。
955 | Forewarned is forearmed. 凡事要預先準備好。
956 | What prevented you from coming earlier? 什麼阻止你早點來了?
957 | I have a high fever. 我烧得很厉害。
958 | Our school is near the station. 我們學校在車站的附近。
959 | None of these buses go to Shinjuku. 這些巴士中沒有一輛去新宿。
960 | I've got things under control. 我控制住了。
961 | It's 3:30. 3点半了。
962 | That child has few friends. 那孩子沒有什麼朋友。
963 | Being a good conversationalist does not just mean being a good speaker of English. 作為一個良好的交談者,並不只意味著作一個英語說得好的說話者。
964 | My parents have gone to the airport to see my uncle off. 我父母去机场送我叔叔了。
965 | Shortly after the accident, the police came. 事故發生後不久,警察來了。
966 | Not everything can be bought with money. 不是所有的東西都可以用金錢買到。
967 | Professional writers do not have a regular income. 專職作家沒有固定的收入。
968 | I saw a strange woman there. 我看见一位陌生女人在那儿。
969 | You have to speak English here. 你在這裡必須說英語。
970 | What did you major in at college? 你大學時主修什麼?
971 | Tom probably knew who I was. 汤姆可能知道我是谁。
972 | You are her daughters. 你是她的女兒。
973 | Stand up. 起立。
974 | Classes begin next week. 课程下周开始。
975 | I am afraid of death. 我怕死。
976 | Please do not open the windows. 请不要开窗。
977 | Write it down before you forget it. 在你忘記之前把它寫下來。
978 | I have not heard from her recently. 我最近沒有收到她的信。
979 | Don't open that. 别打开那个。
980 | Rice is sold by the kilogram. 米以公斤為單位來出售。
981 | I know what you did. 我知道你做了什么。
982 | I'd like seats on the first floor. 我想要一樓的座位。
983 | Please send me a letter. 請寄信給我。
984 | We go to the same school. 我們上同一所學校。
985 | I asked for a table over there. 我在那裡要了一張桌子。
986 | She stayed in the house all day. 她整天待在房子裡。
987 | What time do you leave for school? 你幾點鐘去學校?
988 | Tom didn't understand what the teacher said. 汤姆没明白老师说了什么。
989 | What souvenir do you think she would like most? 你觉得她最想要什么纪念品?
990 | She has an eye for beauty. 她有一雙美麗的眼睛。
991 | They were carelessly unaware of the danger. 他们粗心大意,还没意识到危险。
992 | Let me know when you need me again. 再需要我就告诉我。
993 | I don't want to be friends with you. 我不想跟你交朋友。
994 | I'm old enough to live by myself. 我年紀夠大了可以自己一個人住。
995 | Does the letter need to be written in English? 這封信需要用英文寫嗎?
996 | I should be happy. 我該高興。
997 | Rome wasn't built in a day. 罗马不是一天建成的。
998 | A little walk will give you a good appetite for breakfast. 散散步將會給你很好的食慾吃早餐。
999 | You should talk directly to Tom. 你应该直接跟汤姆说。
1000 | Are you able to play organ? 你能演奏管风琴吗?
1001 |
--------------------------------------------------------------------------------
/nmt/en-fr/_about.txt:
--------------------------------------------------------------------------------
1 | ** Info **
2 |
3 | Check for newest version here:
4 | http://www.manythings.org/anki/
5 | Date of this file:
6 | 2018-10-27
7 |
8 | This data is from the sentences_detailed.csv file from tatoeba.org.
9 | http://tatoeba.org/files/downloads/sentences_detailed.csv
10 |
11 |
12 |
13 | ** Terms of Use **
14 |
15 | See the terms of use.
16 | These files have been released under the same license as the
17 | source.
18 |
19 | http://tatoeba.org/eng/terms_of_use
20 | http://creativecommons.org/licenses/by/2.0
21 |
22 | Attribution: www.manythings.org/anki and tatoeba.org
23 |
24 |
25 |
26 | ** Warnings **
27 |
28 | The data from the Tatoeba Project contains errors.
29 |
30 | To lower the number of errors you are likely to see, only
31 | sentences by native speakers and proofread sentences have
32 | been included.
33 |
34 | For the non-English language, I made these (possibly wrong)
35 | assumptions.
36 | Assumption 1: Sentences written by native speakers can be
37 | trusted.
38 | Assumption 2: Contributors to the Tatoeba Project are honest
39 | about what their native language is.
40 |
41 | For English, I used the sentences that I have proofread
42 | and thought were OK.
43 | Of course, I may have missed a few errors.
44 |
45 |
46 |
47 | ** Downloading Anki **
48 |
49 | See http://ankisrs.net/
50 |
51 |
52 |
53 | ** Importing into Anki **
54 |
55 | Information is at http://ankisrs.net/docs/manual.html#importing
56 |
57 | Of particular interest may be the section about "duplicates" at http://ankisrs.net/docs/manual.html#duplicates-and-updating.
58 | You can choose:
59 | 1. not to allow duplicates (alternate translations) as cards.
60 | 2. to allow duplicates (alternate translations) as cards.
61 |
--------------------------------------------------------------------------------