├── .gitignore ├── 1.Emotional analysis+lstm+pytorch.ipynb ├── 2.使用Bert预训练模型微调中文文本分类.ipynb ├── 3.Pytorch+seq2seq机器翻译模型+attention+英翻中.ipynb ├── Bert手写版本+MLM+NSP.ipynb ├── README.md └── nmt ├── en-cn ├── cmn.txt ├── dev.txt ├── dev_mini.txt ├── test.txt ├── test_mini.txt ├── train.txt └── train_mini.txt └── en-fr ├── _about.txt └── fra.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /1.Emotional analysis+lstm+pytorch.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "lstm.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [], 9 | "authorship_tag": "ABX9TyPPNzC0+L5ZLvET9UfvEjd1", 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | }, 16 | "accelerator": "GPU" 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "metadata": { 32 | "id": "1X9yjs5_7z4p", 33 | "colab_type": "code", 34 | "colab": { 35 | "base_uri": "https://localhost:8080/", 36 | "height": 800 37 | }, 38 | "outputId": "cb78ac54-4c37-4fff-ee10-6780fba2a77c" 39 | }, 40 | "source": [ 41 | "!pip install torch\n", 42 | "!pip install torchtext\n", 43 | "!python -m spacy download en\n", 44 | "!pip install torchvision\n", 45 | "\n", 46 | "# K80 gpu for 12 hours\n", 47 | "import torch\n", 48 | "from torch import nn, optim\n", 49 | "from torchtext import data, datasets\n", 50 | "\n", 51 | "print('GPU:', torch.cuda.is_available())\n", 52 | "\n", 53 | "torch.manual_seed(123)" 54 | ], 55 | "execution_count": 1, 56 | "outputs": [ 57 | { 58 | "output_type": "stream", 59 | "text": [ 60 | "Requirement already satisfied: torch in /usr/local/lib/python3.6/dist-packages (1.5.1+cu101)\n", 61 | "Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from torch) (0.16.0)\n", 62 | "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from torch) (1.18.5)\n", 63 | "Requirement already satisfied: torchtext in /usr/local/lib/python3.6/dist-packages (0.3.1)\n", 64 | "Requirement already satisfied: torch in /usr/local/lib/python3.6/dist-packages (from torchtext) (1.5.1+cu101)\n", 65 | "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from torchtext) (2.23.0)\n", 66 | "Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from torchtext) (4.41.1)\n", 67 | "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from torchtext) (1.18.5)\n", 68 | "Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from torch->torchtext) (0.16.0)\n", 69 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->torchtext) (2020.6.20)\n", 70 | "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->torchtext) (2.10)\n", 71 | "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from 
requests->torchtext) (1.24.3)\n", 72 | "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->torchtext) (3.0.4)\n", 73 | "Requirement already satisfied: en_core_web_sm==2.2.5 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz#egg=en_core_web_sm==2.2.5 in /usr/local/lib/python3.6/dist-packages (2.2.5)\n", 74 | "Requirement already satisfied: spacy>=2.2.2 in /usr/local/lib/python3.6/dist-packages (from en_core_web_sm==2.2.5) (2.2.4)\n", 75 | "Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.18.5)\n", 76 | "Requirement already satisfied: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (0.4.1)\n", 77 | "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (0.7.1)\n", 78 | "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (2.23.0)\n", 79 | "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.0.2)\n", 80 | "Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.0.2)\n", 81 | "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.0.0)\n", 82 | "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (3.0.2)\n", 83 | "Requirement already satisfied: thinc==7.4.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (7.4.0)\n", 84 | "Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (49.1.0)\n", 85 | "Requirement already satisfied: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.1.3)\n", 86 | "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (2.0.3)\n", 87 | "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (4.41.1)\n", 88 | "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (3.0.4)\n", 89 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (2020.6.20)\n", 90 | "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (2.10)\n", 91 | "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (1.24.3)\n", 92 | "Requirement already satisfied: importlib-metadata>=0.20; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->en_core_web_sm==2.2.5) (1.7.0)\n", 93 | "Requirement already satisfied: zipp>=0.5 in 
/usr/local/lib/python3.6/dist-packages (from importlib-metadata>=0.20; python_version < \"3.8\"->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->en_core_web_sm==2.2.5) (3.1.0)\n", 94 | "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n", 95 | "You can now load the model via spacy.load('en_core_web_sm')\n", 96 | "\u001b[38;5;2m✔ Linking successful\u001b[0m\n", 97 | "/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->\n", 98 | "/usr/local/lib/python3.6/dist-packages/spacy/data/en\n", 99 | "You can now load the model via spacy.load('en')\n", 100 | "Requirement already satisfied: torchvision in /usr/local/lib/python3.6/dist-packages (0.6.1+cu101)\n", 101 | "Requirement already satisfied: torch==1.5.1 in /usr/local/lib/python3.6/dist-packages (from torchvision) (1.5.1+cu101)\n", 102 | "Requirement already satisfied: pillow>=4.1.1 in /usr/local/lib/python3.6/dist-packages (from torchvision) (7.0.0)\n", 103 | "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from torchvision) (1.18.5)\n", 104 | "Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from torch==1.5.1->torchvision) (0.16.0)\n", 105 | "GPU: True\n" 106 | ], 107 | "name": "stdout" 108 | }, 109 | { 110 | "output_type": "execute_result", 111 | "data": { 112 | "text/plain": [ 113 | "" 114 | ] 115 | }, 116 | "metadata": { 117 | "tags": [] 118 | }, 119 | "execution_count": 1 120 | } 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "metadata": { 126 | "id": "4HovvE-c8BeQ", 127 | "colab_type": "code", 128 | "colab": { 129 | "base_uri": "https://localhost:8080/", 130 | "height": 136 131 | }, 132 | "outputId": "55007511-700b-400f-890b-afacefedf08f" 133 | }, 134 | "source": [ 135 | "# 为CPU设置随机种子\n", 136 | "torch.manual_seed(123)\n", 137 | "\n", 138 | "# 两个Field对象定义字段的处理方法(文本字段、标签字段)\n", 139 | "TEXT = data.Field(tokenize='spacy') # 分词\n", 140 | "LABEL = data.LabelField(dtype=torch.float)\n", 141 | "\n", 142 | "# IMDB共50000影评,包含正面和负面两个类别。数据被前面的Field处理\n", 143 | "# 按照(TEXT, LABEL) 分割成 训练集,测试集\n", 144 | "train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)\n", 145 | "\n", 146 | "print('len of train data:', len(train_data)) # 25000\n", 147 | "print('len of test data:', len(test_data)) # 25000\n", 148 | "\n", 149 | "# torchtext.data.Example : 用来表示一个样本,数据+标签\n", 150 | "print(train_data.examples[15].text) # 文本:句子的单词列表\n", 151 | "print(train_data.examples[15].label) # 标签: 积极" 152 | ], 153 | "execution_count": 3, 154 | "outputs": [ 155 | { 156 | "output_type": "stream", 157 | "text": [ 158 | "downloading aclImdb_v1.tar.gz\n" 159 | ], 160 | "name": "stdout" 161 | }, 162 | { 163 | "output_type": "stream", 164 | "text": [ 165 | "aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:08<00:00, 10.0MB/s]\n" 166 | ], 167 | "name": "stderr" 168 | }, 169 | { 170 | "output_type": "stream", 171 | "text": [ 172 | "len of train data: 25000\n", 173 | "len of test data: 25000\n", 174 | "['The', 'movie', 'is', 'a', 'bit', '\"', 'thin', '\"', 'after', 'reading', 'the', 'book', ',', 'but', 'it', \"'s\", 'still', 'one', 'of', 'the', 'greatest', 'movies', 'ever', 'made', '.', 'Sheryl', 'Lee', 'is', 'beautiful', 'and', 'Nick', 'Nolte', 'is', 'really', '\"', 'vonneguty', '\"', '.', 'He', 'makes', 'great', 'job', 'expressing', 'the', 'feelings', 'from', 'the', 'book', 'to', 'the', 'film', '.', 'Not', 'many', 'films', 'engage', 'the', 'feeling', 'of', 'the', 'book', 'as', 'well', 'as', 'Mother', 'Night', 'does', '.']\n", 175 | "pos\n" 176 | ], 177 | "name": "stdout" 178 | } 179 | ] 180 | }, 181 | 
{ 182 | "cell_type": "code", 183 | "metadata": { 184 | "id": "HbKax6528BuX", 185 | "colab_type": "code", 186 | "colab": { 187 | "base_uri": "https://localhost:8080/", 188 | "height": 100 189 | }, 190 | "outputId": "994a91d4-891f-4082-f9ba-b98026572c22" 191 | }, 192 | "source": [ 193 | "TEXT.build_vocab(train_data, max_size=10000, vectors='glove.6B.100d')\n", 194 | "LABEL.build_vocab(train_data)\n", 195 | "print(len(TEXT.vocab)) # 10002\n", 196 | "print(TEXT.vocab.itos[:12]) # ['', '', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is', 'in', 'I']\n", 197 | "print(TEXT.vocab.stoi['and']) # 5\n", 198 | "print(LABEL.vocab.stoi) # defaultdict(None, {'neg': 0, 'pos': 1})" 199 | ], 200 | "execution_count": 4, 201 | "outputs": [ 202 | { 203 | "output_type": "stream", 204 | "text": [ 205 | ".vector_cache/glove.6B.zip: 862MB [06:30, 2.21MB/s] \n", 206 | "100%|█████████▉| 398894/400000 [00:16<00:00, 23519.32it/s]" 207 | ], 208 | "name": "stderr" 209 | }, 210 | { 211 | "output_type": "stream", 212 | "text": [ 213 | "10002\n", 214 | "['', '', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is', 'in', 'I']\n", 215 | "5\n", 216 | "defaultdict(, {'neg': 0, 'pos': 1})\n" 217 | ], 218 | "name": "stdout" 219 | } 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "metadata": { 225 | "id": "vfv4g5Si8B7I", 226 | "colab_type": "code", 227 | "colab": {} 228 | }, 229 | "source": [ 230 | "batchsz = 30\n", 231 | "device = torch.device('cuda')\n", 232 | "train_iterator, test_iterator = data.BucketIterator.splits(\n", 233 | " (train_data, test_data),\n", 234 | " batch_size = batchsz,\n", 235 | " device=device\n", 236 | " )" 237 | ], 238 | "execution_count": 18, 239 | "outputs": [] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "metadata": { 244 | "id": "Ei2LDRY18B4y", 245 | "colab_type": "code", 246 | "colab": {} 247 | }, 248 | "source": [ 249 | "class RNN(nn.Module):\n", 250 | "\n", 251 | " def __init__(self, vocab_size, embedding_dim, hidden_dim):\n", 252 | " super(RNN, self).__init__()\n", 253 | "\n", 254 | " # [0-10001] => [100]\n", 255 | " # 参数1:embedding个数(单词数), 参数2:embedding的维度(词向量维度)\n", 256 | " self.embedding = nn.Embedding(vocab_size, embedding_dim)\n", 257 | " # [100] => [256]\n", 258 | " # 双向LSTM,所以下面FC层使用 hidden_dim*2\n", 259 | " self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=2,\n", 260 | " bidirectional=True, dropout=0.5) \n", 261 | " # [256*2] => [1]\n", 262 | " self.fc = nn.Linear(hidden_dim*2, 1)\n", 263 | " self.dropout = nn.Dropout(0.5)\n", 264 | "\n", 265 | " def forward(self, x):\n", 266 | " \"\"\"\n", 267 | " x: [seq_len, b] vs [b, 3, 28, 28]\n", 268 | " \"\"\"\n", 269 | " # [seq_len, b, 1] => [seq_len, b, 100]\n", 270 | " embedding = self.dropout(self.embedding(x))\n", 271 | "\n", 272 | " # output: [seq, b, hid_dim*2]\n", 273 | " # hidden/h: [num_layers*2, b, hid_dim]\n", 274 | " # cell/c: [num_layers*2, b, hid_dim]\n", 275 | " output, (hidden, cell) = self.rnn(embedding)\n", 276 | " # [num_layers*2, b, hid_dim] => 2 of [b, hid_dim] => [b, hid_dim*2]\n", 277 | " # 双向,所以要把最后两个输出连接\n", 278 | " hidden = torch.cat([hidden[-2], hidden[-1]], dim=1)\n", 279 | " # [b, hid_dim*2] => [b, 1]\n", 280 | " hidden = self.dropout(hidden)\n", 281 | " out = self.fc(hidden)\n", 282 | "\n", 283 | " return out" 284 | ], 285 | "execution_count": 6, 286 | "outputs": [] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "metadata": { 291 | "id": "Hb7WYVEmCYqG", 292 | "colab_type": "code", 293 | "colab": { 294 | "base_uri": "https://localhost:8080/", 295 | "height": 50 296 | }, 297 | "outputId": 
"a2ebe150-3aff-4880-a4bc-06f1ada66856" 298 | }, 299 | "source": [ 300 | "rnn = RNN(len(TEXT.vocab), 100, 256) #词个数,词嵌入维度,输出维度\n", 301 | "\n", 302 | "pretrained_embedding = TEXT.vocab.vectors\n", 303 | "print('pretrained_embedding:', pretrained_embedding.shape) # torch.Size([10002, 100])\n", 304 | "\n", 305 | "# 使用预训练过的embedding来替换随机初始化\n", 306 | "rnn.embedding.weight.data.copy_(pretrained_embedding)\n", 307 | "print('embedding layer inited.')" 308 | ], 309 | "execution_count": 19, 310 | "outputs": [ 311 | { 312 | "output_type": "stream", 313 | "text": [ 314 | "pretrained_embedding: torch.Size([10002, 100])\n", 315 | "embedding layer inited.\n" 316 | ], 317 | "name": "stdout" 318 | } 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": { 324 | "id": "k_OIamPtJP3L", 325 | "colab_type": "text" 326 | }, 327 | "source": [ 328 | "" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "metadata": { 334 | "id": "rbnpMTyhCYny", 335 | "colab_type": "code", 336 | "colab": { 337 | "base_uri": "https://localhost:8080/", 338 | "height": 116 339 | }, 340 | "outputId": "8342c342-2c54-488d-e780-eb882657cdac" 341 | }, 342 | "source": [ 343 | "optimizer = optim.Adam(rnn.parameters(), lr=1e-3)\n", 344 | "# BCEWithLogitsLoss是针对二分类的CrossEntropy\n", 345 | "criteon = nn.BCEWithLogitsLoss().to(device)\n", 346 | "rnn.to(device)" 347 | ], 348 | "execution_count": 20, 349 | "outputs": [ 350 | { 351 | "output_type": "execute_result", 352 | "data": { 353 | "text/plain": [ 354 | "RNN(\n", 355 | " (embedding): Embedding(10002, 100)\n", 356 | " (rnn): LSTM(100, 256, num_layers=2, dropout=0.5, bidirectional=True)\n", 357 | " (fc): Linear(in_features=512, out_features=1, bias=True)\n", 358 | " (dropout): Dropout(p=0.5, inplace=False)\n", 359 | ")" 360 | ] 361 | }, 362 | "metadata": { 363 | "tags": [] 364 | }, 365 | "execution_count": 20 366 | } 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "metadata": { 372 | "id": "4UXwTad6CYlR", 373 | "colab_type": "code", 374 | "colab": {} 375 | }, 376 | "source": [ 377 | "import numpy as np \n", 378 | "def binary_acc(preds, y):\n", 379 | "\n", 380 | " preds = torch.round(torch.sigmoid(preds))\n", 381 | " correct = torch.eq(preds, y).float()\n", 382 | " acc = correct.sum() / len(correct)\n", 383 | " return acc" 384 | ], 385 | "execution_count": 27, 386 | "outputs": [] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "metadata": { 391 | "id": "yXMx5OVwCYjL", 392 | "colab_type": "code", 393 | "colab": {} 394 | }, 395 | "source": [ 396 | "def train(rnn, iterator, optimizer, criteon):\n", 397 | " avg_acc = []\n", 398 | " rnn.train() # 表示进入训练模式\n", 399 | "\n", 400 | " for i, batch in enumerate(iterator):\n", 401 | " # [seq, b] => [b, 1] => [b]\n", 402 | " # batch.text 就是上面forward函数的参数text,压缩维度是为了和batch.label维度一致\n", 403 | " pred = rnn(batch.text).squeeze(1)\n", 404 | "\n", 405 | " loss = criteon(pred, batch.label)\n", 406 | " # 计算每个batch的准确率\n", 407 | " acc = binary_acc(pred, batch.label).item()\n", 408 | " avg_acc.append(acc)\n", 409 | "\n", 410 | " optimizer.zero_grad() # 清零梯度准备计算\n", 411 | " loss.backward() # 反向传播\n", 412 | " optimizer.step() # 更新训练参数\n", 413 | "\n", 414 | " if i % 10 == 0:\n", 415 | " print(i, acc)\n", 416 | "\n", 417 | " avg_acc = np.array(avg_acc).mean()\n", 418 | " print('avg acc:', avg_acc)\n", 419 | "\n" 420 | ], 421 | "execution_count": 22, 422 | "outputs": [] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "metadata": { 427 | "id": "XRhMMq0rCYg7", 428 | "colab_type": "code", 429 | "colab": {} 430 | }, 431 | "source": [ 432 | 
"def evaluate(rnn, iterator, criteon):\n", 433 | " avg_acc = []\n", 434 | " rnn.eval() # 表示进入测试模式\n", 435 | "\n", 436 | " with torch.no_grad():\n", 437 | " for batch in iterator:\n", 438 | " pred = rnn(batch.text).squeeze(1) # [b, 1] => [b]\n", 439 | " loss = criteon(pred, batch.label)\n", 440 | " acc = binary_acc(pred, batch.label).item()\n", 441 | " avg_acc.append(acc)\n", 442 | "\n", 443 | " avg_acc = np.array(avg_acc).mean()\n", 444 | "\n", 445 | " print('test acc:', avg_acc)" 446 | ], 447 | "execution_count": 23, 448 | "outputs": [] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "metadata": { 453 | "id": "diKF7CfZCYd4", 454 | "colab_type": "code", 455 | "colab": { 456 | "base_uri": "https://localhost:8080/", 457 | "height": 1000 458 | }, 459 | "outputId": "8c356ec4-70b9-4a6f-88b9-23172fa0df0b" 460 | }, 461 | "source": [ 462 | "for epoch in range(10):\n", 463 | " \n", 464 | " train(rnn, train_iterator, optimizer, criteon)\n", 465 | " \n", 466 | " evaluate(rnn, test_iterator, criteon)" 467 | ], 468 | "execution_count": 28, 469 | "outputs": [ 470 | { 471 | "output_type": "stream", 472 | "text": [ 473 | "0 0.8666667342185974\n", 474 | "10 0.9666666984558105\n", 475 | "20 0.8000000715255737\n", 476 | "30 0.8666667342185974\n", 477 | "40 0.8666667342185974\n", 478 | "50 0.8000000715255737\n", 479 | "60 0.9333333969116211\n", 480 | "70 0.7666667103767395\n", 481 | "80 0.9000000357627869\n", 482 | "90 0.8666667342185974\n", 483 | "100 0.9000000357627869\n", 484 | "110 0.7666667103767395\n", 485 | "120 0.8000000715255737\n", 486 | "130 0.9666666984558105\n", 487 | "140 0.8666667342185974\n", 488 | "150 0.9000000357627869\n", 489 | "160 0.9000000357627869\n", 490 | "170 0.9000000357627869\n", 491 | "180 0.8000000715255737\n", 492 | "190 0.8000000715255737\n", 493 | "200 0.9333333969116211\n", 494 | "210 0.9000000357627869\n", 495 | "220 0.9333333969116211\n", 496 | "230 0.8666667342185974\n", 497 | "240 0.9000000357627869\n", 498 | "250 0.7666667103767395\n", 499 | "260 0.9333333969116211\n", 500 | "270 0.9000000357627869\n", 501 | "280 0.8000000715255737\n", 502 | "290 0.8666667342185974\n", 503 | "300 0.9333333969116211\n", 504 | "310 0.7666667103767395\n", 505 | "320 0.9000000357627869\n", 506 | "330 0.9666666984558105\n", 507 | "340 0.9666666984558105\n", 508 | "350 0.8333333730697632\n", 509 | "360 0.9000000357627869\n", 510 | "370 0.8000000715255737\n", 511 | "380 0.9000000357627869\n", 512 | "390 0.8666667342185974\n", 513 | "400 0.8333333730697632\n", 514 | "410 0.9000000357627869\n", 515 | "420 0.9333333969116211\n", 516 | "430 0.8333333730697632\n", 517 | "440 0.8666667342185974\n", 518 | "450 0.8000000715255737\n", 519 | "460 0.9333333969116211\n", 520 | "470 0.8666667342185974\n", 521 | "480 0.9333333969116211\n", 522 | "490 0.9333333969116211\n", 523 | "500 0.9000000357627869\n", 524 | "510 0.8333333730697632\n", 525 | "520 0.8666667342185974\n", 526 | "530 0.9333333969116211\n", 527 | "540 0.9333333969116211\n", 528 | "550 0.7666667103767395\n", 529 | "560 0.8333333730697632\n", 530 | "570 0.9333333969116211\n", 531 | "580 0.9000000357627869\n", 532 | "590 0.9333333969116211\n", 533 | "600 0.9000000357627869\n", 534 | "610 0.8333333730697632\n", 535 | "620 0.7333333492279053\n", 536 | "630 0.8333333730697632\n", 537 | "640 0.8333333730697632\n", 538 | "650 0.9000000357627869\n", 539 | "660 0.9333333969116211\n", 540 | "670 0.8000000715255737\n", 541 | "680 0.9000000357627869\n", 542 | "690 0.9000000357627869\n", 543 | "700 0.9000000357627869\n", 544 | "710 
0.9333333969116211\n", 545 | "720 0.8000000715255737\n", 546 | "730 0.9333333969116211\n", 547 | "740 0.9666666984558105\n", 548 | "750 0.9666666984558105\n", 549 | "760 0.9333333969116211\n", 550 | "770 0.8666667342185974\n", 551 | "780 0.8666667342185974\n", 552 | "790 0.8666667342185974\n", 553 | "800 0.9666666984558105\n", 554 | "810 0.9000000357627869\n", 555 | "820 0.9000000357627869\n", 556 | "830 0.9333333969116211\n", 557 | "avg acc: 0.8855715916454078\n", 558 | "test acc: 0.8775779855051201\n", 559 | "0 0.9000000357627869\n", 560 | "10 0.9666666984558105\n", 561 | "20 0.9000000357627869\n", 562 | "30 0.9000000357627869\n", 563 | "40 0.9666666984558105\n", 564 | "50 0.9666666984558105\n", 565 | "60 0.7666667103767395\n", 566 | "70 0.8666667342185974\n", 567 | "80 0.9333333969116211\n", 568 | "90 0.9000000357627869\n", 569 | "100 0.9333333969116211\n", 570 | "110 0.8666667342185974\n", 571 | "120 0.9000000357627869\n", 572 | "130 0.9000000357627869\n", 573 | "140 0.8666667342185974\n", 574 | "150 0.8333333730697632\n", 575 | "160 0.8333333730697632\n", 576 | "170 0.9333333969116211\n", 577 | "180 0.8333333730697632\n", 578 | "190 0.9000000357627869\n", 579 | "200 0.8666667342185974\n", 580 | "210 1.0\n", 581 | "220 1.0\n", 582 | "230 0.9666666984558105\n", 583 | "240 0.9000000357627869\n", 584 | "250 0.8000000715255737\n", 585 | "260 0.9333333969116211\n", 586 | "270 0.9666666984558105\n", 587 | "280 0.9333333969116211\n", 588 | "290 0.9666666984558105\n", 589 | "300 0.9000000357627869\n", 590 | "310 0.9333333969116211\n", 591 | "320 0.9333333969116211\n", 592 | "330 0.9666666984558105\n", 593 | "340 0.9666666984558105\n", 594 | "350 0.9666666984558105\n", 595 | "360 0.9333333969116211\n", 596 | "370 0.9666666984558105\n", 597 | "380 0.8333333730697632\n", 598 | "390 0.7333333492279053\n", 599 | "400 0.9000000357627869\n", 600 | "410 0.9000000357627869\n", 601 | "420 0.8000000715255737\n", 602 | "430 0.9333333969116211\n", 603 | "440 0.8666667342185974\n", 604 | "450 0.9333333969116211\n", 605 | "460 0.8333333730697632\n", 606 | "470 0.9333333969116211\n", 607 | "480 0.9333333969116211\n", 608 | "490 0.8000000715255737\n", 609 | "500 0.9666666984558105\n", 610 | "510 0.9000000357627869\n", 611 | "520 1.0\n", 612 | "530 0.9666666984558105\n", 613 | "540 1.0\n", 614 | "550 0.9333333969116211\n", 615 | "560 0.9000000357627869\n", 616 | "570 1.0\n", 617 | "580 0.9000000357627869\n", 618 | "590 0.9000000357627869\n", 619 | "600 0.8666667342185974\n", 620 | "610 0.8333333730697632\n", 621 | "620 0.9000000357627869\n", 622 | "630 0.9000000357627869\n", 623 | "640 0.8666667342185974\n", 624 | "650 0.9000000357627869\n", 625 | "660 0.9666666984558105\n", 626 | "670 0.9333333969116211\n", 627 | "680 0.8666667342185974\n", 628 | "690 0.9000000357627869\n", 629 | "700 0.8666667342185974\n", 630 | "710 0.9333333969116211\n", 631 | "720 0.9666666984558105\n", 632 | "730 0.9666666984558105\n", 633 | "740 0.9666666984558105\n", 634 | "750 0.9000000357627869\n", 635 | "760 0.9000000357627869\n", 636 | "770 0.9000000357627869\n", 637 | "780 0.9333333969116211\n", 638 | "790 0.9333333969116211\n", 639 | "800 0.9333333969116211\n", 640 | "810 0.8666667342185974\n", 641 | "820 0.9000000357627869\n", 642 | "830 0.9000000357627869\n", 643 | "avg acc: 0.9071942910873633\n", 644 | "test acc: 0.8886890964542361\n", 645 | "0 0.9333333969116211\n", 646 | "10 0.9333333969116211\n", 647 | "20 0.9666666984558105\n", 648 | "30 0.9333333969116211\n", 649 | "40 0.9333333969116211\n", 650 | "50 
0.8666667342185974\n", 651 | "60 1.0\n", 652 | "70 0.8333333730697632\n", 653 | "80 0.9666666984558105\n", 654 | "90 0.9000000357627869\n", 655 | "100 0.9666666984558105\n", 656 | "110 0.9666666984558105\n", 657 | "120 0.9333333969116211\n", 658 | "130 0.9333333969116211\n", 659 | "140 0.9000000357627869\n", 660 | "150 0.9666666984558105\n", 661 | "160 0.8666667342185974\n", 662 | "170 0.9666666984558105\n", 663 | "180 0.9666666984558105\n", 664 | "190 0.9333333969116211\n", 665 | "200 0.9333333969116211\n", 666 | "210 0.8666667342185974\n", 667 | "220 0.9000000357627869\n", 668 | "230 0.8333333730697632\n", 669 | "240 0.9333333969116211\n", 670 | "250 0.8000000715255737\n", 671 | "260 0.8666667342185974\n", 672 | "270 0.9000000357627869\n", 673 | "280 0.9000000357627869\n", 674 | "290 0.9666666984558105\n", 675 | "300 0.9333333969116211\n", 676 | "310 0.9000000357627869\n", 677 | "320 0.9333333969116211\n", 678 | "330 0.9666666984558105\n", 679 | "340 0.9000000357627869\n", 680 | "350 1.0\n", 681 | "360 0.9666666984558105\n", 682 | "370 0.9333333969116211\n", 683 | "380 0.9333333969116211\n", 684 | "390 0.9666666984558105\n", 685 | "400 0.9666666984558105\n", 686 | "410 0.9666666984558105\n", 687 | "420 1.0\n", 688 | "430 0.9000000357627869\n", 689 | "440 1.0\n", 690 | "450 0.9000000357627869\n", 691 | "460 0.9333333969116211\n", 692 | "470 1.0\n", 693 | "480 0.9000000357627869\n", 694 | "490 0.9333333969116211\n", 695 | "500 0.9000000357627869\n", 696 | "510 0.9000000357627869\n", 697 | "520 0.9333333969116211\n", 698 | "530 0.9333333969116211\n", 699 | "540 0.9666666984558105\n", 700 | "550 0.9666666984558105\n", 701 | "560 0.9666666984558105\n", 702 | "570 0.9666666984558105\n", 703 | "580 0.8333333730697632\n", 704 | "590 0.9666666984558105\n", 705 | "600 0.9333333969116211\n", 706 | "610 0.9333333969116211\n", 707 | "620 0.9333333969116211\n", 708 | "630 1.0\n", 709 | "640 0.9000000357627869\n", 710 | "650 0.8666667342185974\n", 711 | "660 0.9333333969116211\n", 712 | "670 0.8666667342185974\n", 713 | "680 0.9666666984558105\n", 714 | "690 0.9333333969116211\n", 715 | "700 1.0\n", 716 | "710 0.9666666984558105\n", 717 | "720 0.9666666984558105\n", 718 | "730 0.9000000357627869\n", 719 | "740 0.9333333969116211\n", 720 | "750 0.9666666984558105\n", 721 | "760 1.0\n", 722 | "770 0.8666667342185974\n", 723 | "780 0.9000000357627869\n", 724 | "790 0.9333333969116211\n", 725 | "800 0.9666666984558105\n", 726 | "810 0.9000000357627869\n", 727 | "820 0.9666666984558105\n", 728 | "830 0.8000000715255737\n", 729 | "avg acc: 0.9266587171337302\n", 730 | "test acc: 0.8872902161068768\n", 731 | "0 0.9333333969116211\n", 732 | "10 1.0\n", 733 | "20 1.0\n", 734 | "30 0.9666666984558105\n", 735 | "40 0.9666666984558105\n", 736 | "50 1.0\n", 737 | "60 0.9333333969116211\n", 738 | "70 0.9666666984558105\n", 739 | "80 0.8666667342185974\n", 740 | "90 0.9666666984558105\n", 741 | "100 0.9333333969116211\n", 742 | "110 0.8666667342185974\n", 743 | "120 0.9333333969116211\n", 744 | "130 0.9000000357627869\n", 745 | "140 0.8333333730697632\n", 746 | "150 0.9666666984558105\n", 747 | "160 0.9666666984558105\n", 748 | "170 0.8666667342185974\n", 749 | "180 0.9666666984558105\n", 750 | "190 0.9666666984558105\n", 751 | "200 0.9333333969116211\n", 752 | "210 0.9333333969116211\n", 753 | "220 0.9666666984558105\n", 754 | "230 0.9666666984558105\n", 755 | "240 0.9000000357627869\n", 756 | "250 1.0\n", 757 | "260 0.9333333969116211\n", 758 | "270 0.9666666984558105\n", 759 | "280 0.9333333969116211\n", 760 | 
"290 0.9000000357627869\n", 761 | "300 1.0\n", 762 | "310 0.9333333969116211\n", 763 | "320 0.9666666984558105\n", 764 | "330 0.9666666984558105\n", 765 | "340 0.9333333969116211\n", 766 | "350 0.9333333969116211\n", 767 | "360 0.9333333969116211\n", 768 | "370 0.9333333969116211\n", 769 | "380 1.0\n", 770 | "390 1.0\n", 771 | "400 0.9333333969116211\n", 772 | "410 1.0\n", 773 | "420 0.9333333969116211\n", 774 | "430 0.9666666984558105\n", 775 | "440 0.9333333969116211\n", 776 | "450 0.9333333969116211\n", 777 | "460 0.9666666984558105\n", 778 | "470 0.8333333730697632\n", 779 | "480 1.0\n", 780 | "490 0.9333333969116211\n", 781 | "500 0.9666666984558105\n", 782 | "510 0.9000000357627869\n", 783 | "520 0.9000000357627869\n", 784 | "530 1.0\n", 785 | "540 0.9333333969116211\n", 786 | "550 0.9666666984558105\n", 787 | "560 0.9000000357627869\n", 788 | "570 0.9333333969116211\n", 789 | "580 0.9333333969116211\n", 790 | "590 0.9666666984558105\n", 791 | "600 0.8333333730697632\n", 792 | "610 0.9333333969116211\n", 793 | "620 0.8666667342185974\n", 794 | "630 0.9000000357627869\n", 795 | "640 0.9333333969116211\n", 796 | "650 0.9666666984558105\n", 797 | "660 0.9666666984558105\n", 798 | "670 0.9333333969116211\n", 799 | "680 0.9333333969116211\n", 800 | "690 0.9333333969116211\n", 801 | "700 0.9666666984558105\n", 802 | "710 0.9000000357627869\n", 803 | "720 0.9333333969116211\n", 804 | "730 1.0\n", 805 | "740 0.9666666984558105\n", 806 | "750 0.9333333969116211\n", 807 | "760 0.9666666984558105\n", 808 | "770 0.8333333730697632\n", 809 | "780 0.9666666984558105\n", 810 | "790 0.9000000357627869\n", 811 | "800 0.9000000357627869\n", 812 | "810 0.9000000357627869\n", 813 | "820 0.9666666984558105\n", 814 | "830 0.9666666984558105\n", 815 | "avg acc: 0.9356515197445163\n", 816 | "test acc: 0.890008042184569\n", 817 | "0 1.0\n", 818 | "10 1.0\n", 819 | "20 0.9000000357627869\n", 820 | "30 0.8666667342185974\n", 821 | "40 0.9000000357627869\n", 822 | "50 0.9333333969116211\n", 823 | "60 0.9000000357627869\n", 824 | "70 0.9666666984558105\n", 825 | "80 0.8666667342185974\n", 826 | "90 0.9000000357627869\n", 827 | "100 0.9333333969116211\n", 828 | "110 1.0\n", 829 | "120 0.9666666984558105\n", 830 | "130 0.9666666984558105\n", 831 | "140 1.0\n", 832 | "150 0.9333333969116211\n", 833 | "160 0.9333333969116211\n", 834 | "170 0.9333333969116211\n", 835 | "180 1.0\n", 836 | "190 0.9666666984558105\n", 837 | "200 0.9333333969116211\n", 838 | "210 1.0\n", 839 | "220 0.9666666984558105\n", 840 | "230 1.0\n", 841 | "240 0.9333333969116211\n", 842 | "250 0.8333333730697632\n", 843 | "260 0.9666666984558105\n", 844 | "270 0.9333333969116211\n", 845 | "280 0.9000000357627869\n", 846 | "290 1.0\n", 847 | "300 0.9666666984558105\n", 848 | "310 0.9333333969116211\n", 849 | "320 0.9000000357627869\n", 850 | "330 0.9000000357627869\n", 851 | "340 1.0\n", 852 | "350 0.9666666984558105\n", 853 | "360 1.0\n", 854 | "370 0.9666666984558105\n", 855 | "380 0.9000000357627869\n", 856 | "390 0.9666666984558105\n", 857 | "400 0.9666666984558105\n", 858 | "410 0.9333333969116211\n", 859 | "420 0.9000000357627869\n", 860 | "430 1.0\n", 861 | "440 0.9333333969116211\n", 862 | "450 0.9666666984558105\n", 863 | "460 0.9666666984558105\n", 864 | "470 1.0\n", 865 | "480 1.0\n", 866 | "490 0.9666666984558105\n", 867 | "500 1.0\n", 868 | "510 1.0\n", 869 | "520 1.0\n", 870 | "530 1.0\n", 871 | "540 0.8666667342185974\n", 872 | "550 1.0\n", 873 | "560 0.9333333969116211\n", 874 | "570 0.9333333969116211\n", 875 | "580 
0.9666666984558105\n", 876 | "590 0.9666666984558105\n", 877 | "600 0.9333333969116211\n", 878 | "610 0.9000000357627869\n", 879 | "620 0.9333333969116211\n", 880 | "630 0.9666666984558105\n", 881 | "640 0.9666666984558105\n", 882 | "650 0.9333333969116211\n", 883 | "660 0.9333333969116211\n", 884 | "670 0.9000000357627869\n", 885 | "680 0.9333333969116211\n", 886 | "690 0.9000000357627869\n", 887 | "700 0.9333333969116211\n", 888 | "710 0.9666666984558105\n", 889 | "720 0.9666666984558105\n", 890 | "730 0.9333333969116211\n", 891 | "740 0.9333333969116211\n", 892 | "750 1.0\n", 893 | "760 0.9666666984558105\n", 894 | "770 0.9333333969116211\n", 895 | "780 0.9333333969116211\n", 896 | "790 0.9000000357627869\n", 897 | "800 1.0\n", 898 | "810 0.9000000357627869\n", 899 | "820 1.0\n", 900 | "830 0.9000000357627869\n", 901 | "avg acc: 0.9450040338136595\n", 902 | "test acc: 0.8848521674422624\n", 903 | "0 1.0\n", 904 | "10 1.0\n", 905 | "20 0.9666666984558105\n", 906 | "30 0.9666666984558105\n", 907 | "40 1.0\n", 908 | "50 1.0\n", 909 | "60 0.9666666984558105\n", 910 | "70 1.0\n", 911 | "80 0.9666666984558105\n", 912 | "100 0.9666666984558105\n", 913 | "110 0.9666666984558105\n", 914 | "120 0.9333333969116211\n", 915 | "130 0.9666666984558105\n", 916 | "140 0.9666666984558105\n", 917 | "150 1.0\n", 918 | "160 0.9666666984558105\n", 919 | "170 1.0\n", 920 | "180 1.0\n", 921 | "190 0.9666666984558105\n", 922 | "200 0.8666667342185974\n", 923 | "210 1.0\n", 924 | "220 0.8666667342185974\n", 925 | "230 0.9666666984558105\n", 926 | "240 0.9333333969116211\n", 927 | "250 0.8333333730697632\n", 928 | "260 0.9666666984558105\n", 929 | "270 0.9666666984558105\n", 930 | "280 0.9000000357627869\n", 931 | "290 0.9666666984558105\n", 932 | "300 0.9666666984558105\n", 933 | "310 0.9333333969116211\n", 934 | "320 1.0\n", 935 | "330 0.9666666984558105\n", 936 | "340 0.9666666984558105\n", 937 | "350 0.9333333969116211\n", 938 | "360 0.9000000357627869\n", 939 | "370 0.8666667342185974\n", 940 | "380 0.9333333969116211\n", 941 | "390 0.8333333730697632\n", 942 | "400 0.9666666984558105\n", 943 | "410 1.0\n", 944 | "420 0.9666666984558105\n", 945 | "430 0.9666666984558105\n", 946 | "440 1.0\n", 947 | "450 0.9666666984558105\n", 948 | "460 0.9333333969116211\n", 949 | "470 1.0\n", 950 | "480 0.9666666984558105\n", 951 | "490 1.0\n", 952 | "500 0.9666666984558105\n", 953 | "510 0.9333333969116211\n", 954 | "520 0.8666667342185974\n", 955 | "530 0.9666666984558105\n", 956 | "540 1.0\n", 957 | "550 1.0\n", 958 | "560 0.9333333969116211\n", 959 | "570 0.9333333969116211\n", 960 | "580 1.0\n", 961 | "590 0.9666666984558105\n", 962 | "600 0.9666666984558105\n", 963 | "610 0.9666666984558105\n", 964 | "620 0.9666666984558105\n", 965 | "630 0.9666666984558105\n", 966 | "640 0.9333333969116211\n", 967 | "650 0.9000000357627869\n", 968 | "660 0.9333333969116211\n", 969 | "670 1.0\n", 970 | "680 0.9333333969116211\n", 971 | "690 0.9666666984558105\n", 972 | "700 0.9333333969116211\n", 973 | "710 1.0\n", 974 | "720 0.9333333969116211\n", 975 | "730 1.0\n", 976 | "740 0.9666666984558105\n", 977 | "750 0.9666666984558105\n", 978 | "760 0.8666667342185974\n", 979 | "770 0.9000000357627869\n", 980 | "780 0.8000000715255737\n", 981 | "790 0.9666666984558105\n", 982 | "800 0.9666666984558105\n", 983 | "810 0.8666667342185974\n", 984 | "820 1.0\n", 985 | "830 0.9666666984558105\n", 986 | "avg acc: 0.9509592677334802\n", 987 | "test acc: 0.8718625588668621\n", 988 | "0 1.0\n", 989 | "10 1.0\n", 990 | "20 0.9666666984558105\n", 991 
| "30 0.9333333969116211\n", 992 | "40 1.0\n", 993 | "50 0.9666666984558105\n", 994 | "60 0.9666666984558105\n", 995 | "70 0.9666666984558105\n", 996 | "80 1.0\n", 997 | "90 0.9333333969116211\n", 998 | "100 1.0\n", 999 | "110 0.9666666984558105\n", 1000 | "120 0.9666666984558105\n", 1001 | "130 0.9666666984558105\n", 1002 | "140 0.9666666984558105\n", 1003 | "150 0.9666666984558105\n", 1004 | "160 0.9666666984558105\n", 1005 | "170 1.0\n", 1006 | "180 0.9666666984558105\n", 1007 | "190 0.9000000357627869\n", 1008 | "200 1.0\n", 1009 | "210 1.0\n", 1010 | "220 0.9333333969116211\n", 1011 | "230 1.0\n", 1012 | "240 0.9666666984558105\n", 1013 | "250 1.0\n", 1014 | "260 0.9666666984558105\n", 1015 | "270 0.9666666984558105\n", 1016 | "280 0.9333333969116211\n", 1017 | "290 0.9333333969116211\n", 1018 | "300 0.9666666984558105\n", 1019 | "310 0.9666666984558105\n", 1020 | "320 0.9666666984558105\n", 1021 | "330 0.9333333969116211\n", 1022 | "340 1.0\n", 1023 | "350 0.9333333969116211\n", 1024 | "360 0.9666666984558105\n", 1025 | "370 0.9333333969116211\n", 1026 | "380 0.9666666984558105\n", 1027 | "390 0.9333333969116211\n", 1028 | "400 0.9666666984558105\n", 1029 | "410 0.9666666984558105\n", 1030 | "420 0.9666666984558105\n", 1031 | "430 0.9333333969116211\n", 1032 | "440 0.9333333969116211\n", 1033 | "450 0.9666666984558105\n", 1034 | "460 1.0\n", 1035 | "470 1.0\n", 1036 | "480 0.9666666984558105\n", 1037 | "490 0.9333333969116211\n", 1038 | "500 0.9666666984558105\n", 1039 | "510 0.9333333969116211\n", 1040 | "520 0.9666666984558105\n", 1041 | "530 0.9666666984558105\n", 1042 | "540 1.0\n", 1043 | "550 0.9666666984558105\n", 1044 | "560 0.9333333969116211\n", 1045 | "570 1.0\n", 1046 | "580 0.9666666984558105\n", 1047 | "590 0.9666666984558105\n", 1048 | "600 1.0\n", 1049 | "610 0.9000000357627869\n", 1050 | "620 0.9333333969116211\n", 1051 | "630 0.9333333969116211\n", 1052 | "640 0.9333333969116211\n", 1053 | "650 0.9666666984558105\n", 1054 | "660 0.9000000357627869\n", 1055 | "670 0.9000000357627869\n", 1056 | "680 1.0\n", 1057 | "690 0.9333333969116211\n", 1058 | "700 0.9666666984558105\n", 1059 | "710 0.8000000715255737\n", 1060 | "720 0.9333333969116211\n", 1061 | "730 0.8666667342185974\n", 1062 | "740 0.9333333969116211\n", 1063 | "750 0.9666666984558105\n", 1064 | "760 1.0\n", 1065 | "770 0.9333333969116211\n", 1066 | "780 0.9000000357627869\n", 1067 | "790 0.9666666984558105\n", 1068 | "800 0.9333333969116211\n", 1069 | "810 0.8666667342185974\n", 1070 | "820 0.9000000357627869\n", 1071 | "830 0.9666666984558105\n", 1072 | "avg acc: 0.9605116213111283\n", 1073 | "test acc: 0.8822142779827118\n", 1074 | "0 0.9666666984558105\n", 1075 | "10 0.9666666984558105\n", 1076 | "20 1.0\n", 1077 | "30 0.9666666984558105\n", 1078 | "40 1.0\n", 1079 | "50 0.9666666984558105\n", 1080 | "60 1.0\n", 1081 | "70 0.9000000357627869\n", 1082 | "80 1.0\n", 1083 | "90 0.9666666984558105\n", 1084 | "100 0.9333333969116211\n", 1085 | "110 1.0\n", 1086 | "120 1.0\n", 1087 | "130 0.9666666984558105\n", 1088 | "140 0.9666666984558105\n", 1089 | "150 1.0\n", 1090 | "160 0.9666666984558105\n", 1091 | "170 0.9333333969116211\n", 1092 | "180 0.9666666984558105\n", 1093 | "190 0.9333333969116211\n", 1094 | "200 0.9666666984558105\n", 1095 | "210 1.0\n", 1096 | "220 0.9666666984558105\n", 1097 | "230 1.0\n", 1098 | "240 0.9666666984558105\n", 1099 | "250 1.0\n", 1100 | "260 0.9333333969116211\n", 1101 | "270 0.9666666984558105\n", 1102 | "280 0.9000000357627869\n", 1103 | "290 1.0\n", 1104 | "300 
0.9333333969116211\n", 1105 | "310 0.9666666984558105\n", 1106 | "320 0.9666666984558105\n", 1107 | "330 0.9333333969116211\n", 1108 | "340 1.0\n", 1109 | "350 0.9333333969116211\n", 1110 | "360 0.9666666984558105\n", 1111 | "370 1.0\n", 1112 | "380 1.0\n", 1113 | "390 0.9000000357627869\n", 1114 | "400 1.0\n", 1115 | "410 1.0\n", 1116 | "420 1.0\n", 1117 | "430 1.0\n", 1118 | "440 1.0\n", 1119 | "450 0.9666666984558105\n", 1120 | "460 0.9000000357627869\n", 1121 | "470 1.0\n", 1122 | "480 1.0\n", 1123 | "490 0.8666667342185974\n", 1124 | "500 1.0\n", 1125 | "510 1.0\n", 1126 | "520 1.0\n", 1127 | "530 0.9666666984558105\n", 1128 | "540 0.9000000357627869\n", 1129 | "550 1.0\n", 1130 | "560 0.9333333969116211\n", 1131 | "570 0.9666666984558105\n", 1132 | "580 1.0\n", 1133 | "590 0.9666666984558105\n", 1134 | "600 0.9333333969116211\n", 1135 | "610 0.9666666984558105\n", 1136 | "620 0.9666666984558105\n", 1137 | "630 1.0\n", 1138 | "640 0.9000000357627869\n", 1139 | "650 0.9666666984558105\n", 1140 | "660 1.0\n", 1141 | "670 0.9000000357627869\n", 1142 | "680 0.9333333969116211\n", 1143 | "690 1.0\n", 1144 | "700 1.0\n", 1145 | "710 1.0\n", 1146 | "720 0.9666666984558105\n", 1147 | "730 1.0\n", 1148 | "740 1.0\n", 1149 | "750 1.0\n", 1150 | "760 1.0\n", 1151 | "770 0.8666667342185974\n", 1152 | "780 0.9666666984558105\n", 1153 | "790 0.9333333969116211\n", 1154 | "800 0.9666666984558105\n", 1155 | "810 1.0\n", 1156 | "820 1.0\n", 1157 | "830 0.9666666984558105\n", 1158 | "avg acc: 0.9653077817363419\n", 1159 | "test acc: 0.8769784666222634\n", 1160 | "0 1.0\n", 1161 | "10 0.9666666984558105\n", 1162 | "20 1.0\n", 1163 | "30 0.9333333969116211\n", 1164 | "40 1.0\n", 1165 | "50 1.0\n", 1166 | "60 1.0\n", 1167 | "70 1.0\n", 1168 | "80 0.9666666984558105\n", 1169 | "90 0.9333333969116211\n", 1170 | "100 0.9666666984558105\n", 1171 | "110 0.9666666984558105\n", 1172 | "120 1.0\n", 1173 | "130 0.9666666984558105\n", 1174 | "140 1.0\n", 1175 | "150 1.0\n", 1176 | "160 0.9666666984558105\n", 1177 | "170 1.0\n", 1178 | "180 0.9333333969116211\n", 1179 | "190 1.0\n", 1180 | "200 0.9666666984558105\n", 1181 | "210 1.0\n", 1182 | "220 0.8333333730697632\n", 1183 | "230 1.0\n", 1184 | "240 1.0\n", 1185 | "250 0.9666666984558105\n", 1186 | "260 0.9666666984558105\n", 1187 | "270 0.9000000357627869\n", 1188 | "280 0.9666666984558105\n", 1189 | "290 0.9333333969116211\n", 1190 | "300 0.9666666984558105\n", 1191 | "310 0.9666666984558105\n", 1192 | "320 0.9333333969116211\n", 1193 | "330 1.0\n", 1194 | "340 1.0\n", 1195 | "350 0.9333333969116211\n", 1196 | "360 0.9666666984558105\n", 1197 | "370 0.9666666984558105\n", 1198 | "380 0.9666666984558105\n", 1199 | "390 1.0\n", 1200 | "400 0.9333333969116211\n", 1201 | "410 0.9333333969116211\n", 1202 | "420 1.0\n", 1203 | "430 0.9666666984558105\n", 1204 | "440 0.9666666984558105\n", 1205 | "450 0.9333333969116211\n", 1206 | "460 1.0\n", 1207 | "470 0.9666666984558105\n", 1208 | "480 1.0\n", 1209 | "490 1.0\n", 1210 | "500 0.9333333969116211\n", 1211 | "510 0.9666666984558105\n", 1212 | "520 1.0\n", 1213 | "530 0.9333333969116211\n", 1214 | "540 0.9666666984558105\n", 1215 | "550 0.9333333969116211\n", 1216 | "560 0.9333333969116211\n", 1217 | "570 0.9333333969116211\n", 1218 | "580 1.0\n", 1219 | "590 1.0\n", 1220 | "600 0.9333333969116211\n", 1221 | "610 0.9666666984558105\n", 1222 | "620 1.0\n", 1223 | "630 1.0\n", 1224 | "640 1.0\n", 1225 | "650 0.9666666984558105\n", 1226 | "660 1.0\n", 1227 | "670 1.0\n", 1228 | "680 1.0\n", 1229 | "690 
0.9333333969116211\n", 1230 | "700 1.0\n", 1231 | "710 0.9333333969116211\n", 1232 | "720 1.0\n", 1233 | "730 1.0\n", 1234 | "740 0.9666666984558105\n", 1235 | "750 0.9000000357627869\n", 1236 | "760 0.9000000357627869\n", 1237 | "770 0.9333333969116211\n", 1238 | "780 0.9666666984558105\n", 1239 | "790 1.0\n", 1240 | "800 1.0\n", 1241 | "810 0.9666666984558105\n", 1242 | "820 0.9666666984558105\n", 1243 | "830 1.0\n", 1244 | "avg acc: 0.9697442299885144\n", 1245 | "test acc: 0.8815348212667506\n", 1246 | "0 0.9666666984558105\n", 1247 | "10 0.9666666984558105\n", 1248 | "20 0.9666666984558105\n", 1249 | "30 0.9666666984558105\n", 1250 | "40 0.9666666984558105\n", 1251 | "50 0.8666667342185974\n", 1252 | "60 1.0\n", 1253 | "70 1.0\n", 1254 | "80 1.0\n", 1255 | "90 1.0\n", 1256 | "100 1.0\n", 1257 | "110 0.9666666984558105\n", 1258 | "120 1.0\n", 1259 | "130 1.0\n", 1260 | "140 1.0\n", 1261 | "150 0.9666666984558105\n", 1262 | "160 1.0\n", 1263 | "170 0.9333333969116211\n", 1264 | "180 1.0\n", 1265 | "190 0.9000000357627869\n", 1266 | "200 1.0\n", 1267 | "210 0.8666667342185974\n", 1268 | "220 1.0\n", 1269 | "230 1.0\n", 1270 | "240 1.0\n", 1271 | "250 0.9000000357627869\n", 1272 | "260 1.0\n", 1273 | "270 1.0\n", 1274 | "280 0.9666666984558105\n", 1275 | "290 0.9666666984558105\n", 1276 | "300 0.9666666984558105\n", 1277 | "310 0.9666666984558105\n", 1278 | "320 0.9666666984558105\n", 1279 | "330 1.0\n", 1280 | "340 1.0\n", 1281 | "350 0.9333333969116211\n", 1282 | "360 0.9666666984558105\n", 1283 | "370 1.0\n", 1284 | "380 0.9666666984558105\n", 1285 | "390 1.0\n", 1286 | "400 0.9666666984558105\n", 1287 | "410 1.0\n", 1288 | "420 1.0\n", 1289 | "430 1.0\n", 1290 | "440 1.0\n", 1291 | "450 1.0\n", 1292 | "460 1.0\n", 1293 | "470 1.0\n", 1294 | "480 0.9666666984558105\n", 1295 | "490 1.0\n", 1296 | "500 1.0\n", 1297 | "510 1.0\n", 1298 | "520 0.9666666984558105\n", 1299 | "530 0.9666666984558105\n", 1300 | "540 0.9666666984558105\n", 1301 | "550 0.9000000357627869\n", 1302 | "560 0.9000000357627869\n", 1303 | "570 0.9666666984558105\n", 1304 | "580 1.0\n", 1305 | "590 0.9666666984558105\n", 1306 | "600 1.0\n", 1307 | "610 0.9666666984558105\n", 1308 | "620 1.0\n", 1309 | "630 0.9666666984558105\n", 1310 | "640 0.9666666984558105\n", 1311 | "650 1.0\n", 1312 | "660 0.9666666984558105\n", 1313 | "670 1.0\n", 1314 | "680 0.9666666984558105\n", 1315 | "690 0.9666666984558105\n", 1316 | "700 0.9666666984558105\n", 1317 | "710 1.0\n", 1318 | "720 0.8666667342185974\n", 1319 | "730 1.0\n", 1320 | "740 0.9666666984558105\n", 1321 | "750 0.9333333969116211\n", 1322 | "760 1.0\n", 1323 | "770 0.9666666984558105\n", 1324 | "780 1.0\n", 1325 | "790 0.9666666984558105\n", 1326 | "800 1.0\n", 1327 | "810 1.0\n", 1328 | "820 1.0\n", 1329 | "830 0.9333333969116211\n", 1330 | "avg acc: 0.9726618941453435\n", 1331 | "test acc: 0.8754996503714463\n" 1332 | ], 1333 | "name": "stdout" 1334 | } 1335 | ] 1336 | }, 1337 | { 1338 | "cell_type": "code", 1339 | "metadata": { 1340 | "id": "S6Gzb78B8B1y", 1341 | "colab_type": "code", 1342 | "colab": {} 1343 | }, 1344 | "source": [ 1345 | "def predice_test(x):\n", 1346 | "\n", 1347 | " preds = torch.round(torch.sigmoid(x))\n", 1348 | " return preds" 1349 | ], 1350 | "execution_count": 38, 1351 | "outputs": [] 1352 | }, 1353 | { 1354 | "cell_type": "code", 1355 | "metadata": { 1356 | "id": "PXxZTxQY8BzW", 1357 | "colab_type": "code", 1358 | "colab": { 1359 | "base_uri": "https://localhost:8080/", 1360 | "height": 100 1361 | }, 1362 | "outputId": 
"38c7e7d5-3710-47f2-8c44-f59918a77708" 1363 | }, 1364 | "source": [ 1365 | "for batch in test_iterator:\n", 1366 | " pred = rnn(batch.text).squeeze(1)\n", 1367 | " pred = predice_test(pred)\n", 1368 | " print(pred)\n", 1369 | " print(batch.label)\n", 1370 | " break\n", 1371 | "\n" 1372 | ], 1373 | "execution_count": 39, 1374 | "outputs": [ 1375 | { 1376 | "output_type": "stream", 1377 | "text": [ 1378 | "tensor([1., 1., 1., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 1.,\n", 1379 | " 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 0., 0.], device='cuda:0',\n", 1380 | " grad_fn=)\n", 1381 | "tensor([1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 1.,\n", 1382 | " 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 0., 0.], device='cuda:0')\n" 1383 | ], 1384 | "name": "stdout" 1385 | } 1386 | ] 1387 | }, 1388 | { 1389 | "cell_type": "code", 1390 | "metadata": { 1391 | "id": "VT2nZTLjb5nW", 1392 | "colab_type": "code", 1393 | "colab": {} 1394 | }, 1395 | "source": [ 1396 | "" 1397 | ], 1398 | "execution_count": null, 1399 | "outputs": [] 1400 | } 1401 | ] 1402 | } -------------------------------------------------------------------------------- /2.使用Bert预训练模型微调中文文本分类.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Pytorch-使用Bert预训练模型微调中文文本分类.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [], 9 | "authorship_tag": "ABX9TyP5eyOdl/S2VQKOeAW1zZ87", 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | }, 16 | "accelerator": "GPU" 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": { 32 | "id": "S9q5Kcu1yQPY", 33 | "colab_type": "text" 34 | }, 35 | "source": [ 36 | "语料链接:https://pan.baidu.com/s/1YxGGYmeByuAlRdAVov_ZLg\n", 37 | "提取码:tzao\n", 38 | "\n", 39 | "neg.txt和pos.txt各5000条酒店评论,每条评论一行。" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "metadata": { 45 | "id": "SoH9VrHLyFiY", 46 | "colab_type": "code", 47 | "colab": { 48 | "base_uri": "https://localhost:8080/", 49 | "height": 359 50 | }, 51 | "outputId": "b10075de-6ae3-4620-ab37-19de8bff3254" 52 | }, 53 | "source": [ 54 | "!pip install transformers" 55 | ], 56 | "execution_count": 26, 57 | "outputs": [ 58 | { 59 | "output_type": "stream", 60 | "text": [ 61 | "Requirement already satisfied: transformers in /usr/local/lib/python3.6/dist-packages (3.0.2)\n", 62 | "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.6/dist-packages (from transformers) (4.41.1)\n", 63 | "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.6/dist-packages (from transformers) (2019.12.20)\n", 64 | "Requirement already satisfied: sacremoses in /usr/local/lib/python3.6/dist-packages (from transformers) (0.0.43)\n", 65 | "Requirement already satisfied: tokenizers==0.8.1.rc1 in /usr/local/lib/python3.6/dist-packages (from transformers) (0.8.1rc1)\n", 66 | "Requirement already satisfied: filelock in /usr/local/lib/python3.6/dist-packages (from transformers) (3.0.12)\n", 67 | "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from transformers) (2.23.0)\n", 68 | "Requirement already satisfied: packaging in /usr/local/lib/python3.6/dist-packages (from transformers) (20.4)\n", 69 | 
"Requirement already satisfied: dataclasses; python_version < \"3.7\" in /usr/local/lib/python3.6/dist-packages (from transformers) (0.7)\n", 70 | "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from transformers) (1.18.5)\n", 71 | "Requirement already satisfied: sentencepiece!=0.1.92 in /usr/local/lib/python3.6/dist-packages (from transformers) (0.1.91)\n", 72 | "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (1.15.0)\n", 73 | "Requirement already satisfied: click in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (7.1.2)\n", 74 | "Requirement already satisfied: joblib in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (0.16.0)\n", 75 | "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (1.24.3)\n", 76 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2020.6.20)\n", 77 | "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2.10)\n", 78 | "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (3.0.4)\n", 79 | "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from packaging->transformers) (2.4.7)\n" 80 | ], 81 | "name": "stdout" 82 | } 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "metadata": { 88 | "id": "ry0PkCK2yerw", 89 | "colab_type": "code", 90 | "colab": { 91 | "base_uri": "https://localhost:8080/", 92 | "height": 35 93 | }, 94 | "outputId": "c424b73f-0b46-4cf1-ad81-60ff1732111a" 95 | }, 96 | "source": [ 97 | "import numpy as np\n", 98 | "import random\n", 99 | "import torch\n", 100 | "import matplotlib.pylab as plt \n", 101 | "from torch.nn.utils import clip_grad_norm_\n", 102 | "from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler\n", 103 | "from transformers import BertTokenizer, BertForSequenceClassification, AdamW\n", 104 | "from transformers import get_linear_schedule_with_warmup\n", 105 | "\n", 106 | "SEED = 123\n", 107 | "BATCH_SIZE = 16\n", 108 | "learning_rate = 2e-5\n", 109 | "weight_decay = 1e-2 # 0.01\n", 110 | "epsilon = 1e-8\n", 111 | "\n", 112 | "random.seed(SEED)\n", 113 | "np.random.seed(SEED)\n", 114 | "torch.manual_seed(SEED)" 115 | ], 116 | "execution_count": 27, 117 | "outputs": [ 118 | { 119 | "output_type": "execute_result", 120 | "data": { 121 | "text/plain": [ 122 | "" 123 | ] 124 | }, 125 | "metadata": { 126 | "tags": [] 127 | }, 128 | "execution_count": 27 129 | } 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": { 135 | "id": "6gp4_kgv_MEb", 136 | "colab_type": "text" 137 | }, 138 | "source": [ 139 | "# 1. 
数据预处理\n", 140 | "\n", 141 | "## 1.1 读取文件\n" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "metadata": { 147 | "id": "q6-4feGA2OtC", 148 | "colab_type": "code", 149 | "colab": {} 150 | }, 151 | "source": [ 152 | "def readFile(filename):\n", 153 | " with open(filename, encoding='utf-8') as f:\n", 154 | " content = f.readlines()\n", 155 | " return content\n", 156 | "\n", 157 | "pos_text, neg_text = readFile('./sample_data/pos.txt'), readFile('./sample_data/neg.txt')\n", 158 | "sentences = pos_text + neg_text\n", 159 | "\n", 160 | "# 设定标签\n", 161 | "pos_targets = np.ones([len(pos_text)]) # (5000, )\n", 162 | "neg_targets = np.zeros([len(neg_text)]) # (5000, )\n", 163 | "targets = np.concatenate((pos_targets, neg_targets), axis=0).reshape(-1, 1) # (10000, 1)\n", 164 | "total_targets = torch.tensor(targets)" 165 | ], 166 | "execution_count": 28, 167 | "outputs": [] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": { 172 | "id": "IqtLQtD5_pXN", 173 | "colab_type": "text" 174 | }, 175 | "source": [ 176 | "## 1.2 BertTokenizer进行编码,将每一句转成数字" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "metadata": { 182 | "id": "OViCya5Q2Q2j", 183 | "colab_type": "code", 184 | "colab": { 185 | "base_uri": "https://localhost:8080/", 186 | "height": 107 187 | }, 188 | "outputId": "40548311-d5fa-4824-c1fd-d6fe0350fefa" 189 | }, 190 | "source": [ 191 | "model_name = 'bert-base-chinese'\n", 192 | "cache_dir = './sample_data/'\n", 193 | "\n", 194 | "tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir=cache_dir)\n", 195 | "print(pos_text[2])\n", 196 | "print(tokenizer.tokenize(pos_text[2]))\n", 197 | "print(tokenizer.encode(pos_text[2]))\n", 198 | "print(tokenizer.convert_ids_to_tokens(tokenizer.encode(pos_text[2])))" 199 | ], 200 | "execution_count": 29, 201 | "outputs": [ 202 | { 203 | "output_type": "stream", 204 | "text": [ 205 | "不错,下次还考虑入住。交通也方便,在餐厅吃的也不错。\n", 206 | "\n", 207 | "['不', '错', ',', '下', '次', '还', '考', '虑', '入', '住', '。', '交', '通', '也', '方', '便', ',', '在', '餐', '厅', '吃', '的', '也', '不', '错', '。']\n", 208 | "[101, 679, 7231, 8024, 678, 3613, 6820, 5440, 5991, 1057, 857, 511, 769, 6858, 738, 3175, 912, 8024, 1762, 7623, 1324, 1391, 4638, 738, 679, 7231, 511, 102]\n", 209 | "['[CLS]', '不', '错', ',', '下', '次', '还', '考', '虑', '入', '住', '。', '交', '通', '也', '方', '便', ',', '在', '餐', '厅', '吃', '的', '也', '不', '错', '。', '[SEP]']\n" 210 | ], 211 | "name": "stdout" 212 | } 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "metadata": { 218 | "id": "-5ExQ8ji2Q8f", 219 | "colab_type": "code", 220 | "colab": { 221 | "base_uri": "https://localhost:8080/", 222 | "height": 35 223 | }, 224 | "outputId": "20255e57-078d-4d58-a707-f79300d8a0ac" 225 | }, 226 | "source": [ 227 | "# 将每一句转成数字 (大于126做截断,小于126做 Padding,加上首位两个标识,长度总共等于128)\n", 228 | "def convert_text_to_token(tokenizer, sentence, limit_size = 126):\n", 229 | " tokens = tokenizer.encode(sentence[:limit_size]) # 直接截断\n", 230 | " if len(tokens) < limit_size + 2: # 补齐(pad的索引号就是0)\n", 231 | " tokens.extend([0] * (limit_size + 2 - len(tokens)))\n", 232 | " return tokens\n", 233 | "\n", 234 | "input_ids = [convert_text_to_token(tokenizer, sen) for sen in sentences]\n", 235 | "\n", 236 | "input_tokens = torch.tensor(input_ids)\n", 237 | "print(input_tokens.shape)" 238 | ], 239 | "execution_count": 30, 240 | "outputs": [ 241 | { 242 | "output_type": "stream", 243 | "text": [ 244 | "torch.Size([10000, 128])\n" 245 | ], 246 | "name": "stdout" 247 | } 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | 
"metadata": { 253 | "id": "qeYRZQqRTVLS", 254 | "colab_type": "text" 255 | }, 256 | "source": [ 257 | "## 1.3 attention_masks, 在一个文本中,如果是PAD符号则是0,否则就是1" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "metadata": { 263 | "id": "xX2hM7hW2RFp", 264 | "colab_type": "code", 265 | "colab": { 266 | "base_uri": "https://localhost:8080/", 267 | "height": 35 268 | }, 269 | "outputId": "8d6b2a04-bf63-4c11-fa7a-ea14f842dc11" 270 | }, 271 | "source": [ 272 | "# 建立mask\n", 273 | "def attention_masks(input_ids):\n", 274 | " atten_masks = []\n", 275 | " for seq in input_ids: # [10000, 128]\n", 276 | " seq_mask = [float(i > 0) for i in seq] # PAD: 0; 否则: 1\n", 277 | " atten_masks.append(seq_mask)\n", 278 | " return atten_masks\n", 279 | "\n", 280 | "atten_masks = attention_masks(input_ids)\n", 281 | "attention_tokens = torch.tensor(atten_masks)\n", 282 | "print(attention_tokens.shape)" 283 | ], 284 | "execution_count": 31, 285 | "outputs": [ 286 | { 287 | "output_type": "stream", 288 | "text": [ 289 | "torch.Size([10000, 128])\n" 290 | ], 291 | "name": "stdout" 292 | } 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": { 298 | "id": "-yc1nwTdXC50", 299 | "colab_type": "text" 300 | }, 301 | "source": [ 302 | "- 构造input_ids和atten_masks的目的和前面一节中提到的.encode_plus函数返回的input_ids和attention_mask一样\n", 303 | "\n", 304 | "- input_type_ids和本次任务无关,它是针对每个训练集有两个句子的任务(如问答任务)。" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "metadata": { 310 | "id": "ottvMKol2RLE", 311 | "colab_type": "code", 312 | "colab": { 313 | "base_uri": "https://localhost:8080/", 314 | "height": 431 315 | }, 316 | "outputId": "fcec2cae-960b-4b63-d8fa-3720bf5ac75d" 317 | }, 318 | "source": [ 319 | "from sklearn.model_selection import train_test_split\n", 320 | "\n", 321 | "train_inputs, test_inputs, train_labels, test_labels = train_test_split(input_tokens, total_targets, \n", 322 | " random_state=666, test_size=0.2)\n", 323 | "train_masks, test_masks, _, _ = train_test_split(attention_tokens, input_tokens, \n", 324 | " random_state=666, test_size=0.2)\n", 325 | "print(train_inputs.shape, test_inputs.shape) # torch.Size([8000, 128]) torch.Size([2000, 128])\n", 326 | "print(train_masks.shape) # torch.Size([8000, 128]) 和 train_inputs形状一样\n", 327 | "\n", 328 | "print(train_inputs[0])\n", 329 | "print(train_masks[0])" 330 | ], 331 | "execution_count": 32, 332 | "outputs": [ 333 | { 334 | "output_type": "stream", 335 | "text": [ 336 | "torch.Size([8000, 128]) torch.Size([2000, 128])\n", 337 | "torch.Size([8000, 128])\n", 338 | "tensor([ 101, 2769, 6370, 4638, 3221, 10189, 1039, 4638, 117, 852,\n", 339 | " 2769, 6230, 2533, 8821, 1039, 4638, 7599, 3419, 3291, 1962,\n", 340 | " 671, 763, 117, 3300, 671, 2476, 1377, 809, 1288, 1309,\n", 341 | " 4638, 3763, 1355, 119, 2456, 6379, 1920, 2157, 6370, 3249,\n", 342 | " 6858, 7313, 106, 102, 0, 0, 0, 0, 0, 0,\n", 343 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 344 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 345 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 346 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 347 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 348 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 349 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 350 | " 0, 0, 0, 0, 0, 0, 0, 0])\n", 351 | "tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", 352 | " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", 353 | " 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 354 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 
0.,\n", 355 | "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 356 | "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 357 | "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 358 | "        0., 0.])\n" 359 | ], 360 | "name": "stdout" 361 | } 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": { 367 | "id": "A4w6AQZvgzce", 368 | "colab_type": "text" 369 | }, 370 | "source": [ 371 | "# 2. 创建DataLoader,用来取出一个batch的数据\n", 372 | "\n", 373 | "TensorDataset 可以用来对 tensor 进行打包,就好像 python 中的 zip 功能。\n", 374 | "\n", 375 | "该类通过每一个 tensor 的第一个维度进行索引,所以该类中的 tensor 第一维度必须相等,且TensorDataset 中的参数必须是 tensor类型。\n", 376 | "\n", 377 | "RandomSampler对数据集随机采样。\n", 378 | "SequentialSampler按顺序对数据集采样。" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "metadata": { 384 | "id": "VBiQ9Bcg2RO4", 385 | "colab_type": "code", 386 | "colab": {} 387 | }, 388 | "source": [ 389 | "train_data = TensorDataset(train_inputs, train_masks, train_labels)\n", 390 | "train_sampler = RandomSampler(train_data)\n", 391 | "train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)\n", 392 | "\n", 393 | "test_data = TensorDataset(test_inputs, test_masks, test_labels)\n", 394 | "test_sampler = SequentialSampler(test_data)  # 测试集按顺序采样即可\n", 395 | "test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE)" 396 | ], 397 | "execution_count": 33, 398 | "outputs": [] 399 | }, 400 | { 401 | "cell_type": "markdown", 402 | "metadata": { 403 | "id": "hJ1tYqiKnqGj", 404 | "colab_type": "text" 405 | }, 406 | "source": [ 407 | "查看一下train_dataloader的内容:" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "metadata": { 413 | "id": "FGWOL5ED2RI-", 414 | "colab_type": "code", 415 | "colab": { 416 | "base_uri": "https://localhost:8080/", 417 | "height": 53 418 | }, 419 | "outputId": "b14082c0-69f7-4f97-9074-bcdda3d34ab3" 420 | }, 421 | "source": [ 422 | "for i, (train, mask, label) in enumerate(train_dataloader): # torch.Size([16, 128]) torch.Size([16, 128]) torch.Size([16, 1])\n", 423 | "    print(train.shape, mask.shape, label.shape)\n", 424 | "    break\n", 425 | "\n", 426 | "print('len(train_dataloader) = ', len(train_dataloader)) # 500" 427 | ], 428 | "execution_count": 34, 429 | "outputs": [ 430 | { 431 | "output_type": "stream", 432 | "text": [ 433 | "torch.Size([16, 128]) torch.Size([16, 128]) torch.Size([16, 1])\n", 434 | "len(train_dataloader) =  500\n" 435 | ], 436 | "name": "stdout" 437 | } 438 | ] 439 | }, 440 | { 441 | "cell_type": "markdown", 442 | "metadata": { 443 | "id": "Z6PXyoEvofak", 444 | "colab_type": "text" 445 | }, 446 | "source": [ 447 | "# 3. 
创建模型、优化器\n", 448 | "\n", 449 | "## 3.1 创建模型" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "metadata": { 455 | "id": "JfcaltZp2RCg", 456 | "colab_type": "code", 457 | "colab": { 458 | "base_uri": "https://localhost:8080/", 459 | "height": 1000 460 | }, 461 | "outputId": "1601dfa1-2f0b-41bb-c488-b19f050194b1" 462 | }, 463 | "source": [ 464 | "model = BertForSequenceClassification.from_pretrained(model_name, num_labels = 2) # num_labels表示2个分类,好评和差评\n", 465 | "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", 466 | "model.to(device)" 467 | ], 468 | "execution_count": 35, 469 | "outputs": [ 470 | { 471 | "output_type": "stream", 472 | "text": [ 473 | "Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']\n", 474 | "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).\n", 475 | "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", 476 | "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.weight', 'classifier.bias']\n", 477 | "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" 478 | ], 479 | "name": "stderr" 480 | }, 481 | { 482 | "output_type": "execute_result", 483 | "data": { 484 | "text/plain": [ 485 | "BertForSequenceClassification(\n", 486 | " (bert): BertModel(\n", 487 | " (embeddings): BertEmbeddings(\n", 488 | " (word_embeddings): Embedding(21128, 768, padding_idx=0)\n", 489 | " (position_embeddings): Embedding(512, 768)\n", 490 | " (token_type_embeddings): Embedding(2, 768)\n", 491 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 492 | " (dropout): Dropout(p=0.1, inplace=False)\n", 493 | " )\n", 494 | " (encoder): BertEncoder(\n", 495 | " (layer): ModuleList(\n", 496 | " (0): BertLayer(\n", 497 | " (attention): BertAttention(\n", 498 | " (self): BertSelfAttention(\n", 499 | " (query): Linear(in_features=768, out_features=768, bias=True)\n", 500 | " (key): Linear(in_features=768, out_features=768, bias=True)\n", 501 | " (value): Linear(in_features=768, out_features=768, bias=True)\n", 502 | " (dropout): Dropout(p=0.1, inplace=False)\n", 503 | " )\n", 504 | " (output): BertSelfOutput(\n", 505 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n", 506 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 507 | " (dropout): Dropout(p=0.1, inplace=False)\n", 508 | " )\n", 509 | " )\n", 510 | " (intermediate): BertIntermediate(\n", 511 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", 512 | " )\n", 513 | " (output): BertOutput(\n", 514 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", 515 | " (LayerNorm): LayerNorm((768,), eps=1e-12, 
elementwise_affine=True)\n", 516 | " (dropout): Dropout(p=0.1, inplace=False)\n", 517 | " )\n", 518 | " )\n", 519 | " (1): BertLayer(\n", 520 | " (attention): BertAttention(\n", 521 | " (self): BertSelfAttention(\n", 522 | " (query): Linear(in_features=768, out_features=768, bias=True)\n", 523 | " (key): Linear(in_features=768, out_features=768, bias=True)\n", 524 | " (value): Linear(in_features=768, out_features=768, bias=True)\n", 525 | " (dropout): Dropout(p=0.1, inplace=False)\n", 526 | " )\n", 527 | " (output): BertSelfOutput(\n", 528 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n", 529 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 530 | " (dropout): Dropout(p=0.1, inplace=False)\n", 531 | " )\n", 532 | " )\n", 533 | " (intermediate): BertIntermediate(\n", 534 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", 535 | " )\n", 536 | " (output): BertOutput(\n", 537 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", 538 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 539 | " (dropout): Dropout(p=0.1, inplace=False)\n", 540 | " )\n", 541 | " )\n", 542 | " (2): BertLayer(\n", 543 | " (attention): BertAttention(\n", 544 | " (self): BertSelfAttention(\n", 545 | " (query): Linear(in_features=768, out_features=768, bias=True)\n", 546 | " (key): Linear(in_features=768, out_features=768, bias=True)\n", 547 | " (value): Linear(in_features=768, out_features=768, bias=True)\n", 548 | " (dropout): Dropout(p=0.1, inplace=False)\n", 549 | " )\n", 550 | " (output): BertSelfOutput(\n", 551 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n", 552 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 553 | " (dropout): Dropout(p=0.1, inplace=False)\n", 554 | " )\n", 555 | " )\n", 556 | " (intermediate): BertIntermediate(\n", 557 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", 558 | " )\n", 559 | " (output): BertOutput(\n", 560 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", 561 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 562 | " (dropout): Dropout(p=0.1, inplace=False)\n", 563 | " )\n", 564 | " )\n", 565 | " (3): BertLayer(\n", 566 | " (attention): BertAttention(\n", 567 | " (self): BertSelfAttention(\n", 568 | " (query): Linear(in_features=768, out_features=768, bias=True)\n", 569 | " (key): Linear(in_features=768, out_features=768, bias=True)\n", 570 | " (value): Linear(in_features=768, out_features=768, bias=True)\n", 571 | " (dropout): Dropout(p=0.1, inplace=False)\n", 572 | " )\n", 573 | " (output): BertSelfOutput(\n", 574 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n", 575 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 576 | " (dropout): Dropout(p=0.1, inplace=False)\n", 577 | " )\n", 578 | " )\n", 579 | " (intermediate): BertIntermediate(\n", 580 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", 581 | " )\n", 582 | " (output): BertOutput(\n", 583 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", 584 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 585 | " (dropout): Dropout(p=0.1, inplace=False)\n", 586 | " )\n", 587 | " )\n", 588 | " (4): BertLayer(\n", 589 | " (attention): BertAttention(\n", 590 | " (self): BertSelfAttention(\n", 591 | " (query): Linear(in_features=768, out_features=768, bias=True)\n", 592 | " (key): Linear(in_features=768, 
out_features=768, bias=True)\n", 593 | " (value): Linear(in_features=768, out_features=768, bias=True)\n", 594 | " (dropout): Dropout(p=0.1, inplace=False)\n", 595 | " )\n", 596 | " (output): BertSelfOutput(\n", 597 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n", 598 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 599 | " (dropout): Dropout(p=0.1, inplace=False)\n", 600 | " )\n", 601 | " )\n", 602 | " (intermediate): BertIntermediate(\n", 603 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", 604 | " )\n", 605 | " (output): BertOutput(\n", 606 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", 607 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 608 | " (dropout): Dropout(p=0.1, inplace=False)\n", 609 | " )\n", 610 | " )\n", 611 | " (5): BertLayer(\n", 612 | " (attention): BertAttention(\n", 613 | " (self): BertSelfAttention(\n", 614 | " (query): Linear(in_features=768, out_features=768, bias=True)\n", 615 | " (key): Linear(in_features=768, out_features=768, bias=True)\n", 616 | " (value): Linear(in_features=768, out_features=768, bias=True)\n", 617 | " (dropout): Dropout(p=0.1, inplace=False)\n", 618 | " )\n", 619 | " (output): BertSelfOutput(\n", 620 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n", 621 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 622 | " (dropout): Dropout(p=0.1, inplace=False)\n", 623 | " )\n", 624 | " )\n", 625 | " (intermediate): BertIntermediate(\n", 626 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", 627 | " )\n", 628 | " (output): BertOutput(\n", 629 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", 630 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 631 | " (dropout): Dropout(p=0.1, inplace=False)\n", 632 | " )\n", 633 | " )\n", 634 | " (6): BertLayer(\n", 635 | " (attention): BertAttention(\n", 636 | " (self): BertSelfAttention(\n", 637 | " (query): Linear(in_features=768, out_features=768, bias=True)\n", 638 | " (key): Linear(in_features=768, out_features=768, bias=True)\n", 639 | " (value): Linear(in_features=768, out_features=768, bias=True)\n", 640 | " (dropout): Dropout(p=0.1, inplace=False)\n", 641 | " )\n", 642 | " (output): BertSelfOutput(\n", 643 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n", 644 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 645 | " (dropout): Dropout(p=0.1, inplace=False)\n", 646 | " )\n", 647 | " )\n", 648 | " (intermediate): BertIntermediate(\n", 649 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", 650 | " )\n", 651 | " (output): BertOutput(\n", 652 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", 653 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 654 | " (dropout): Dropout(p=0.1, inplace=False)\n", 655 | " )\n", 656 | " )\n", 657 | " (7): BertLayer(\n", 658 | " (attention): BertAttention(\n", 659 | " (self): BertSelfAttention(\n", 660 | " (query): Linear(in_features=768, out_features=768, bias=True)\n", 661 | " (key): Linear(in_features=768, out_features=768, bias=True)\n", 662 | " (value): Linear(in_features=768, out_features=768, bias=True)\n", 663 | " (dropout): Dropout(p=0.1, inplace=False)\n", 664 | " )\n", 665 | " (output): BertSelfOutput(\n", 666 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n", 667 | " (LayerNorm): LayerNorm((768,), eps=1e-12, 
elementwise_affine=True)\n", 668 | " (dropout): Dropout(p=0.1, inplace=False)\n", 669 | " )\n", 670 | " )\n", 671 | " (intermediate): BertIntermediate(\n", 672 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", 673 | " )\n", 674 | " (output): BertOutput(\n", 675 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", 676 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 677 | " (dropout): Dropout(p=0.1, inplace=False)\n", 678 | " )\n", 679 | " )\n", 680 | " (8): BertLayer(\n", 681 | " (attention): BertAttention(\n", 682 | " (self): BertSelfAttention(\n", 683 | " (query): Linear(in_features=768, out_features=768, bias=True)\n", 684 | " (key): Linear(in_features=768, out_features=768, bias=True)\n", 685 | " (value): Linear(in_features=768, out_features=768, bias=True)\n", 686 | " (dropout): Dropout(p=0.1, inplace=False)\n", 687 | " )\n", 688 | " (output): BertSelfOutput(\n", 689 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n", 690 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 691 | " (dropout): Dropout(p=0.1, inplace=False)\n", 692 | " )\n", 693 | " )\n", 694 | " (intermediate): BertIntermediate(\n", 695 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", 696 | " )\n", 697 | " (output): BertOutput(\n", 698 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", 699 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 700 | " (dropout): Dropout(p=0.1, inplace=False)\n", 701 | " )\n", 702 | " )\n", 703 | " (9): BertLayer(\n", 704 | " (attention): BertAttention(\n", 705 | " (self): BertSelfAttention(\n", 706 | " (query): Linear(in_features=768, out_features=768, bias=True)\n", 707 | " (key): Linear(in_features=768, out_features=768, bias=True)\n", 708 | " (value): Linear(in_features=768, out_features=768, bias=True)\n", 709 | " (dropout): Dropout(p=0.1, inplace=False)\n", 710 | " )\n", 711 | " (output): BertSelfOutput(\n", 712 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n", 713 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 714 | " (dropout): Dropout(p=0.1, inplace=False)\n", 715 | " )\n", 716 | " )\n", 717 | " (intermediate): BertIntermediate(\n", 718 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", 719 | " )\n", 720 | " (output): BertOutput(\n", 721 | " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", 722 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 723 | " (dropout): Dropout(p=0.1, inplace=False)\n", 724 | " )\n", 725 | " )\n", 726 | " (10): BertLayer(\n", 727 | " (attention): BertAttention(\n", 728 | " (self): BertSelfAttention(\n", 729 | " (query): Linear(in_features=768, out_features=768, bias=True)\n", 730 | " (key): Linear(in_features=768, out_features=768, bias=True)\n", 731 | " (value): Linear(in_features=768, out_features=768, bias=True)\n", 732 | " (dropout): Dropout(p=0.1, inplace=False)\n", 733 | " )\n", 734 | " (output): BertSelfOutput(\n", 735 | " (dense): Linear(in_features=768, out_features=768, bias=True)\n", 736 | " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 737 | " (dropout): Dropout(p=0.1, inplace=False)\n", 738 | " )\n", 739 | " )\n", 740 | " (intermediate): BertIntermediate(\n", 741 | " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", 742 | " )\n", 743 | " (output): BertOutput(\n", 744 | " (dense): Linear(in_features=3072, out_features=768, 
bias=True)\n", 745 | "            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 746 | "            (dropout): Dropout(p=0.1, inplace=False)\n", 747 | "          )\n", 748 | "        )\n", 749 | "        (11): BertLayer(\n", 750 | "          (attention): BertAttention(\n", 751 | "            (self): BertSelfAttention(\n", 752 | "              (query): Linear(in_features=768, out_features=768, bias=True)\n", 753 | "              (key): Linear(in_features=768, out_features=768, bias=True)\n", 754 | "              (value): Linear(in_features=768, out_features=768, bias=True)\n", 755 | "              (dropout): Dropout(p=0.1, inplace=False)\n", 756 | "            )\n", 757 | "            (output): BertSelfOutput(\n", 758 | "              (dense): Linear(in_features=768, out_features=768, bias=True)\n", 759 | "              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 760 | "              (dropout): Dropout(p=0.1, inplace=False)\n", 761 | "            )\n", 762 | "          )\n", 763 | "          (intermediate): BertIntermediate(\n", 764 | "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n", 765 | "          )\n", 766 | "          (output): BertOutput(\n", 767 | "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n", 768 | "            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", 769 | "            (dropout): Dropout(p=0.1, inplace=False)\n", 770 | "          )\n", 771 | "        )\n", 772 | "      )\n", 773 | "    )\n", 774 | "    (pooler): BertPooler(\n", 775 | "      (dense): Linear(in_features=768, out_features=768, bias=True)\n", 776 | "      (activation): Tanh()\n", 777 | "    )\n", 778 | "  )\n", 779 | "  (dropout): Dropout(p=0.1, inplace=False)\n", 780 | "  (classifier): Linear(in_features=768, out_features=2, bias=True)\n", 781 | ")" 782 | ] 783 | }, 784 | "metadata": { 785 | "tags": [] 786 | }, 787 | "execution_count": 35 788 | } 789 | ] 790 | }, 791 | { 792 | "cell_type": "markdown", 793 | "metadata": { 794 | "id": "V6q1Bw6CqZk8", 795 | "colab_type": "text" 796 | }, 797 | "source": [ 798 | "## 3.2 定义优化器\n", 799 | "\n", 800 | "参数eps是为了提高数值稳定性而添加到分母的一个项(默认: 1e-8)。\n" 801 | ] 802 | }, 803 | { 804 | "cell_type": "code", 805 | "metadata": { 806 | "id": "KTVRhp7ipUfe", 807 | "colab_type": "code", 808 | "colab": {} 809 | }, 810 | "source": [ 811 | "optimizer = AdamW(model.parameters(), lr = learning_rate, eps = epsilon)" 812 | ], 813 | "execution_count": 36, 814 | "outputs": [] 815 | }, 816 | { 817 | "cell_type": "markdown", 818 | "metadata": { 819 | "id": "xiOZPF6P4pMP", 820 | "colab_type": "text" 821 | }, 822 | "source": [ 823 | "更通用的写法:bias和LayerNorm.weight不使用权重衰减" 824 | ] 825 | }, 826 | { 827 | "cell_type": "code", 828 | "metadata": { 829 | "id": "2lW1VZyWqzYW", 830 | "colab_type": "code", 831 | "colab": {} 832 | }, 833 | "source": [ 834 | "no_decay = ['bias', 'LayerNorm.weight']\n", 835 | "optimizer_grouped_parameters = [\n", 836 | "    {'params' : [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],\n", 837 | "     'weight_decay' : weight_decay},\n", 838 | "    {'params' : [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],\n", 839 | "     'weight_decay' : 0.0}\n", 840 | "]\n", 841 | "\n", 842 | "optimizer = AdamW(optimizer_grouped_parameters, lr = learning_rate, eps = epsilon)" 843 | ], 844 | "execution_count": 42, 845 | "outputs": [] 846 | }, 847 | { 848 | "cell_type": "markdown", 849 | "metadata": { 850 | "id": "VXe2LYE04Ri8", 851 | "colab_type": "text" 852 | }, 853 | "source": [ 854 | "## 3.3 学习率预热,训练时先从小的学习率开始训练" 855 | ] 856 | },
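To make the warmup/decay schedule concrete before wiring it up, here is a standalone sketch (illustrative, not part of the notebook's cells) that re-computes the multiplier `get_linear_schedule_with_warmup` applies to the base learning rate; with `num_warmup_steps = 0` it reduces to a pure linear decay:

```python
# With num_warmup_steps = 0, the scheduler multiplies the base lr by
# (total_steps - step) / total_steps, so 2e-5 decays linearly to 0.
total_steps = 500 * 2            # 500 batches per epoch * 2 epochs
for step in (0, 250, 500, 1000):
    factor = max(0.0, (total_steps - step) / total_steps)
    print(step, 2e-5 * factor)   # 2e-05, 1.5e-05, 1e-05, 0.0
```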
857 | { 858 | "cell_type": "code", 859 | "metadata": { 860 | "id": "XhYuopvX2yhB", 861 | "colab_type": "code", 862 | "colab": {} 863 | }, 864 | "source": [ 865 | "epochs = 2\n", 866 | "# training steps 的数量: [number of batches] x [number of epochs].\n", 867 | "total_steps = len(train_dataloader) * epochs\n", 868 | "\n", 869 | "# 设计 learning rate scheduler.\n", 870 | "scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, \n", 871 | "                                            num_training_steps = total_steps)" 872 | ], 873 | "execution_count": 44, 874 | "outputs": [] 875 | }, 876 | { 877 | "cell_type": "markdown", 878 | "metadata": { 879 | "id": "c2-LLDoW6cZ7", 880 | "colab_type": "text" 881 | }, 882 | "source": [ 883 | "# 4. 训练、评估模型\n", 884 | "\n", 885 | "## 4.1 模型准确率\n" 886 | ] 887 | }, 888 | { 889 | "cell_type": "code", 890 | "metadata": { 891 | "id": "WFl32Ko724PI", 892 | "colab_type": "code", 893 | "colab": {} 894 | }, 895 | "source": [ 896 | "def binary_acc(preds, labels):\n", 897 | "    correct = torch.eq(torch.max(preds, dim=1)[1], labels.flatten()).float()  # eq里面的两个参数的shape=torch.Size([16])\n", 898 | "    acc = correct.sum().item() / len(correct)\n", 899 | "    return acc" 900 | ], 901 | "execution_count": 45, 902 | "outputs": [] 903 | }, 904 | { 905 | "cell_type": "markdown", 906 | "metadata": { 907 | "id": "1i32sHMh_GqR", 908 | "colab_type": "text" 909 | }, 910 | "source": [ 911 | "## 4.2 计算模型运行时间" 912 | ] 913 | }, 914 | { 915 | "cell_type": "code", 916 | "metadata": { 917 | "id": "L2CJDlV4-T2z", 918 | "colab_type": "code", 919 | "colab": {} 920 | }, 921 | "source": [ 922 | "import time\n", 923 | "import datetime\n", 924 | "\n", 925 | "def format_time(elapsed):\n", 926 | "    elapsed_rounded = int(round(elapsed))\n", 927 | "    return str(datetime.timedelta(seconds = elapsed_rounded))  # 返回 hh:mm:ss 形式的时间" 928 | ], 929 | "execution_count": 46, 930 | "outputs": [] 931 | }, 932 | { 933 | "cell_type": "markdown", 934 | "metadata": { 935 | "id": "92O44C1-_0G-", 936 | "colab_type": "text" 937 | }, 938 | "source": [ 939 | "## 4.3 训练模型\n", 940 | "\n", 941 | "- 传入model的参数必须是tensor类型的;\n", 942 | "\n", 943 | "- nn.utils.clip_grad_norm_(parameters, max_norm, norm_type=2)是梯度裁剪方法,用于缓解神经网络训练中的梯度爆炸问题(而不是过拟合);\n", 944 | "\n", 945 | "输入是(NN参数,最大梯度范数,范数类型=2) 一般默认为L2 范数;\n", 946 | "\n", 947 | "Tip: 注意这个方法只在训练的时候使用,在测试的时候不用;" 948 | ] 949 | }, 950 | { 951 | "cell_type": "code", 952 | "metadata": { 953 | "id": "iGUJfpT6_rjg", 954 | "colab_type": "code", 955 | "colab": {} 956 | }, 957 | "source": [ 958 | "def train(model, optimizer):\n", 959 | "    t0 = time.time()\n", 960 | "    avg_loss, avg_acc = [],[]\n", 961 | "\n", 962 | "    model.train()\n", 963 | "    for step, batch in enumerate(train_dataloader):\n", 964 | "\n", 965 | "        # 每隔40个batch 输出一下所用时间.\n", 966 | "        if step % 40 == 0 and not step == 0:\n", 967 | "            elapsed = format_time(time.time() - t0)\n", 968 | "            print('  Batch {:>5,}  of  {:>5,}.    
Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))\n", 969 | "\n", 970 | " b_input_ids, b_input_mask, b_labels = batch[0].long().to(device), batch[1].long().to(device), batch[2].long().to(device)\n", 971 | "\n", 972 | " output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)\n", 973 | " loss, logits = output[0], output[1]\n", 974 | "\n", 975 | " avg_loss.append(loss.item())\n", 976 | "\n", 977 | " acc = binary_acc(logits, b_labels)\n", 978 | " avg_acc.append(acc)\n", 979 | "\n", 980 | " optimizer.zero_grad()\n", 981 | " loss.backward()\n", 982 | " clip_grad_norm_(model.parameters(), 1.0) #大于1的梯度将其设为1.0, 以防梯度爆炸\n", 983 | " optimizer.step() #更新模型参数\n", 984 | " scheduler.step() #更新learning rate\n", 985 | "\n", 986 | " avg_acc = np.array(avg_acc).mean()\n", 987 | " avg_loss = np.array(avg_loss).mean()\n", 988 | " return avg_loss, avg_acc" 989 | ], 990 | "execution_count": 47, 991 | "outputs": [] 992 | }, 993 | { 994 | "cell_type": "markdown", 995 | "metadata": { 996 | "id": "hGIZrUVhBVDR", 997 | "colab_type": "text" 998 | }, 999 | "source": [ 1000 | "此处output的形式为(元组类型,第0个元素是loss值,第1个元素是每个batch中好评和差评的概率):" 1001 | ] 1002 | }, 1003 | { 1004 | "cell_type": "markdown", 1005 | "metadata": { 1006 | "id": "qDLQ9RobB9lt", 1007 | "colab_type": "text" 1008 | }, 1009 | "source": [ 1010 | "```\n", 1011 | "(tensor(0.0210, device='cuda:0', grad_fn=), \n", 1012 | "tensor([[-2.9815, 2.6931],\n", 1013 | " [-3.2380, 3.1935],\n", 1014 | " [-3.0775, 3.0713],\n", 1015 | " [ 3.0191, -2.3689],\n", 1016 | " [ 3.1146, -2.7957],\n", 1017 | " [ 3.7798, -2.7410],\n", 1018 | " [-0.3273, 0.8227],\n", 1019 | " [ 2.5012, -1.5535],\n", 1020 | " [-3.0231, 3.0162],\n", 1021 | " [ 3.4146, -2.5582],\n", 1022 | " [ 3.3104, -2.2134],\n", 1023 | " [ 3.3776, -2.5190],\n", 1024 | " [-2.6513, 2.5108],\n", 1025 | " [-3.3691, 2.9516],\n", 1026 | " [ 3.2397, -2.0473],\n", 1027 | " [-2.8622, 2.7395]], device='cuda:0', grad_fn=))\n", 1028 | "```\n" 1029 | ] 1030 | }, 1031 | { 1032 | "cell_type": "markdown", 1033 | "metadata": { 1034 | "id": "RW6qjSeBBfhk", 1035 | "colab_type": "text" 1036 | }, 1037 | "source": [ 1038 | "## 4.4 评估模型\n", 1039 | "\n", 1040 | "调用model模型时不传入label值。" 1041 | ] 1042 | }, 1043 | { 1044 | "cell_type": "code", 1045 | "metadata": { 1046 | "id": "O5ELFI4bBQei", 1047 | "colab_type": "code", 1048 | "colab": {} 1049 | }, 1050 | "source": [ 1051 | "def evaluate(model):\n", 1052 | " avg_acc = []\n", 1053 | " model.eval() #表示进入测试模式\n", 1054 | "\n", 1055 | " with torch.no_grad():\n", 1056 | " for batch in test_dataloader:\n", 1057 | " b_input_ids, b_input_mask, b_labels = batch[0].long().to(device), batch[1].long().to(device), batch[2].long().to(device)\n", 1058 | "\n", 1059 | " output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)\n", 1060 | "\n", 1061 | " acc = binary_acc(output[0], b_labels)\n", 1062 | " avg_acc.append(acc)\n", 1063 | " avg_acc = np.array(avg_acc).mean()\n", 1064 | " return avg_acc" 1065 | ], 1066 | "execution_count": 49, 1067 | "outputs": [] 1068 | }, 1069 | { 1070 | "cell_type": "markdown", 1071 | "metadata": { 1072 | "id": "-rZvcWp4BytL", 1073 | "colab_type": "text" 1074 | }, 1075 | "source": [ 1076 | "此处output的形式为(元组类型,第0个元素是每个batch中好评和差评的概率):" 1077 | ] 1078 | }, 1079 | { 1080 | "cell_type": "markdown", 1081 | "metadata": { 1082 | "id": "i5ZiiaTjB0iu", 1083 | "colab_type": "text" 1084 | }, 1085 | "source": [ 1086 | "```\n", 1087 | "(tensor([[ 3.8217, -2.7516],\n", 1088 | " [ 2.7585, -2.0853],\n", 1089 | " [-2.9317, 2.9092],\n", 1090 
| " [-3.3724, 3.2597],\n", 1091 | " [-2.8692, 2.6741],\n", 1092 | " [-3.2784, 2.9276],\n", 1093 | " [ 3.4946, -2.8895],\n", 1094 | " [ 3.7855, -2.8623],\n", 1095 | " [-2.2249, 2.4336],\n", 1096 | " [-2.4257, 2.4606],\n", 1097 | " [ 3.3996, -2.5760],\n", 1098 | " [-3.1986, 3.0841],\n", 1099 | " [ 3.6883, -2.9492],\n", 1100 | " [ 3.2883, -2.3600],\n", 1101 | " [ 2.6723, -2.0778],\n", 1102 | " [-3.1868, 3.1106]], device='cuda:0'),)\n", 1103 | "```" 1104 | ] 1105 | }, 1106 | { 1107 | "cell_type": "markdown", 1108 | "metadata": { 1109 | "id": "5EkPkI5eCHRo", 1110 | "colab_type": "text" 1111 | }, 1112 | "source": [ 1113 | "## 4.5 运行训练模型和评估模型" 1114 | ] 1115 | }, 1116 | { 1117 | "cell_type": "code", 1118 | "metadata": { 1119 | "id": "1jVbe00RBSLf", 1120 | "colab_type": "code", 1121 | "colab": { 1122 | "base_uri": "https://localhost:8080/", 1123 | "height": 521 1124 | }, 1125 | "outputId": "cee6bed2-7f35-4b36-ca5e-9be3f0e1501c" 1126 | }, 1127 | "source": [ 1128 | "for epoch in range(epochs):\n", 1129 | "\n", 1130 | " train_loss, train_acc = train(model, optimizer)\n", 1131 | " print('epoch={},训练准确率={},损失={}'.format(epoch, train_acc, train_loss))\n", 1132 | " test_acc = evaluate(model)\n", 1133 | " print(\"epoch={},测试准确率={}\".format(epoch, test_acc))" 1134 | ], 1135 | "execution_count": 50, 1136 | "outputs": [ 1137 | { 1138 | "output_type": "stream", 1139 | "text": [ 1140 | " Batch 40 of 500. Elapsed: 0:00:27.\n", 1141 | " Batch 80 of 500. Elapsed: 0:00:53.\n", 1142 | " Batch 120 of 500. Elapsed: 0:01:20.\n", 1143 | " Batch 160 of 500. Elapsed: 0:01:47.\n", 1144 | " Batch 200 of 500. Elapsed: 0:02:14.\n", 1145 | " Batch 240 of 500. Elapsed: 0:02:40.\n", 1146 | " Batch 280 of 500. Elapsed: 0:03:07.\n", 1147 | " Batch 320 of 500. Elapsed: 0:03:34.\n", 1148 | " Batch 360 of 500. Elapsed: 0:04:01.\n", 1149 | " Batch 400 of 500. Elapsed: 0:04:28.\n", 1150 | " Batch 440 of 500. Elapsed: 0:04:55.\n", 1151 | " Batch 480 of 500. Elapsed: 0:05:22.\n", 1152 | "epoch=0,训练准确率=0.90275,损失=0.2619755164962262\n", 1153 | "epoch=0,测试准确率=0.9325\n", 1154 | " Batch 40 of 500. Elapsed: 0:00:27.\n", 1155 | " Batch 80 of 500. Elapsed: 0:00:53.\n", 1156 | " Batch 120 of 500. Elapsed: 0:01:20.\n", 1157 | " Batch 160 of 500. Elapsed: 0:01:47.\n", 1158 | " Batch 200 of 500. Elapsed: 0:02:14.\n", 1159 | " Batch 240 of 500. Elapsed: 0:02:41.\n", 1160 | " Batch 280 of 500. Elapsed: 0:03:08.\n", 1161 | " Batch 320 of 500. Elapsed: 0:03:35.\n", 1162 | " Batch 360 of 500. Elapsed: 0:04:02.\n", 1163 | " Batch 400 of 500. Elapsed: 0:04:28.\n", 1164 | " Batch 440 of 500. Elapsed: 0:04:55.\n", 1165 | " Batch 480 of 500. Elapsed: 0:05:22.\n", 1166 | "epoch=1,训练准确率=0.953375,损失=0.15345162890665234\n", 1167 | "epoch=1,测试准确率=0.9435\n" 1168 | ], 1169 | "name": "stdout" 1170 | } 1171 | ] 1172 | }, 1173 | { 1174 | "cell_type": "markdown", 1175 | "metadata": { 1176 | "id": "QchQuTWDCM6M", 1177 | "colab_type": "text" 1178 | }, 1179 | "source": [ 1180 | "# 5. 
预测" 1181 | ] 1182 | }, 1183 | { 1184 | "cell_type": "code", 1185 | "metadata": { 1186 | "id": "XpM5amxqCKbk", 1187 | "colab_type": "code", 1188 | "colab": { 1189 | "base_uri": "https://localhost:8080/", 1190 | "height": 125 1191 | }, 1192 | "outputId": "ff51d8f9-7797-480a-d326-f54c1c0eebc9" 1193 | }, 1194 | "source": [ 1195 | "def predict(sen):\n", 1196 | "\n", 1197 | " input_id = convert_text_to_token(tokenizer, sen)\n", 1198 | " input_token = torch.tensor(input_id).long().to(device) #torch.Size([128])\n", 1199 | "\n", 1200 | " atten_mask = [float(i>0) for i in input_id]\n", 1201 | " attention_token = torch.tensor(atten_mask).long().to(device) #torch.Size([128])\n", 1202 | "\n", 1203 | " output = model(input_token.view(1, -1), token_type_ids=None, attention_mask=attention_token.view(1, -1)) #torch.Size([128])->torch.Size([1, 128])否则会报错\n", 1204 | " print(output[0])\n", 1205 | "\n", 1206 | " return torch.max(output[0], dim=1)[1]\n", 1207 | "\n", 1208 | "label = predict('酒店位置难找,环境不太好,隔音差,下次不会再来的。')\n", 1209 | "print('好评' if label==1 else '差评')\n", 1210 | "\n", 1211 | "label = predict('酒店还可以,接待人员很热情,卫生合格,空间也比较大,不足的地方就是没有窗户')\n", 1212 | "print('好评' if label==1 else '差评')\n", 1213 | "\n", 1214 | "label = predict('\"服务各方面没有不周到的地方, 各方面没有没想到的细节\"')\n", 1215 | "print('好评' if label==1 else '差评')" 1216 | ], 1217 | "execution_count": 51, 1218 | "outputs": [ 1219 | { 1220 | "output_type": "stream", 1221 | "text": [ 1222 | "tensor([[ 2.3040, -4.0122]], device='cuda:0', grad_fn=)\n", 1223 | "差评\n", 1224 | "tensor([[-1.5570, 2.5071]], device='cuda:0', grad_fn=)\n", 1225 | "好评\n", 1226 | "tensor([[ 0.3791, -1.3262]], device='cuda:0', grad_fn=)\n", 1227 | "差评\n" 1228 | ], 1229 | "name": "stdout" 1230 | } 1231 | ] 1232 | }, 1233 | { 1234 | "cell_type": "code", 1235 | "metadata": { 1236 | "id": "QsTJVQSrD9XR", 1237 | "colab_type": "code", 1238 | "colab": {} 1239 | }, 1240 | "source": [ 1241 | "" 1242 | ], 1243 | "execution_count": null, 1244 | "outputs": [] 1245 | } 1246 | ] 1247 | } -------------------------------------------------------------------------------- /Bert手写版本+MLM+NSP.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 3", 7 | "language": "python", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 | "version": "3.7.9" 21 | }, 22 | "nbTranslate": { 23 | "displayLangs": [ 24 | "*" 25 | ], 26 | "hotkey": "alt-t", 27 | "langInMainMenu": true, 28 | "sourceLang": "en", 29 | "targetLang": "cn", 30 | "useGoogleTranslate": true 31 | }, 32 | "toc": { 33 | "base_numbering": 1, 34 | "nav_menu": { 35 | "height": "168.991px", 36 | "width": "201.634px" 37 | }, 38 | "number_sections": true, 39 | "sideBar": true, 40 | "skip_h1_title": false, 41 | "title_cell": "Table of Contents", 42 | "title_sidebar": "Contents", 43 | "toc_cell": false, 44 | "toc_position": { 45 | "height": "calc(100% - 180px)", 46 | "left": "10px", 47 | "top": "150px", 48 | "width": "191.094px" 49 | }, 50 | "toc_section_display": true, 51 | "toc_window_display": true 52 | }, 53 | "varInspector": { 54 | "cols": { 55 | "lenName": 16, 56 | "lenType": 16, 57 | "lenVar": 40 58 | }, 59 | "kernels_config": { 60 | "python": { 61 | "delete_cmd_postfix": "", 62 | 
"delete_cmd_prefix": "del ", 63 | "library": "var_list.py", 64 | "varRefreshCmd": "print(var_dic_list())" 65 | }, 66 | "r": { 67 | "delete_cmd_postfix": ") ", 68 | "delete_cmd_prefix": "rm(", 69 | "library": "var_list.r", 70 | "varRefreshCmd": "cat(var_dic_list()) " 71 | } 72 | }, 73 | "position": { 74 | "height": "398px", 75 | "left": "840px", 76 | "right": "20px", 77 | "top": "77px", 78 | "width": "505px" 79 | }, 80 | "types_to_exclude": [ 81 | "module", 82 | "function", 83 | "builtin_function_or_method", 84 | "instance", 85 | "_Feature" 86 | ], 87 | "window_display": false 88 | }, 89 | "colab": { 90 | "name": "Bert手写版本+MLM+NSP.ipynb", 91 | "provenance": [], 92 | "include_colab_link": true 93 | } 94 | }, 95 | "cells": [ 96 | { 97 | "cell_type": "markdown", 98 | "metadata": { 99 | "id": "view-in-github", 100 | "colab_type": "text" 101 | }, 102 | "source": [ 103 | "\"Open" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": { 109 | "id": "RUG6EgDHHOWm" 110 | }, 111 | "source": [ 112 | "# Bert手写版本+MLM+NSP" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "metadata": { 118 | "ExecuteTime": { 119 | "end_time": "2020-10-11T14:16:40.637738Z", 120 | "start_time": "2020-10-11T14:16:39.974576Z" 121 | }, 122 | "id": "1xxvnLCcHOWq" 123 | }, 124 | "source": [ 125 | "import re\n", 126 | "import math\n", 127 | "import torch\n", 128 | "import numpy as np\n", 129 | "from random import *\n", 130 | "import torch.nn as nn\n", 131 | "import torch.nn.functional as F \n", 132 | "import torch.optim as optim\n", 133 | "import torch.utils.data as Data" 134 | ], 135 | "execution_count": null, 136 | "outputs": [] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": { 141 | "id": "7ZA_5gbPHOWz" 142 | }, 143 | "source": [ 144 | "# 数据预处理\n", 145 | "\n", 146 | "## 构造单词表和映射" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "metadata": { 152 | "ExecuteTime": { 153 | "end_time": "2020-10-11T14:16:40.677834Z", 154 | "start_time": "2020-10-11T14:16:40.670965Z" 155 | }, 156 | "code_folding": [], 157 | "id": "xGnYUQDkHOW1" 158 | }, 159 | "source": [ 160 | "text = (\n", 161 | " 'Hello, how are you? I am Romeo.\\n' # R\n", 162 | " 'Hello, Romeo My name is Juliet. Nice to meet you.\\n' # J\n", 163 | " 'Nice to meet you too. How are you today?\\n' # R\n", 164 | " 'Great. My baseball team won the competition.\\n' # J\n", 165 | " 'Oh Congratulations, Juliet\\n' # R\n", 166 | " 'Thank you Romeo\\n' # J\n", 167 | " 'Where are you going today?\\n' # R\n", 168 | " 'I am going shopping. What about you?\\n' # J\n", 169 | " 'I am going to visit my grandmother. 
she is not very well' # R\n", 170 | ")\n", 171 | "sentences = re.sub(\"[.,!?\\\\-]\", '', text.lower()).split('\\n') # filter '.', ',', '?', '!'\n", 172 | "\n", 173 | "# 所有句子的单词list\n", 174 | "word_list = list(set(\" \".join(sentences).split())) # ['hello', 'how', 'are', 'you',...]\n", 175 | "\n", 176 | "# 给单词表中所有单词设置序号\n", 177 | "word2idx = {'[PAD]' : 0, '[CLS]' : 1, '[SEP]' : 2, '[MASK]' : 3}\n", 178 | "for i, w in enumerate(word_list):\n", 179 | " word2idx[w] = i + 4\n", 180 | "\n", 181 | "# 用于 idx 映射回 word\n", 182 | "idx2word = {i: w for i, w in enumerate(word2idx)}\n", 183 | "vocab_size = len(word2idx) # 40\n", 184 | "\n", 185 | "# token: 就是每个单词在词表中的index\n", 186 | "token_list = list() # token_list存储了每一句的token\n", 187 | "for sentence in sentences:\n", 188 | " arr = [word2idx[s] for s in sentence.split()]\n", 189 | " token_list.append(arr)" 190 | ], 191 | "execution_count": null, 192 | "outputs": [] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "metadata": { 197 | "ExecuteTime": { 198 | "end_time": "2020-10-11T14:16:40.718513Z", 199 | "start_time": "2020-10-11T14:16:40.712136Z" 200 | }, 201 | "lang": "en", 202 | "id": "5SfmKM2YHOXA", 203 | "outputId": "266df375-d9d6-4feb-fee3-64cb3b301564" 204 | }, 205 | "source": [ 206 | "print(sentences[1]) # hello romeo my name is juliet nice to meet you\n", 207 | "print(token_list[1]) # [14, 31, 35, 33, 27, 11, 8, 16, 5, 34]" 208 | ], 209 | "execution_count": null, 210 | "outputs": [ 211 | { 212 | "output_type": "stream", 213 | "text": [ 214 | "hello romeo my name is juliet nice to meet you\n", 215 | "[38, 14, 23, 15, 24, 30, 5, 13, 39, 19]\n" 216 | ], 217 | "name": "stdout" 218 | } 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": { 224 | "id": "YRXMsY82HOXM" 225 | }, 226 | "source": [ 227 | "## 设置超参数" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "metadata": { 233 | "ExecuteTime": { 234 | "end_time": "2020-10-11T14:16:41.118824Z", 235 | "start_time": "2020-10-11T14:16:41.113460Z" 236 | }, 237 | "id": "XAGo9iSDHOXN" 238 | }, 239 | "source": [ 240 | "maxlen = 30 # 句子pad到的最大长度,即下面句子中的seq_len\n", 241 | "batch_size = 6 \n", 242 | "\n", 243 | "max_pred = 5 # max tokens of prediction\n", 244 | "n_layers = 6 # Bert中Transformer的层数\n", 245 | "n_heads = 12 # Multi-head的数量\n", 246 | "d_model = 768 # 即embedding_dim\n", 247 | "d_ff = 768*4 # 4*d_model, FeedForward dimension\n", 248 | "d_k = d_v = 64 # dimension of K(=Q), V,是d_model分割成n_heads之后的长度, 768 // 12 = 64\n", 249 | "\n", 250 | "n_segments = 2 # 分隔句子数" 251 | ], 252 | "execution_count": null, 253 | "outputs": [] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": { 258 | "id": "xYnGPYp2HOXU" 259 | }, 260 | "source": [ 261 | "# 实现Dataloader\n", 262 | "\n", 263 | "## 生成data\n", 264 | "\n", 265 | "- 选中语料中所有词的**15%**进行随机mask\n", 266 | "\n", 267 | "- 在确定要Mask掉的单词之后:\n", 268 | "\n", 269 | " - 选中的单词,在80%的概率下被用 [MASK] 来代替\n", 270 | " \n", 271 | " - 选中的单词,在10%的概率下不做mask,用任意非标记词代替\n", 272 | " \n", 273 | " - 选中的单词,在10%的概率下不做mask,仍然保留原来真实的词" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "metadata": { 279 | "ExecuteTime": { 280 | "end_time": "2020-10-11T14:16:41.796706Z", 281 | "start_time": "2020-10-11T14:16:41.777687Z" 282 | }, 283 | "id": "iI0sULeoHOXW" 284 | }, 285 | "source": [ 286 | "# sample IsNext and NotNext to be same in small batch size\n", 287 | "def make_data():\n", 288 | " batch = []\n", 289 | " positive = negative = 0\n", 290 | " while (positive != batch_size / 2) or (negative != batch_size / 2):\n", 291 | " # 
==========================BERT 的 input 表示================================\n", 292 | "        # 随机取两个句子的index\n", 293 | "        tokens_a_index, tokens_b_index = randrange(len(sentences)), randrange(len(sentences)) # sample random index in sentences\n", 294 | "        # 随机取两个句子\n", 295 | "        tokens_a, tokens_b = token_list[tokens_a_index], token_list[tokens_b_index]\n", 296 | "        # Token (没有使用word piece): 单词在词典中的编码 \n", 297 | "        input_ids = [word2idx['[CLS]']] + tokens_a + [word2idx['[SEP]']] + tokens_b + [word2idx['[SEP]']]\n", 298 | "        # Segment: 区分两个句子的编码(上句全为0 (CLS~SEP),下句全为1)\n", 299 | "        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)\n", 300 | "        \n", 301 | "        # ========================== MASK LM ==========================================\n", 302 | "        n_pred = min(max_pred, max(1, int(len(input_ids) * 0.15)))  # 15 % of tokens in one sentence\n", 303 | "        # token在 input_ids 中的下标(不包括[CLS], [SEP])\n", 304 | "        cand_maked_pos = [i for i, token in enumerate(input_ids) \n", 305 | "                          if token != word2idx['[CLS]'] and token != word2idx['[SEP]']] # candidate masked position\n", 306 | "        shuffle(cand_maked_pos)\n", 307 | "        \n", 308 | "        masked_tokens, masked_pos = [], []  # 被mask的tokens,被mask的tokens的索引号\n", 309 | "        for pos in cand_maked_pos[:n_pred]: # 随机mask 15% 的tokens\n", 310 | "            masked_pos.append(pos)\n", 311 | "            masked_tokens.append(input_ids[pos])\n", 312 | "            p = random()  # 选定要mask的词: 只采样一次, 再按 80%/10%/10% 分支\n", 313 | "            if p < 0.8:   # 80%:被真实mask\n", 314 | "                input_ids[pos] = word2idx['[MASK]']\n", 315 | "            elif p < 0.9: # 10%\n", 316 | "                index = randint(0, vocab_size - 1) # random index in vocabulary\n", 317 | "                while index < 4:                   # 不能是 [PAD], [CLS], [SEP], [MASK]\n", 318 | "                    index = randint(0, vocab_size - 1)\n", 319 | "                input_ids[pos] = index # 10%:不做mask,用任意非标记词代替\n", 320 | "            # 还有10%:不做mask,什么也不做\n", 321 | "        \n", 322 | "        # =========================== Paddings ========================================\n", 323 | "        # input_ids全部padding到相同的长度\n", 324 | "        n_pad = maxlen - len(input_ids)\n", 325 | "        input_ids.extend([word2idx['[PAD]']] * n_pad)\n", 326 | "        segment_ids.extend([word2idx['[PAD]']] * n_pad)\n", 327 | "        \n", 328 | "        # zero padding (100% - 15%) tokens\n", 329 | "        if max_pred > n_pred:\n", 330 | "            n_pad = max_pred - n_pred\n", 331 | "            masked_tokens.extend([0] * n_pad)\n", 332 | "            masked_pos.extend([0] * n_pad)\n", 333 | "        \n", 334 | "        # =====================batch添加数据, 让正例 和 负例 数量相同=======================\n", 335 | "        if tokens_a_index + 1 == tokens_b_index and positive < batch_size / 2:\n", 336 | "            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True]) # IsNext\n", 337 | "            positive += 1\n", 338 | "        elif tokens_a_index + 1 != tokens_b_index and negative < batch_size / 2:\n", 339 | "            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False]) # NotNext\n", 340 | "            negative += 1\n", 341 | "    \n", 342 | "    return batch" 343 | ], 344 | "execution_count": null, 345 | "outputs": [] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": { 350 | "ExecuteTime": { 351 | "end_time": "2020-10-11T07:12:34.813381Z", 352 | "start_time": "2020-10-11T07:12:34.807625Z" 353 | }, 354 | "id": "8TdXloLHHOXd" 355 | }, 356 | "source": [ 357 | "调用上面函数:" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "metadata": { 363 | "ExecuteTime": { 364 | "end_time": "2020-10-11T14:16:41.987896Z", 365 | "start_time": "2020-10-11T14:16:41.980913Z" 366 | }, 367 | "id": "bptQwzl7HOXh", 368 | "outputId": "fd09cb98-b360-454c-9a2d-45922f91a25f" 369 | }, 370 | "source": [ 371 | "batch = make_data()\n", 372 | "\n", 373 | 
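"# Sanity check (illustrative): with an even batch_size, exactly half of\n", "# the sampled sentence pairs should be IsNext:\n", "# assert sum(1 for *_, is_next in batch if is_next) == batch_size // 2\n",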
"input_ids, segment_ids, masked_tokens, masked_pos, isNext = zip(*batch) \n", 374 | "print(len(isNext))\n", 375 | "# # 全部要转成LongTensor类型\n", 376 | "# input_ids, segment_ids, masked_tokens, masked_pos, isNext = \\\n", 377 | "# torch.LongTensor(input_ids), torch.LongTensor(segment_ids), torch.LongTensor(masked_tokens), \\\n", 378 | "# torch.LongTensor(masked_pos), torch.LongTensor(isNext)" 379 | ], 380 | "execution_count": null, 381 | "outputs": [ 382 | { 383 | "output_type": "stream", 384 | "text": [ 385 | "6\n" 386 | ], 387 | "name": "stdout" 388 | } 389 | ] 390 | }, 391 | { 392 | "cell_type": "markdown", 393 | "metadata": { 394 | "id": "2nZBtL7dHOXx" 395 | }, 396 | "source": [ 397 | "## 生成DataLoader\n", 398 | "\n", 399 | "- 为了使用dataloader,我们需要定义以下两个function:\n", 400 | "\n", 401 | " - `__len__` function:需要返回整个数据集中有多少个item\n", 402 | " \n", 403 | " - `__get__ `:根据给定的index返回一个item\n", 404 | " \n", 405 | "有了dataloader之后,我们可以轻松随机打乱整个数据集,拿到一个batch的数据等等。" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "metadata": { 411 | "ExecuteTime": { 412 | "end_time": "2020-10-11T14:16:42.699706Z", 413 | "start_time": "2020-10-11T14:16:42.689736Z" 414 | }, 415 | "id": "SrcLTns2HOX2" 416 | }, 417 | "source": [ 418 | "class MyDataSet(Data.Dataset):\n", 419 | " def __init__(self, input_ids, segment_ids, masked_tokens, masked_pos, isNext):\n", 420 | " # 全部要转成LongTensor类型\n", 421 | " self.input_ids = torch.LongTensor(input_ids)\n", 422 | " self.segment_ids = torch.LongTensor(segment_ids)\n", 423 | " self.masked_tokens = torch.LongTensor(masked_tokens) \n", 424 | " self.masked_pos = torch.LongTensor(masked_pos) \n", 425 | " self.isNext = torch.LongTensor(isNext)\n", 426 | " \n", 427 | " def __len__(self):\n", 428 | " return len(self.input_ids)\n", 429 | " \n", 430 | " def __getitem__(self, idx):\n", 431 | " return self.input_ids[idx], self.segment_ids[idx], self.masked_tokens[idx], self.masked_pos[idx], self.isNext[idx]\n", 432 | " \n", 433 | "dataset = MyDataSet(input_ids, segment_ids, masked_tokens, masked_pos, isNext)\n", 434 | "dataloader = Data.DataLoader(dataset, batch_size=batch_size, shuffle=True)" 435 | ], 436 | "execution_count": null, 437 | "outputs": [] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "metadata": { 442 | "ExecuteTime": { 443 | "end_time": "2020-10-11T14:16:42.946111Z", 444 | "start_time": "2020-10-11T14:16:42.932823Z" 445 | }, 446 | "scrolled": false, 447 | "id": "4OpB-jmyHOYC", 448 | "outputId": "82008c10-e44d-4636-c45e-08870a72ff5d" 449 | }, 450 | "source": [ 451 | "print(next(iter(dataloader)))\n", 452 | "print(len(dataloader)) # 就一个batch" 453 | ], 454 | "execution_count": null, 455 | "outputs": [ 456 | { 457 | "output_type": "stream", 458 | "text": [ 459 | "[tensor([[ 1, 36, 23, 9, 16, 33, 3, 18, 2, 31, 21, 30, 2, 0, 0, 0, 0, 0,\n", 460 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", 461 | " [ 1, 36, 23, 9, 16, 33, 3, 18, 2, 22, 8, 6, 13, 28, 23, 34, 3, 24,\n", 462 | " 11, 27, 37, 2, 0, 0, 0, 0, 0, 0, 0, 0],\n", 463 | " [ 1, 22, 8, 6, 3, 35, 12, 19, 2, 5, 13, 39, 19, 10, 25, 26, 19, 17,\n", 464 | " 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", 465 | " [ 1, 38, 14, 23, 15, 24, 30, 5, 13, 39, 19, 2, 38, 14, 23, 15, 24, 30,\n", 466 | " 5, 13, 3, 19, 2, 0, 0, 0, 0, 0, 0, 0],\n", 467 | " [ 1, 29, 26, 19, 6, 3, 2, 22, 8, 6, 32, 35, 3, 19, 2, 0, 0, 0,\n", 468 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", 469 | " [ 1, 38, 14, 23, 15, 24, 30, 5, 13, 39, 19, 2, 5, 13, 39, 19, 10, 25,\n", 470 | " 3, 19, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0]]), tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 471 | " 0, 0, 0, 0, 0, 0],\n", 472 | " [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,\n", 473 | " 0, 0, 0, 0, 0, 0],\n", 474 | " [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,\n", 475 | " 0, 0, 0, 0, 0, 0],\n", 476 | " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,\n", 477 | " 0, 0, 0, 0, 0, 0],\n", 478 | " [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 479 | " 0, 0, 0, 0, 0, 0],\n", 480 | " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,\n", 481 | " 0, 0, 0, 0, 0, 0]]), tensor([[20, 0, 0, 0, 0],\n", 482 | " [ 4, 20, 23, 0, 0],\n", 483 | " [12, 32, 0, 0, 0],\n", 484 | " [14, 14, 39, 0, 0],\n", 485 | " [17, 12, 0, 0, 0],\n", 486 | " [17, 39, 26, 0, 0]]), tensor([[ 6, 0, 0, 0, 0],\n", 487 | " [16, 6, 14, 0, 0],\n", 488 | " [ 6, 4, 0, 0, 0],\n", 489 | " [ 2, 13, 20, 0, 0],\n", 490 | " [ 5, 12, 0, 0, 0],\n", 491 | " [20, 9, 18, 0, 0]]), tensor([1, 0, 0, 0, 1, 1])]\n", 492 | "1\n" 493 | ], 494 | "name": "stdout" 495 | } 496 | ] 497 | }, 498 | { 499 | "cell_type": "markdown", 500 | "metadata": { 501 | "ExecuteTime": { 502 | "end_time": "2020-10-11T11:40:34.174618Z", 503 | "start_time": "2020-10-11T11:40:34.168686Z" 504 | }, 505 | "id": "vci2w21zHOYO" 506 | }, 507 | "source": [ 508 | "# Bert模型\n", 509 | "\n", 510 | "## Embedding" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "metadata": { 516 | "ExecuteTime": { 517 | "end_time": "2020-10-11T14:16:43.711528Z", 518 | "start_time": "2020-10-11T14:16:43.699319Z" 519 | }, 520 | "id": "zLE6v5d5HOYT" 521 | }, 522 | "source": [ 523 | "class BertEmbedding(nn.Module):\n", 524 | " def __init__(self):\n", 525 | " super(BertEmbedding, self).__init__()\n", 526 | " # d_model:即embedding_dim\n", 527 | " # token embedding\n", 528 | " self.tok_embed = nn.Embedding(vocab_size, d_model) \n", 529 | "\n", 530 | " # position embedding: 这里简写了,源码中位置编码使用了sin,cos\n", 531 | "# self.pos_embed = nn.Embedding(maxlen, d_model) \n", 532 | " self.pos_embed = torch.tensor(\n", 533 | " [[pos / (10000.0 ** (i // 2 * 2.0 / d_model)) for i in range(d_model)] for pos in range(maxlen)]\n", 534 | " )\n", 535 | " self.pos_embed[:, 0::2] = torch.sin(self.pos_embed[:, 0::2])\n", 536 | " self.pos_embed[:, 1::2] = torch.cos(self.pos_embed[:, 1::2])\n", 537 | " \n", 538 | " # segment embedding\n", 539 | " self.seg_embed = nn.Embedding(n_segments, d_model) # segment(token type) embedding\n", 540 | "\n", 541 | " # LayerNorm\n", 542 | " self.norm = nn.LayerNorm(d_model)\n", 543 | " \n", 544 | " def forward(self, x, seq): # x 和 pos的shape 都是[batch_size, seq_len]\n", 545 | "\n", 546 | "# seq_len = x.size(1) \n", 547 | "# pos = torch.arange(seq_len, dtype=torch.long)\n", 548 | " # unsqueeze(0): 在索引0处,增加维度--> [1, seq_len]\n", 549 | " # expand: 某个 size=1 的维度上扩展到size\n", 550 | " # expand_as: 把一个tensor变成和函数括号内一样形状的tensor\n", 551 | "# pos = pos.unsqueeze(0).expand_as(x) # [seq_len] -> [batch_size, seq_len]\n", 552 | " \n", 553 | " # 三个embedding相加\n", 554 | " input_embedding = self.tok_embed(x) + nn.Parameter(self.pos_embed, requires_grad=False) + self.seg_embed(seq)\n", 555 | " \n", 556 | " return self.norm(input_embedding)" 557 | ], 558 | "execution_count": null, 559 | "outputs": [] 560 | }, 561 | { 562 | "cell_type": "markdown", 563 | "metadata": { 564 | "ExecuteTime": { 565 | "end_time": "2020-10-11T12:53:38.450646Z", 566 | "start_time": "2020-10-11T12:53:38.427512Z" 567 | }, 568 | "id": "mMIuvYhCHOYh" 569 | }, 570 | "source": [ 
571 | "## 生成mask" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "metadata": { 577 | "ExecuteTime": { 578 | "end_time": "2020-10-11T14:16:44.304800Z", 579 | "start_time": "2020-10-11T14:16:44.299614Z" 580 | }, 581 | "id": "saCDe7HPHOYw" 582 | }, 583 | "source": [ 584 | "# Padding的部分不应该计算概率,所以需要在相应位置设置mask\n", 585 | "# PAD位置(mask为True)的score填充-1e9,使得计算softmax时概率接近0\n", 586 | "# 在计算attention时使用\n", 587 | "def get_attn_pad_mask(seq_q, seq_k):  # seq_q 和 seq_k 的 shape 都是 [batch_size, seq_len]\n", 588 | "    batch_size, seq_len = seq_q.size()\n", 589 | "    # eq(zero) is PAD token\n", 590 | "    pad_attn_mask = seq_q.data.eq(0).unsqueeze(1)  # [batch_size, 1, seq_len]\n", 591 | "    return pad_attn_mask.expand(batch_size, seq_len, seq_len)  # [batch_size, seq_len, seq_len]" 592 | ], 593 | "execution_count": null, 594 | "outputs": [] 595 | }, 596 | { 597 | "cell_type": "markdown", 598 | "metadata": { 599 | "id": "DaYxEVEoHOY8" 600 | }, 601 | "source": [ 602 | "## 构建激活函数" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "metadata": { 608 | "ExecuteTime": { 609 | "end_time": "2020-10-11T14:16:45.236139Z", 610 | "start_time": "2020-10-11T14:16:45.231847Z" 611 | }, 612 | "id": "lButqpZMHOY-" 613 | }, 614 | "source": [ 615 | "def gelu(x):\n", 616 | "    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))" 617 | ], 618 | "execution_count": null, 619 | "outputs": [] 620 | }, 621 | { 622 | "cell_type": "markdown", 623 | "metadata": { 624 | "id": "1lNlLYXHHOZF" 625 | }, 626 | "source": [ 627 | "## 缩放点乘注意力计算\n", 628 | "\n", 629 | "- $\\text{Attention}(Q,K,V) = \\text{softmax}(\\frac{Q \\cdot K^T}{\\sqrt{d_k}}) \\cdot V$" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "metadata": { 635 | "ExecuteTime": { 636 | "end_time": "2020-10-11T14:16:46.062685Z", 637 | "start_time": "2020-10-11T14:16:46.053866Z" 638 | }, 639 | "id": "zRBsXE3vHOZH" 640 | }, 641 | "source": [ 642 | "class ScaledDotProductAttention(nn.Module):\n", 643 | "    \"\"\"\n", 644 | "    Scaled Dot-Product Attention\n", 645 | "    \"\"\"\n", 646 | "    def __init__(self):\n", 647 | "        super(ScaledDotProductAttention, self).__init__()\n", 648 | "        \n", 649 | "    def forward(self, Q, K, V, attn_mask):\n", 650 | "        \"\"\"\n", 651 | "        Args:\n", 652 | "            Q: [batch_size, n_heads, seq_len, d_k]\n", 653 | "            K: [batch_size, n_heads, seq_len, d_k]\n", 654 | "            V: [batch_size, n_heads, seq_len, d_k]\n", 655 | "        Return:\n", 656 | "            self-attention加权后的上下文张量context\n", 657 | "        \"\"\"\n", 658 | "        # [batch_size, n_heads, seq_len, d_k] * [batch_size, n_heads, d_k, seq_len] = [batch_size, n_heads, seq_len, seq_len]\n", 659 | "        score = torch.matmul(Q, K.transpose(-2, -1)) / np.sqrt(d_k)\n", 660 | "        \n", 661 | "        # mask为True的位置是PAD token\n", 662 | "        # PAD位置不应参与注意力计算: 将这些位置的score设为-1e9, softmax后其权重趋近于0\n", 663 | "        score = score.masked_fill_(attn_mask, -1e9)  # mask为True的内容填充-1e9,使得计算softmax时概率接近0\n", 664 | "        \n", 665 | "        attention = F.softmax(score, dim = -1)  # [bz, n_hs, seq_len, seq_len]\n", 666 | "        context = torch.matmul(attention, V)    # [batch_size, n_heads, seq_len, d_k]\n", 667 | "        \n", 668 | "        return context" 669 | ], 670 | "execution_count": null, 671 | "outputs": [] 672 | },
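A quick smoke test of the class above (an illustrative sketch using tiny shapes instead of the real `n_heads`/`seq_len`; it only checks that PAD positions can be masked out and that the output shape matches V):

```python
# 1 batch, 1 head, 3 positions; the last position plays the role of [PAD].
q = k = v = torch.randn(1, 1, 3, d_k)
pad_mask = torch.tensor([[[[False, False, True]]]])  # True = masked (PAD)
context = ScaledDotProductAttention()(q, k, v, pad_mask)
print(context.shape)  # torch.Size([1, 1, 3, 64]) -- same shape as V
```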
{
"cell_type": "markdown",
"metadata": {
"id": "JlRcTyfzHOZQ"
},
"source": [
"## Multi-Head Attention"
]
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2020-10-11T14:16:47.009304Z",
"start_time": "2020-10-11T14:16:46.994583Z"
},
"id": "-8xKRvdBHOZS"
},
"source": [
"class MultiHeadAttention(nn.Module):\n",
"    def __init__(self):\n",
"        super(MultiHeadAttention, self).__init__()\n",
"        self.W_Q = nn.Linear(d_model, d_k * n_heads)  # effectively [d_model, d_model]\n",
"        self.W_K = nn.Linear(d_model, d_k * n_heads)\n",
"        self.W_V = nn.Linear(d_model, d_v * n_heads)\n",
"        # created in __init__ (not in forward) so their weights persist and get trained\n",
"        self.fc = nn.Linear(n_heads * d_v, d_model)\n",
"        self.norm = nn.LayerNorm(d_model)\n",
"\n",
"    def forward(self, Q, K, V, attn_mask):  # Q, K, V: [batch_size, seq_len, d_model], attn_mask: [batch_size, seq_len, seq_len]\n",
"        residual, batch_size = Q, Q.size(0)\n",
"        # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)\n",
"        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # q_s: [batch_size, n_heads, seq_len, d_k]\n",
"        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # k_s: [batch_size, n_heads, seq_len, d_k]\n",
"        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)  # v_s: [batch_size, n_heads, seq_len, d_v]\n",
"\n",
"        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)  # attn_mask: [batch_size, n_heads, seq_len, seq_len]\n",
"\n",
"        # context: [batch_size, n_heads, seq_len, d_v]\n",
"        context = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)\n",
"        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v)  # [batch_size, seq_len, n_heads * d_v]\n",
"\n",
"        output = self.fc(context)\n",
"\n",
"        return self.norm(output + residual)  # [batch_size, seq_len, d_model]"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "ppWRazRTHOZY"
},
"source": [
"## Feed-forward\n",
"\n",
"- Position_wise_Feed_Forward"
]
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2020-10-11T14:16:47.874045Z",
"start_time": "2020-10-11T14:16:47.867372Z"
},
"id": "35A60BrcHOZa"
},
"source": [
"class PoswiseFeedForwardNet(nn.Module):  # feed-forward applied at every position: linear -> GELU -> linear\n",
"    def __init__(self):\n",
"        super(PoswiseFeedForwardNet, self).__init__()\n",
"        self.fc1 = nn.Linear(d_model, d_ff)\n",
"        self.fc2 = nn.Linear(d_ff, d_model)\n",
"\n",
"    def forward(self, x):\n",
"        # [batch_size, seq_len, d_model] -> [batch_size, seq_len, d_ff] -> [batch_size, seq_len, d_model]\n",
"        return self.fc2(gelu(self.fc1(x)))"
],
"execution_count": null,
"outputs": []
},
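{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `view`/`transpose` round trip in `MultiHeadAttention` above is the easiest place to introduce shape bugs; here is a shape-only walk-through with toy dimensions (`B=2, S=5, H=4, W=16` are illustrative, not the notebook's hyper-parameters)."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"import torch\n",
"\n",
"B, S, H, W = 2, 5, 4, 16           # batch, seq_len, n_heads, per-head dim (toy values)\n",
"x = torch.randn(B, S, H * W)       # (B, S, D): output of a W_Q/W_K/W_V projection\n",
"split = x.view(B, S, H, W).transpose(1, 2)                     # (B, H, S, W): one attention problem per head\n",
"merged = split.transpose(1, 2).contiguous().view(B, S, H * W)  # back to (B, S, D)\n",
"print(split.shape, merged.shape)\n",
"print(torch.equal(x, merged))      # True: the split/merge round trip is lossless"
],
"execution_count": null,
"outputs": []
},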
{
"cell_type": "markdown",
"metadata": {
"id": "ZcC2rYvmHOZj"
},
"source": [
"## EncoderLayer\n",
"\n",
"In the original source, `Bidirectional Encoder = Transformer (self-attention)` and\n",
"\n",
"`Transformer = MultiHead_Attention + Feed_Forward with sublayer connection`; the code below omits the sublayer connections."
]
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2020-10-11T14:16:48.545383Z",
"start_time": "2020-10-11T14:16:48.538822Z"
},
"id": "Ut0dr87YHOZl"
},
"source": [
"class EncoderLayer(nn.Module):  # one block: multi-head self-attention followed by the feed-forward net\n",
"    def __init__(self):\n",
"        super(EncoderLayer, self).__init__()\n",
"        self.enc_self_attn = MultiHeadAttention()\n",
"        self.pos_ffn = PoswiseFeedForwardNet()\n",
"\n",
"    def forward(self, enc_inputs, enc_self_attn_mask):\n",
"        enc_outputs = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask)  # enc_inputs serves as Q, K and V\n",
"        enc_outputs = self.pos_ffn(enc_outputs)  # enc_outputs: [batch_size, seq_len, d_model]\n",
"        return enc_outputs"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "fisjyyYVHOZx"
},
"source": [
"## The full BERT model"
]
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2020-10-11T14:16:49.113277Z",
"start_time": "2020-10-11T14:16:49.098177Z"
},
"id": "zEmwiYC0HOZz"
},
"source": [
"class BERT(nn.Module):\n",
"    def __init__(self):\n",
"        super(BERT, self).__init__()\n",
"        self.embedding = BertEmbedding()\n",
"        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])\n",
"        # pooler over the [CLS] position, used for the NSP head\n",
"        self.fc = nn.Sequential(\n",
"            nn.Linear(d_model, d_model),\n",
"            nn.Dropout(0.5),\n",
"            nn.Tanh(),\n",
"        )\n",
"        self.classifier = nn.Linear(d_model, 2)\n",
"        self.linear = nn.Linear(d_model, d_model)\n",
"        self.activ2 = gelu\n",
"        # the MLM decoder fc2 shares its weight with the token embedding\n",
"        embed_weight = self.embedding.tok_embed.weight\n",
"        self.fc2 = nn.Linear(d_model, vocab_size, bias=False)\n",
"        self.fc2.weight = embed_weight\n",
"\n",
"    # input_ids and segment_ids: [batch_size, seq_len]; masked_pos: [batch_size, max_pred]\n",
"    def forward(self, input_ids, segment_ids, masked_pos):\n",
"        output = self.embedding(input_ids, segment_ids)  # [batch_size, seq_len, d_model]\n",
"\n",
"        enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids)  # [batch_size, seq_len, seq_len]\n",
"        for layer in self.layers:  # iterating over the layers stacks multiple transformer blocks\n",
"            output = layer(output, enc_self_attn_mask)  # output: [batch_size, seq_len, d_model]\n",
"\n",
"        # NSP is decided from the first token ([CLS])\n",
"        h_pooled = self.fc(output[:, 0])  # [batch_size, d_model]\n",
"        logits_clsf = self.classifier(h_pooled)  # [batch_size, 2], predicts isNext\n",
"\n",
"        masked_pos = masked_pos[:, :, None].expand(-1, -1, d_model)  # [batch_size, max_pred, d_model]\n",
"        h_masked = torch.gather(output, 1, masked_pos)  # gather the masked positions: [batch_size, max_pred, d_model]\n",
"        h_masked = self.activ2(self.linear(h_masked))  # [batch_size, max_pred, d_model]\n",
"        logits_lm = self.fc2(h_masked)  # [batch_size, max_pred, vocab_size]\n",
"\n",
"        # logits_lm: [batch_size, max_pred, vocab_size], logits_clsf: [batch_size, 2]\n",
"        return logits_lm, logits_clsf"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "WU01ritBHOZ7"
},
"source": [
"## Instantiating the model"
]
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2020-10-11T14:16:50.299667Z",
"start_time": "2020-10-11T14:16:49.733700Z"
},
"id": "T08whxgSHOZ8"
},
"source": [
"model = BERT()\n",
"# note: padded slots in masked_tokens carry id 0, so they are scored too;\n",
"# CrossEntropyLoss(ignore_index=0) would exclude them\n",
"criterion = nn.CrossEntropyLoss()\n",
"optimizer = optim.Adadelta(model.parameters(), lr=0.001)"
],
"execution_count": null,
"outputs": []
},
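{
"cell_type": "markdown",
"metadata": {},
"source": [
"Two quick checks on the freshly built model (assuming the cell above has run): the MLM decoder really shares its weight matrix with the token embedding, and how many trainable parameters the model has."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# weight tying: fc2.weight and tok_embed.weight are the same Parameter object\n",
"print(model.fc2.weight is model.embedding.tok_embed.weight)\n",
"# total number of trainable parameters\n",
"print(sum(p.numel() for p in model.parameters() if p.requires_grad))"
],
"execution_count": null,
"outputs": []
},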
"id": "psNxVxzQHOaF" 880 | }, 881 | "source": [ 882 | "# 训练模型" 883 | ] 884 | }, 885 | { 886 | "cell_type": "code", 887 | "metadata": { 888 | "ExecuteTime": { 889 | "end_time": "2020-10-11T14:17:14.367211Z", 890 | "start_time": "2020-10-11T14:16:50.501600Z" 891 | }, 892 | "id": "jIeAVzETHOaG", 893 | "outputId": "8de8420e-1b36-43d1-f976-25cb1f9f35de" 894 | }, 895 | "source": [ 896 | "for epoch in range(50):\n", 897 | " for input_ids, segment_ids, masked_tokens, masked_pos, isNext in dataloader:\n", 898 | " \n", 899 | " # logits_lm: [batch_size, max_pred, vocab_size]\n", 900 | " # logits_clsf: [batch_size, 2]\n", 901 | " logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos) \n", 902 | " \n", 903 | " loss_lm = criterion(logits_lm.view(-1, vocab_size), masked_tokens.view(-1)) # for masked LM\n", 904 | " loss_lm = (loss_lm.float()).mean()\n", 905 | " \n", 906 | " loss_clsf = criterion(logits_clsf, isNext) # for sentence classification\n", 907 | " loss = loss_lm + loss_clsf\n", 908 | " \n", 909 | " if (epoch + 1) % 10 == 0:\n", 910 | " print('Epoch:', '%04d' % (epoch + 1), 'loss =', '{:.6f}'.format(loss))\n", 911 | " \n", 912 | " optimizer.zero_grad()\n", 913 | " loss.backward()\n", 914 | " optimizer.step()" 915 | ], 916 | "execution_count": null, 917 | "outputs": [ 918 | { 919 | "output_type": "stream", 920 | "text": [ 921 | "Epoch: 0010 loss = 1.908749\n", 922 | "Epoch: 0020 loss = 1.354349\n", 923 | "Epoch: 0030 loss = 1.131212\n", 924 | "Epoch: 0040 loss = 1.091269\n", 925 | "Epoch: 0050 loss = 0.891469\n" 926 | ], 927 | "name": "stdout" 928 | } 929 | ] 930 | }, 931 | { 932 | "cell_type": "markdown", 933 | "metadata": { 934 | "id": "a-veXTwwHOaM" 935 | }, 936 | "source": [ 937 | "# 预测" 938 | ] 939 | }, 940 | { 941 | "cell_type": "code", 942 | "metadata": { 943 | "ExecuteTime": { 944 | "end_time": "2020-10-11T14:19:14.300430Z", 945 | "start_time": "2020-10-11T14:19:14.294011Z" 946 | }, 947 | "id": "K8LHxRe6HOaN", 948 | "outputId": "090a7028-9df7-4b05-d18a-7013c47700d0" 949 | }, 950 | "source": [ 951 | "input_ids, segment_ids, masked_tokens, masked_pos, isNext = batch[1]\n", 952 | "print(text)\n", 953 | "print('================================')\n", 954 | "print([idx2word[w] for w in input_ids if idx2word[w] != '[PAD]'])" 955 | ], 956 | "execution_count": null, 957 | "outputs": [ 958 | { 959 | "output_type": "stream", 960 | "text": [ 961 | "Hello, how are you? I am Romeo.\n", 962 | "Hello, Romeo My name is Juliet. Nice to meet you.\n", 963 | "Nice to meet you too. How are you today?\n", 964 | "Great. My baseball team won the competition.\n", 965 | "Oh Congratulations, Juliet\n", 966 | "Thank you Romeo\n", 967 | "Where are you going today?\n", 968 | "I am going shopping. What about you?\n", 969 | "I am going to visit my grandmother. 
{
"cell_type": "markdown",
"metadata": {
"id": "a-veXTwwHOaM"
},
"source": [
"# Prediction"
]
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2020-10-11T14:19:14.300430Z",
"start_time": "2020-10-11T14:19:14.294011Z"
},
"id": "K8LHxRe6HOaN",
"outputId": "090a7028-9df7-4b05-d18a-7013c47700d0"
},
"source": [
"input_ids, segment_ids, masked_tokens, masked_pos, isNext = batch[1]\n",
"print(text)\n",
"print('================================')\n",
"print([idx2word[w] for w in input_ids if idx2word[w] != '[PAD]'])"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Hello, how are you? I am Romeo.\n",
"Hello, Romeo My name is Juliet. Nice to meet you.\n",
"Nice to meet you too. How are you today?\n",
"Great. My baseball team won the competition.\n",
"Oh Congratulations, Juliet\n",
"Thank you Romeo\n",
"Where are you going today?\n",
"I am going shopping. What about you?\n",
"I am going to visit my grandmother. she is not very well\n",
"================================\n",
"['[CLS]', 'great', 'my', 'baseball', 'team', 'won', '[MASK]', 'competition', '[SEP]', 'i', 'am', 'going', 'to', 'visit', 'my', 'grandmother', '[MASK]', 'is', 'not', 'very', 'well', '[SEP]']\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2020-10-11T14:36:53.929639Z",
"start_time": "2020-10-11T14:36:53.831314Z"
},
"id": "p5hYK0CAHOaW",
"outputId": "debd5cb6-9c45-4e93-8dda-1a3d11e66d03"
},
"source": [
"logits_lm, logits_clsf = model(torch.LongTensor([input_ids]), torch.LongTensor([segment_ids]),\n",
"                               torch.LongTensor([masked_pos]))  # batch of one\n",
"# argmax over the vocab dimension; take the max_pred predictions of the first (only) element\n",
"logits_lm = logits_lm.data.max(2)[1][0].data.numpy()\n",
"print('masked tokens list: ', [pos for pos in masked_tokens if pos != 0])\n",
"print('predict masked tokens list: ', [pos for pos in logits_lm if pos != 0])"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"masked tokens list:  [4, 20, 23]\n",
"predict masked tokens list:  [26, 20, 23]\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2020-10-11T14:37:00.270089Z",
"start_time": "2020-10-11T14:37:00.262266Z"
},
"id": "crYtY_iiHOah",
"outputId": "e329e8ca-4248-459d-90df-ea168743835d"
},
"source": [
"pred = logits_clsf.data.max(1)[1].data.numpy()[0]\n",
"print('isNext : ', True if isNext else False)\n",
"print('predict isNext :', True if pred else False)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"isNext :  False\n",
"predict isNext : False\n"
],
"name": "stdout"
}
]
}
]
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # NLP-Project
2 | 
3 | - LSTM sentiment analysis (PyTorch)
4 | 
--------------------------------------------------------------------------------
/nmt/en-cn/dev_mini.txt:
--------------------------------------------------------------------------------
1 | She put the magazine on the table. 
她把雜誌放在桌上。 2 | Hey, what are you doing here? 嘿,你在這做什麼? 3 | Please keep this secret. 請保守這個秘密。 4 | How could things get worse? 事情怎麼變糟的? 5 | Kyoto and Boston are sister cities. 京都和波士顿是姐妹城市。 6 | Tom mostly kept to himself. 汤姆大多一个人独处。 7 | Do you like music? 你爱音乐吗? 8 | Tell me the reason why she got angry. 告诉我她为什么生气。 9 | When does the game begin? 游戏几点开始? 10 | I passed by her house yesterday. 我昨天經過她家。 11 | Are you all ready? 你們都準備好了嗎? 12 | He's not very strict about this. 他在这方面不是很严格。 13 | She is fond of singing old songs. 她喜歡唱老歌。 14 | I wish to climb Mt. Fuji again. 我希望再爬一次上富士山。 15 | More than twenty boys went there. 超過二十個男孩去了那裡。 16 | Any emotion, if it is sincere, is involuntary. 任何情绪,只要它是真诚的,就说明它是发自内心的自然流露。 17 | How long did you live there? 你住在那裡多久了? 18 | I don't understand German. 我不懂德语。 19 | I don't like her face. 我不喜歡她的臉。 20 | He substituted for his father. 子承父业。 21 | She plays the piano very well. 她鋼琴彈得很好。 22 | You are no better at remembering things than I am. 你记事情的能力并不比我好多少。 23 | Would you lend me some money? 你可以借我一些錢嗎? 24 | She hit the ball hard. 她用力地拍了球。 25 | Please serve him his meal first. 請先為他上菜。 26 | He had kept the secret to himself. 他保守著這個秘密。 27 | He graduated from Harvard University with honors. 他光榮地從哈佛大學畢業了。 28 | Tom started dating Mary three months ago. 汤姆三个月前开始和玛丽约会。 29 | All my homework is done. 我做完了所有的回家作業。 30 | My uncle never writes letters. 我的叔叔從來不寫信。 31 | It's too late to apologize. 现在道歉也迟了。 32 | I gave you a book. 我给了你一本书。 33 | Could you teach me how to play the piano? 您能教我弹钢琴吗? 34 | I only spent three dollars. 我只花了三美元。 35 | I'm sleepy. 我困了。 36 | You're really beautiful. 你真的很漂亮。 37 | Whether you succeed or not depends on your own efforts. 你成功与否取决于你自身的努力。 38 | That's what Tom requested. 那就是汤姆想要的。 39 | Have you been to London before? 你以前去過倫敦嗎? 40 | He doesn't speak our language. 他不會說我們的語言。 41 | I have a computer. 我有一台电脑。 42 | Turn right at the next corner. 在下一個轉角右轉。 43 | I ate caviar. 我吃了魚子醬。 44 | She works as a massage therapist. 她是按摩师。 45 | What do you usually eat for lunch? 你一般中午饭吃什么? 46 | That's a bad day for me. 那天我不行。 47 | I like sports. 我喜歡運動。 48 | She may come. 她可以來。 49 | We expect a lot from him. 我們對他期望很多。 50 | I took a picture of my family. 我為我的家人拍了照片。 51 | She greeted me with a smile. 她用一个微笑迎接了我。 52 | Every time I went to his place, he was studying. 每次我去他住處,他都在讀書。 53 | Only four horses were in the race. 只有四匹馬參加了比賽。 54 | That is your book. 那是你的書。 55 | Tom had a heart attack last year. 汤姆去年得了心脏病。 56 | Any comments are welcome. 欢迎作任何评论。 57 | She spread the butter on the bread. 她把奶油塗在麵包上。 58 | Don't make fun of people. 不要取笑人。 59 | Karuizawa is famous as a summer resort. 輕井澤是著名的避暑勝地。 60 | What's your favorite home-cooked food? 你最喜歡的家常菜是什麼? 61 | It makes me feel sad. 这让我感到沮丧。 62 | This car handles very easily. 这车容易开。 63 | I've never underestimated Tom. 我从没低估汤姆。 64 | Would you please have a look at these papers? 請你看看這些文件。 65 | His life after retirement was unhappy. 他退休後的生活不快樂。 66 | The plane made a perfect landing. 這架飛機完美的著陸了。 67 | It all depends on the weather. 一切都取決於天氣。 68 | We're very different. 我们很不一样。 69 | It is definite that he will go to America. 他肯定要去美国。 70 | I threw away my shoes. 我把自己的鞋子扔掉了。 71 | I'm so excited. 我很激动。 72 | Had he known what was about to happen, he would have changed his plan. 要是他知道会发生什么,他就会改变计划。 73 | You have nice skin. 你的皮膚真好。 74 | My heart was filled with happiness. 我心里充满着快乐。 75 | I didn't see him. 我没见到他。 76 | My parents usually speak to each other in French, even though my mother is a native English speaker. 
我父母通常用法语对话,即使我母亲的母语是英语。 77 | Which do you like better, spring or autumn? 春天和秋天,你更喜欢哪个? 78 | You've been late for school more often than before. 你比以前更容易上課遲到了。 79 | You're partially correct. 你部分正确。 80 | I will give you whatever you want. 我會給你任何你想要的東西。 81 | Women tend to live longer than men. 女人往往比男人活得更長。 82 | They don't seem to be Americans. 他們似乎不是美國人。 83 | She laid the work on him. 她派他去工作了。 84 | They were listening to the radio. 他們在聽收音機。 85 | I always thought that Tom was a bit different. 我以前总认为汤姆是有一些与众不同的。 86 | He can speak French, and obviously English. 他能说法语,很明显还有英语。 87 | Everyone admired his courage. 每個人都佩服他的勇氣。 88 | He did not think he needed their protection. 他认为他不需要他们的保护。 89 | What are you staring at? 你在看什麼? 90 | Give him an inch and he'll take a yard. 得寸进尺。 91 | The cough syrup has a licorice flavoring. 咳嗽糖浆有股甘草的味道。 92 | The meat has gone bad. 這肉已經壞了。 93 | Six divided by two is three. 六除以二得三。 94 | Ask him if he can speak Japanese. 问问他会不会说日语。 95 | If you see a mistake, then please correct it. 如果你发现错误,那就请你纠正它。 96 | What time is it? 几点了? 97 | Come here by ten at the latest. 最晚十點前來這裡。 98 | Why did you show me this? 你为什么让我看这个? 99 | I'm free tonight. 我今晚有空。 100 | I want to be the one who decides. 我想成为决策的人。 101 | Tom has been struck by lightning three times. 湯姆被閃電擊中過三次。 102 | Here is your book. 這是你的書。 103 | Tom, I want to have a chat with you. Tom,我想和你談談。 104 | Tom wrote his name in the sand with a stick. 汤姆用棍子在沙上写了他的名字。 105 | It's as cold as ice. 它冷得像冰一樣。 106 | My brother gave me a cute doll. 我哥哥給了我一個可愛的娃娃。 107 | I was a student at that time. 我当时是学生。 108 | It was not long before we met again by chance. 没多久,我们又碰巧遇到了。 109 | Everybody will die. 人固有一死。 110 | She had the nerve to speak out. 她竟敢說出來。 111 | She was holding a small parasol in her hand. 她手里握着一把小阳伞。 112 | He passed the entrance examination. 他通過了入學考試。 113 | A friend of mine is studying abroad. 我有一位朋友在國外留學。 114 | I get along with my younger brother. 我與我的弟弟相處融洽。 115 | His house is somewhere about here. 他家在这儿某处。 116 | This ticket is good for three days. 這張票的有效期是三天。 117 | Do you have any smaller sizes? 你有任何比較小的尺寸嗎? 118 | I gave careful consideration to the problem. 我仔細地考慮了這個問題。 119 | Answer the question. 回答問題。 120 | I must help my mother. 我必須幫忙我母親。 121 | What's your home address? 你家的地址是什麼? 122 | The prince fell in love with a woodcutter's daughter. 王子愛上了一個樵夫的女兒。 123 | I think you're a really nice guy. 我認為你真的是一個好人。 124 | A burglar broke into his house. 一個竊賊闖進了他的房子。 125 | Jealousy was the motive for the murder. 嫉妒是謀殺的動機。 126 | My sister became a college student. 我妹妹成為了一個大學生。 127 | They are at lunch. 他们在吃午饭。 128 | You should know that's impossible. 你应该知道这是不可能的。 129 | I forgot it in the garage. 我把它忘在车库里了。 130 | I do not allow sleeping in class. 我不允许有人在课上睡觉。 131 | Tom has a lot of experience in computers. 汤姆对电脑有很多经验。 132 | It cost me ten thousand yen to have my television set repaired. 把我的電視機修好花了我一萬日元。 133 | I made some mistakes on the test. 我在考试时犯了些错。 134 | I have already finished the job. 我已經完成了這項工作。 135 | Do you have something to say? 您有什么事要说吗? 136 | They sank ten enemy ships. 他们使10艘敌船沉了 137 | Everything went according to plan. 一切按計劃進行。 138 | Is one thousand yen enough? 1000日元够不够? 139 | Move the chair nearer to the desk. 把椅子挪一挪靠近桌子。 140 | He had no luck in finding work. 他不幸找不到工作。 141 | This chair is ugly. 這把椅子很醜。 142 | He doesn't realise that he's tone deaf. 他不知道他自己五音不全。 143 | He barely passed the examination. 他勉強地通過了考試。 144 | I don't want any excuses. 我不想听解释。 145 | Don't waste your time. 别浪费时间。 146 | He looks a bit tired, doesn't he? 
他看起來有點累,不是嗎? 147 | My brother is a high school student. 我哥哥是個高中生。 148 | It's pretty heavy. 它真重。 149 | Tom didn't write back to Mary. 汤姆没给玛丽写回复。 150 | The meaning of this sentence is obscure. 这句句子意思模糊。 151 | We'll be in Boston for another three weeks. 我們還會在波士頓待三個月。 152 | It would take forever for me to explain everything. 要都解释的话,需要一辈子的时间。 153 | It would take me too much time to explain to you why it's not going to work. 给你解释这为什么行不通要花很多时间。 154 | I try not to think about it. 我試著不去想了。 155 | I don't worry about the risk. 我不担心风险。 156 | Tom was there physically, but not mentally. 汤姆人在心不在。 157 | Is that better? 那更好吗? 158 | The United States borders Canada. 美国与加拿大相邻。 159 | Let's take a ten-minute break. 讓我們休息10分鐘。 160 | I'm as hungry as a horse. 我餓得像匹馬。 161 | So you give up, right? 所以你放弃了,是吗? 162 | I'd like a glass of water, please. 請給我一杯水。 163 | I hear they're pretty good. 我听说他们挺好。 164 | I would like fruit juice. 我想要果汁。 165 | He's accustomed to traveling. 他習慣了旅行。 166 | I started thinking about Tom. 我开始想起汤姆。 167 | Many attended his funeral. 很多人都参加了他的葬礼。 168 | My baby began crying, asking for milk. 我的宝宝开始哭了,他想要吃奶。 169 | Don't walk alone after dark. 不要一個人在黑暗中走。 170 | He ate all of the apple. 他吃了所有的蘋果。 171 | We'll save a seat for you. 我们会给你留个位置。 172 | Training will be provided. 会有训练。 173 | It was been raining since early morning. 從清晨開始一直下雨。 174 | My parents' generation went through the war. 我父母那一代经历过战争。 175 | Please be quiet, everybody. 请大家都保持安静。 176 | His little brother is a famous soccer player. 他弟弟是个有名的足球选手。 177 | Let's go by bus. 讓我們坐公共汽車去。 178 | What's that bird called? 那隻鳥叫什麼名字? 179 | Would you like to come? 你愿意来吗? 180 | We want to hear it. 我们想听听。 181 | I'm planning to study tonight. 我打算今天晚上讀書。 182 | There was scarcely any money left. 几乎没剩下钱。 183 | You can drive a car, can't you? 你會開車,不是嗎? 184 | Tears rolled down my cheeks. 泪水沿着我的面颊流了下来。 185 | He suddenly became very happy. 他突然變得非常開心。 186 | I am sixteen years old. 我16岁了。 187 | I had a nice chat with her. 我和她聊得很愉快。 188 | I tried to convince Tom to come home. 我试着说服汤姆回家。 189 | I don't know how to handle children. 我不知道如何對待孩子。 190 | He has been to many places. 他去过很多地方。 191 | I put the money into the safe. 我把錢放入保險櫃裡。 192 | Someone is watching you. 有人在看著你。 193 | Nobody had ever heard of it. 以前從來沒有人聽說過。 194 | "He'd like to have a coffee after work." "I would too." "他想在下班後喝杯咖啡。" "我也想。" 195 | There are not many books on these shelves. 这些书架上没有很多书。 196 | Spring is my favorite season. 春天是我最喜爱的季节。 197 | You don't go to school on Sunday, do you? 你週日不上學, 對嗎? 198 | They referred to Chaucer as the father of English poetry. 他們視喬叟為英詩之父。 199 | He said nothing, which made her angry. 他什么也没说,这让她很生气。 200 | I slapped his face. 我摑了他的臉。 201 | -------------------------------------------------------------------------------- /nmt/en-cn/test_mini.txt: -------------------------------------------------------------------------------- 1 | He knows better than to marry her. 他聰明到不會娶她。 2 | He had hoped to succeed, but he didn't. 他本希望可以成功,但是他没有。 3 | This is the worst movie I have ever seen. 这是我看过的最差劲的电影了。 4 | She's in the bath. 她在浴室。 5 | Man is the only animal that can make use of fire. 人是唯一会使用火的动物。 6 | Every one of her songs was a hit. 她的每首歌都长期备受欢迎。 7 | He said that it would probably rain. 他說很可能會下雨。 8 | I wonder if I hurt Tom's feelings. 我不知道我是不是伤害了汤姆的感情。 9 | You chose this job yourself, right? 你自己选择了这份工作,是吗? 10 | Nobody is too old to learn. 沒有人會因為太老而不能學習。 11 | He drove the car, listening to music on the radio. 他一面開車一面聽收音機的音樂。 12 | I saw it with my own eyes. 
我親眼看見了。 13 | What should I bring? 我该带些什么? 14 | He is no fool. 他没疯。 15 | She went there yesterday. 她昨天去那裡。 16 | What's the forecast for tomorrow? 明天的天气预报怎么说? 17 | She felt blue. 她感到闷闷不乐的。 18 | He climbed the stairs. 他爬上了樓梯。 19 | I told Tom I was OK. 我对汤姆说我很好。 20 | Were you in America last month? 你上個月在美國嗎? 21 | Let's synchronize our watches. 讓我們校對一下錶吧。 22 | How much money do I owe you? 我欠你多少錢? 23 | I am in the habit of taking some exercise before breakfast. 我在吃早餐前有運動的習慣。 24 | We enjoyed ourselves at the party. 我們在派對上玩得很開心。 25 | I just heard something. 我只是听到了一些消息。 26 | Where did you get your camera repaired? 你在哪儿修的照相机? 27 | I am from Shikoku. 我來自四國。 28 | You broke the rules. 你触犯了规则。 29 | He's my new friend. 他是我新交的朋友。 30 | Don't throw out this magazine. I haven't read it yet. 这本杂志不要扔。我还没看呢。 31 | I'd like to work at the cafeteria. 我想在餐廳工作。 32 | The building is seven stories high. 這棟建築物有七層樓高。 33 | She lives in New York. 她住在纽约。 34 | Don't be shy. 不要害羞。 35 | This is the best restaurant that I know. 这是我所知道的最好的餐厅。 36 | Mother made us cheese sandwiches for lunch. 媽媽為我們做了乳酪三明治當午餐。 37 | I am a stranger here. 我是這裡的外地人。 38 | Where's the nearest train station? 最近的火车站在哪里? 39 | How are you? 你們好嗎? 40 | How about going out for lunch? 出去吃午飯怎樣? 41 | I'm doubling my prices. 我正在把价格涨一倍。 42 | She kept me waiting for 30 minutes. 她让我等了30分钟。 43 | Has she finished the book yet? 她讀完這本書了嗎? 44 | I called her up. 我叫她起来。 45 | I'm still angry about that. 我还是为那生气。 46 | Tom picked up the knife. 汤姆拿起了刀。 47 | He must be about forty. 他应该40岁左右吧。 48 | We want to clear up this problem. 我们像澄清这个问题 49 | Many Asians have English as a common language. 許多亞洲人以英語作為共用的語言。 50 | More often than not, she had to go in person. 很多時候她要親自去。 51 | He couldn't hold his temper any longer. 他再也忍不住气了。 52 | I want him to read this. 想让他读这个。 53 | Tom knew how to do that. 汤姆知道怎么做那事。 54 | I want to go wherever you're going. 我想去你要去的地方。 55 | Clean your room. 清掃你的房間。 56 | He grew up to be an engineer. 他長大後成為了一名工程師。 57 | She came from Canada to see me. 她從加拿大來看我。 58 | I was struck by lightning. 我被雷劈了。 59 | Please drive the car more slowly. 請開車開慢一點。 60 | Let's not argue. 我們別吵了。 61 | Let me think. 讓我想一想。 62 | Please feed the dog every day. 请每天喂一下狗。 63 | Tom's eyes were glued to the screen. Tom的眼睛被荧幕吸引住了。 64 | Do you eat it in the classroom? 您在教室裡吃它嗎? 65 | He cleared his throat. 他清了清喉嚨。 66 | The tiger cub looked like a large kitten. 小老虎看起來像隻大貓。 67 | This letter is wrongly addressed. 这封信寄错地方了。 68 | What's wrong, honey? 出什么事了,宝贝? 69 | It's your turn. 輪到你了。 70 | Those are their books. 这是他们的书。 71 | Try it. 试试吧。 72 | Don't change your mind. 不要改變你的心意。 73 | I just cut my finger. 我剛剛切到手指了。 74 | I am getting off at the next station. 我下一站下车。 75 | He got up at five as usual. 他像往常一樣五點鐘起床。 76 | These three pretty girls are all nieces of mine. 这三个漂亮的女孩都是我的侄女。 77 | I am glad that you have succeeded. 我很高兴你们成功了。 78 | He lived abroad for many years. 他居住在國外多年。 79 | Another bottle of wine, please. 麻煩再一瓶葡萄酒。 80 | He asked me two questions. 他問了我兩個問題。 81 | I have finished cleaning my room. 我已經打掃完我的房間了。 82 | Don't let him bite you. 別讓他咬你。 83 | They made fun of Mary. 他們取笑瑪麗。 84 | I think it's time for you to grow up. 我觉得你是时候该懂事了。 85 | I asked him what his name was. 我問了他叫什麼名字。 86 | Feel free to ask any questions. 隨時問任何問題都可以。 87 | Speaking English is useful. 说英语很有用。 88 | She didn't like her husband. 她不喜歡她的丈夫。 89 | He complained about the noise. 他抱怨這個噪音。 90 | You'll never be alone. 你们永远不会一个人的。 91 | If you hurry, you will catch up with him. 如果你快一点,还能赶上他。 92 | It was dark when I reached the hotel. 
我到旅馆的时候,天已经黑了。 93 | All of my friends got asked to dance except me. 除了我,其他人都被邀请去跳舞了。 94 | Where are my books? 我的書在哪? 95 | She's collecting material for a book. 她为一本书收集材料。 96 | He belongs to the camera club. 他參加攝影社。 97 | He took off his overcoat. 他脫掉了大衣。 98 | I swear I didn't do anything. 我发誓我什么也没做。 99 | Cut the chit-chat and get to work. 別閒聊了,開始工作。 100 | He works from nine to five-thirty. 他從早上九點工作到下午五點半。 101 | I think you should take the test again. 我的意见是,你该再次参加测试。 102 | We have to put off the game till next Sunday. 我们不得不把游戏搁到下周日了。 103 | Eat your soup while it is hot. 趁热喝你的汤。 104 | There are almost no books. 幾乎沒有任何書籍。 105 | I have a big dog. 我有条大狗。 106 | Have you finished breakfast yet? 你吃完早飯了嗎? 107 | The phone rang. 電話正在響。 108 | My uncle had me act as a translator. 我叔叔讓我擔任翻譯。 109 | I don't know where he lives. 我不知道他住在哪裡。 110 | I'm a big football fan. 我是足球的忠实粉丝。 111 | There is little water in the pond. 這個池塘裡的水很少。 112 | When did you begin studying English? 您什么时候开始学英语的? 113 | That architect builds very modern houses. 那个建筑师创建了非常现代的房屋。 114 | Can anyone drive? 有人会开车吗? 115 | He saluted the lady. 他向那位女士问好。 116 | He does speak well. 他真的說得很好。 117 | I have no particular reason to do so. 我没有特别的理由去做。 118 | Could you do me a favor? 請你幫我一個忙好嗎? 119 | The space race was an exciting time in history. 太空跑步是历史上激动人心的时刻。 120 | We are at school together. 我们一起在学校。 121 | They called off their engagement. 他們解除了婚約。 122 | Tom is a true man. 汤姆是个真男人。 123 | English is a universal language and is used all over the world. 英語是一種世界性的語言,用於世界各地。 124 | Tom watched TV yesterday. Tom昨天看了电视。 125 | It's not practical. 这不实用。 126 | We have a great team. 我们有个好极了的团队。 127 | No less than three hundred dollars was needed for the work. 这个工作需要不低于300美金。 128 | Beware of the dog! 小心狗! 129 | This is a strange sentence. 這是一個奇怪的句子。 130 | Do you study every day? 你每天都学习吗? 131 | Furniture made of good materials sells well. 用优质材料做的家具卖得很好。 132 | I'm not accustomed to getting up early. 我不習慣早起。 133 | Do you speak Chinese? 你會說中文嗎? 134 | School starts in September in Europe. 歐洲的學校在九月開學。 135 | My grandson is still a baby. 我的孫子還是個嬰兒。 136 | The button came off. 這顆鈕扣脫落了。 137 | He had his hair cut short. 他把头发剪短了。 138 | I'll see you next Wednesday. 我下星期三见你。 139 | He treats me like his slave. 他對待我就像他的奴隸。 140 | She doesn't live there any more. 她不再住在那裡了。 141 | You may choose any of them. 你可以選擇他們之中的任何一個。 142 | She's not as old as Mary. 她沒有瑪麗年紀大。 143 | If you pass this test, you could graduate next month. 如果你通過這個考試,你下個月就可以畢業了。 144 | Neptune is the eighth planet of the solar system. 海王星是太阳系第八个行星。 145 | May I put it down here? 我可以把它放在这儿吗? 146 | Jesus loves you. 耶穌愛你。 147 | That's the point. 这正是问题的关键。 148 | My father died four years ago. 我的父親四年前去世了。 149 | A dolphin is a mammal. 海豚是哺乳動物。 150 | I don't like kids. 我不喜欢小孩。 151 | I recognized your voice right away. 我立刻認出了你的聲音。 152 | Tom used to work here. 汤姆过去在这里工作。 153 | You must clear the table. 你必须把桌子清理干净。 154 | The school needed a new teacher. 學校需要一個新的老師。 155 | Can you do that by yourself? 你自己一個人能做嗎? 156 | I'm really scared of spiders. 我真怕蜘蛛。 157 | I know that she is Spanish. 我知道她是西班牙人。 158 | I suppose I'd better phone Tom. 我想我给汤姆打电话比较好。 159 | He paid 1,000 yen for this book. 他花了1000日元買這本書。 160 | The fire alarm rang. 火警警报响了。 161 | You have already eaten the cake. 你已经把蛋糕吃了。 162 | Which is your book? 哪本是你們的書? 163 | I did not expect it to be that big. 我没想到它有那么大。 164 | I have ten pens more than you do. 我比你多10支钢笔。 165 | Let's get together again! 讓我們再聚在一起! 
166 | According to newspaper reports, there was an airplane accident last evening. 根據報載,有一架飛機昨天晚上發生了意外。 167 | The police assembled a lot of evidence against him. 警察收集了很多对他不利的证据。 168 | Has something good happened? 發生了什麼好事嗎? 169 | A stitch in time saves nine. 小洞及時補,免遭大洞苦。 170 | Tom doesn't have any shoes on. 汤姆没穿鞋。 171 | I'm not sure. 我不确定。 172 | I want to buy this dictionary. 我想買這本字典。 173 | I've gotten better. 我已經變得好多了。 174 | The job is half done. 這項工作已經完成了一半。 175 | Tom told me that he doesn't like carbonated drinks. 汤姆告诉我他不喜欢碳酸饮料。 176 | Tom was unsatisfied with the results. 汤姆对结果不满。 177 | He died soon after the accident. 他在事故後不久就去世了。 178 | Would you please open the door? 請你開門好嗎? 179 | What I want is some peace and quiet. 我所想要的是一点平和和安静。 180 | What time did he say he'd come? 他說他幾點鐘會來? 181 | This meal is adequate for three. 这饭足够三个人吃。 182 | A devastating earthquake hit the state capital. 一場毀滅性的地震襲擊了這個州的首府。 183 | You shouldn't talk back to your parents like that. 你不應該對你父母那樣頂嘴。 184 | I need to know what you think. 我需要知道你怎么想。 185 | Facebook is blocked in China. Facebook在中國是被封鎖的。 186 | Pretend you're me. 假装你是我。 187 | I haven't finished this. 我做不了這個。 188 | Tom came to see if Mary needed any help. 汤姆过来看看玛丽有没有什么需要帮忙的。 189 | You lied to me, didn't you? 你對我說了謊, 沒有嗎? 190 | I'm always forgetting people's names. 我總是忘記別人的名字。 191 | Isn't that mine? 那是我的吗? 192 | "The good die young" is an old saying which may or may not be true. “好人不长命”是句或真或假的老话。 193 | Don't underestimate your own strength. 不要低估自己的实力。 194 | He kept a diary during the trip. 他旅行期间,写了旅游日记。 195 | A true scientist would not approach the question this way. 真正的科学家不会这样去思考。 196 | I'll see to it. 由我来做. 197 | Tom nodded approval. 汤姆点头同意。 198 | Tom denied having stolen the money. Tom否认偷了钱。 199 | Are you going to visit any other countries? 你会去访问其他国家吗? 200 | He wrote to me yesterday. 昨天他寫信給我。 201 | -------------------------------------------------------------------------------- /nmt/en-cn/train_mini.txt: -------------------------------------------------------------------------------- 1 | Anyone can do that. 任何人都可以做到。 2 | How about another piece of cake? 要不要再來一塊蛋糕? 3 | She married him. 她嫁给了他。 4 | I don't like learning irregular verbs. 我不喜欢学习不规则动词。 5 | It's a whole new ball game for me. 這對我來說是個全新的球類遊戲。 6 | He's sleeping like a baby. 他正睡着,像个婴儿一样。 7 | He can play both tennis and baseball. 他既会打网球,又会打棒球。 8 | We should cancel the hike. 我們應該取消這次遠足。 9 | He is good at dealing with children. 他擅長應付小孩子。 10 | She will do her best to be here on time. 她会尽量按时赶来的。 11 | Why are you so good at cooking? 为什么你做饭那么拿手呢? 12 | He has recovered from his bad cold. 他从重感冒中恢复了过来。 13 | It's a dead end. 这是个死胡同。 14 | I rejected the offer. 我拒绝了报价。 15 | He often quotes Milton. 他常引用米爾頓。 16 | Mommy, may I go swimming? 媽咪,我可以去游泳嗎? 17 | Miyazaki is not what it used to be. 宮崎不是它往日的樣子了。 18 | People must love one another. 人要爱他人。 19 | Where do you have pain? 你哪裡痛? 20 | Keep oil away from the fire. 讓油遠離火。 21 | This is never going to end. 这将永远继续下去。 22 | He had a hungry look. 他面有飢色 23 | I don't know which button to push. 我不知道要按哪個按鈕。 24 | He heard the news on the radio. 他從收音機聽到了這個消息。 25 | Don't leave the water running. 不要讓水一直流。 26 | I'm opposed to any type of war. 我反對任何形式的戰爭。 27 | I had a hard day. 我过了难挨的一天。 28 | He will succeed to the throne. 他会继承王位。 29 | Please give me something hot to drink. 請給我一些熱的東西喝。 30 | Have fun. 玩得開心。 31 | The library is on the second floor. 圖書館在二樓。 32 | Look at me with your books closed. 把你的書閤起來看著我。 33 | I still love him. 
我依旧爱着他。 34 | His low salary prevents him from buying the house. 他的低薪水让他买不了房。 35 | Hang your coat on the hook. 把你的外套掛在鉤子上。 36 | I can't understand his feelings. 我不明白他的感受。 37 | She didn't try to hide the truth. 她没有试图掩盖真相。 38 | Dad bought me a camera. 爸爸给我买了一个照相机。 39 | I love her and she loves me. 我愛她,她也愛我。 40 | The boy has learned to read. 男孩學會了閱讀。 41 | You are irresistible. 你是不可抗拒的。 42 | It was heartless of him to say such a thing to the sick man. 他对一个生病的男人说这种事真是没良心。 43 | He calls her up every night. 他每天晚上打電話給她。 44 | They left one after another. 他们一个接着一个地离开了。 45 | Father always has the tailor make his suits. 父親總是讓這位裁縫師為他做西裝。 46 | We got him to carry our bag. 我們讓他幫我們拿袋子。 47 | He drove the truck to Dallas. 他開卡車到達拉斯。 48 | The baby smiled at me. 宝宝对我笑了。 49 | Did he propose to you? 他向你求婚了嗎? 50 | Cheese is made from milk. 奶酪是用奶做成的。 51 | I help my mother with the housework every day. 我每天都帮我妈做家务。 52 | I spoke with him about the matter. 我跟他談過這個問題。 53 | The friend who I thought would pass the exam failed it. 那个我认为会通过考试的朋友失败了。 54 | They are very big apples. 他們是非常大的蘋果。 55 | How dare you! 你敢! 56 | He must have taken the wrong train. 他一定是搭錯火車了。 57 | I understand how to solve the problem. 我明白怎么解决问题。 58 | How many children do you have? 你有幾個小孩? 59 | He no longer lives here. 他不再住在這裡了。 60 | He is the same age as me. 他和我同岁。 61 | Industry as we know it today didn't exist in those days. 我们今天所认识的工业在那个时代不存在。 62 | That boy used to drop in on me. 那男孩儿来访过我家。 63 | I think you'll have very little difficulty in getting a driver's license. 我想你要拿到驾照根本不难。 64 | Run. 你用跑的。 65 | He told me to do it, so I did it. 他让我做,我就做了。 66 | That is not my line. 这不是我拿手的。 67 | She lives in a large house. 她住在一棟大房子裡。 68 | A doctor told me that eating eggs was bad for me. 一位医生告诉过我,吃鸡蛋对我的健康有害。 69 | She asked him to open the window. 她請他打開窗口。 70 | She has a flower in her hand. 她手上有一朵花。 71 | I have just finished my homework. 我剛剛完成我的作業。 72 | You look pale today. 你今天看上去很苍白。 73 | I'm counting how many people there are. 我正在算有多少人在那裡。 74 | I felt that my honor was at stake. 我覺得我的名譽受到了威脅。 75 | Money isn't the only thing that matters. 不只是钱关系重大。 76 | Tom just never should've done that. 汤姆就不该做那事。 77 | I don't want there to be any misunderstanding. 我不想有任何误会。 78 | He may not be happy. 他可能不高兴。 79 | The hunter shot the fox dead. 獵人射殺了狐狸。 80 | He goes to school to study every day. 他每天去学校学习。 81 | I spent last Sunday reading a novel. 我上週日花時間看了一本小說。 82 | Mary closed the door quietly. 瑪麗悄悄地關上了門。 83 | You smoke far too much. You should cut back. 你吸太多煙了。你應該少抽一點。 84 | I wish I could have spoken Spanish. 要是我會說西班牙語就好了。 85 | There's nothing I wouldn't do for Tom. 没有我不会给汤姆做的事。 86 | Tom was making French fries. 汤姆想做炸薯条。 87 | I study for 3 hours every day. 我每天讀書三個小時。 88 | He was home alone at the time. 他当时一个人在家。 89 | I'm tired of listening to his boasts. 我厭倦了聽他吹噓。 90 | Because of heavy snow, the plane from Beijing arrived 20 minutes late. 由於下大雪,從北京来的班機延遲二十分鐘。 91 | You may use my new car. 你可以使用我的新車。 92 | Where is the vodka? 伏特加在哪里? 93 | If a person has not had a chance to acquire his target language by the time he's an adult, he's unlikely to be able to reach native speaker level in that language. 如果一個人在成人前沒有機會習得目標語言,他對該語言的認識達到母語者程度的機會是相當小的。 94 | He hurried so he wouldn't miss the train. 汤姆加紧步伐以不错过火车。 95 | Don't go out after dark. 天黑以後不要出門。 96 | They arrived here safely yesterday. 他們昨天平安抵達這裡。 97 | None of those books are useful. 這些書裡沒有一本是有用的。 98 | I just said something very stupid. 我刚说了很蠢的话。 99 | If I were you, I'd want to know what Tom is doing right now. 
如果我是你,我不会想去知道Tom现在正在做什么。 100 | I often went fishing with him. 我經常和他去釣魚。 101 | I hope we find Tom. 我希望我们能找到汤姆。 102 | You used to be able to see the church from here. 你以前可以從這裡看到教堂。 103 | Can we do it? 我们能做到吗? 104 | I've never told anyone about this. 我没跟任何人说过这个。 105 | One thousand dollars will cover all the expenses for the party. 1000美元将负担聚会的全部费用。 106 | Fire is always dangerous. 火總是危險的。 107 | The meeting was called off. 会议取消了。 108 | They welcomed me warmly, so I felt at home. 他们这么热情的欢迎我,让我感觉家人一样。 109 | Tom pulled Mary out of the water. Tom 把Mary拉出水 110 | I have class tomorrow. 我明天有课。 111 | Tom fell asleep in class. 汤姆在课堂上睡着了。 112 | Tom can't get his car started. 汤姆没法发动他的车。 113 | This is a very beautiful flower. 這是一朵非常美麗的花。 114 | Best wishes from all of us. 我们所有人都祝福你。 115 | My aunt gave me a camera. 我的阿姨給了我一台攝影機。 116 | I don't understand music. 我不懂音乐。 117 | I wish I was young again. 我希望我再年輕一次。 118 | I'm not guilty of anything. 我没有对任何事感到有罪。 119 | A bird can fly. 鸟会飞。 120 | His office is near the train station. 他的辦公室離車站很近。 121 | A swarm of hornets attacked the children. 一窩黃蜂襲擊了孩子們。 122 | She stirred her coffee with a teaspoon. 她用茶匙搅她的咖啡。 123 | When are you going to quit smoking? 你何時要戒煙? 124 | He ran into the room. 他跑进房间内。 125 | What are we having for dinner? 我们晚饭吃什么? 126 | You'll get lost. 你会迷路的。 127 | My cell phone has a built-in digital camera. 我的手機有內建的數位相機。 128 | Hold your horses, young man. 別那麼猴急,年輕人。 129 | Six divided by two equals three. 六除以二得三。 130 | I am able to drive a car. 我會開車。 131 | I could answer all the questions. 我可以回答所有问题。 132 | Can you do bookkeeping? 你會記帳嗎? 133 | They're all the same size. 他們都是一樣的大小。 134 | What do you two do for fun? 你们两个做了什么有趣的事情呢? 135 | No one lives in this building. 没有人住在这栋楼里。 136 | I'm sick and tired of hamburgers. 我對漢堡感到厭煩了。 137 | They married when they were young. 他們在還很年輕的時候就結婚了。 138 | I would have liked to come with you, but I didn't have time. 我想和你一起去,但是我没有时间。 139 | Academic fraud is more common than you might think. 学术造假比你想象的普遍。 140 | That's exactly what I wanted to see happen. 这刚好是我想看到它发生的。 141 | I'm afraid there isn't any coffee left. 恐怕已經沒有咖啡了。 142 | While I was reading in bed last night, I fell asleep with the light on. 我昨晚在床上看书的时候点着灯就睡了。 143 | He'll be back home soon. 他很快就會回家。 144 | We sometimes swim in the lake. 我們偶爾在湖裡游泳。 145 | Charles Lindbergh made the first solo flight across the Atlantic Ocean in 1927. Charles Lindbergh於1927年成功完成了第一次獨自飛越大西洋。 146 | His shirt was stained with sauce. 他的衬衫被酱汁弄脏了。 147 | I am just going for a walk. 我只是去散散步。 148 | We found the front door locked. 我們發現前門被鎖上了。 149 | He is so heartless. 他是这么的无情。 150 | I'm playing a TV game. 我在玩電視遊樂器。 151 | Let's go now. 我们现在去吧。 152 | Control is everything. 控制就是一切。 153 | Swimming is easy. 游泳很容易。 154 | He held a package under his arm. 他挾著一個包裹。 155 | She looks pretty in that dress. 她穿上那件衣服看起來很漂亮。 156 | Why are you here? 你為什麼在這? 157 | She asked us to leave her alone. 她要求我們別吵她。 158 | Thanks very much for having me to dinner the other night. 谢谢那天晚上请我吃了饭。 159 | After they argued, they didn't speak to each other for a week. 他们争吵后,一周都没有再说话。 160 | Tom slipped out of the classroom. 湯姆溜出了教室。 161 | You ought to ask for your teacher's permission. 你應該請求你的老師允許。 162 | The clock has stopped. 時鐘已經停止了。 163 | When did your friend leave for America? 你朋友是什么时候出发去美国的? 164 | They grow flowers in the garden. 他們在花園裡種花。 165 | She told me an interesting story. 她给我讲了一个有趣的故事。 166 | I received her letter yesterday. 昨天我收到了她的信。 167 | I slept on the bus. 我在公交车上睡觉了。 168 | Stand aside. 
一边站着。 169 | By the year 2020, the population of our city will have doubled. 在2020年以前,我們的城市的人口將增加一倍。 170 | Tom has been really busy recently. 汤姆最近相当忙。 171 | The swimmers were numb with cold. 游泳選手們凍僵了。 172 | Many moons orbit around Saturn. 許多衛星繞著土星運行。 173 | I spent the whole day in reading the novel. 我一整天都在看这本小说。 174 | Volcanic ash disrupted air travel. 火山灰阻礙航運。 175 | I'm surprised that you don't know about their marriage. 我很惊讶,你竟然不知道他们结婚。 176 | Mom, I'm hungry. 妈妈,我肚子饿了。 177 | He earned his living as a teacher. 他以當老師為生。 178 | She loves chocolate, too. 她也喜欢巧克力。 179 | It has been cold since yesterday. 從昨天開始變冷了​​。 180 | She must have been sick. 她一定是生病了。 181 | I'm not very good at swimming. 我不是很擅長游泳。 182 | Switch on the light. I can't see anything. 把灯打开。我什么都看不见了。 183 | Do you drink coffee? 你喝咖啡嗎? 184 | I promise I'll do that tomorrow. 我保證明天就做 185 | Some people never grow up. 有些人永远也长不大。 186 | I will write to you soon. 我會盡快寫信給你。 187 | I'm starving! 我饿死了! 188 | This place has a mysterious atmosphere. 这个地方有一种神秘的气氛。 189 | It takes 165 years for Neptune to orbit around the sun. 海王星繞行太陽一周要花一百六十五年。 190 | The curtain fell. 谢幕了。 191 | I expect you to be punctual. 我期待你能準時。 192 | While reading a book, I fell asleep. 我看书的时候睡着了。 193 | You shouldn't have eaten so much ice cream. 你不應該吃這麼多冰淇淋。 194 | You have a point there. 喔!你提到一個重點了。 195 | Quit gambling. 戒掉赌博吧。 196 | Cows provide us with milk. 奶牛为我们提供牛奶。 197 | She enjoyed herself at the concert. 她在音樂會上玩得很開心。 198 | She gave me a strange look. 她奇怪地看了我一眼。 199 | I am an optimist by nature. 我天生是一個樂觀主義者。 200 | He erased his speech from the tape. 他把他的那段话从磁带里删除了。 201 | Do you have a larger size? 你有比較大的尺寸嗎? 202 | Do they love each other? 他們彼此相愛嗎? 203 | Halley's Comet will come back in 2061. 哈雷彗星將在2061回來。 204 | I'm not living with him anymore. 我不再跟他一起生活了。 205 | My aunt brought me some flowers. 我阿姨給我帶來了一些花。 206 | I don't have a computer at home. 我家没有电脑。 207 | I returned the book to the library. 我把書還給圖書館。 208 | Who can tell the difference? 谁能说清不同点? 209 | I'll eat here. 我會在這裡吃飯。 210 | Tom needed treatment. 汤姆需要接受治疗。 211 | She must still be in her twenties. 她一定還只是二十幾歲。 212 | I wish Tom were my younger brother. 但願湯姆是我的弟弟。 213 | Do you eat out often? 你常常外食嗎? 214 | I can do it alone. 我可以獨自做。 215 | He is very stingy with his money. 他非常吝啬。 216 | Give me a bottle of wine. 給我一瓶葡萄酒。 217 | I really want to speak English fluently. 我很想流利地说英语。 218 | The Oscar ceremonies are Hollywood's biggest extravaganza. 奥斯卡颁奖典礼,是好莱坞最盛大的活动。 219 | Do you read French every day? 你每天读法语吗? 220 | How much is the rent per month? 一個月的租金多少? 221 | The clock stopped. 鐘停了。 222 | Who is to blame for the accident? 誰該為這次事故負責? 223 | I will give you a bike for your birthday. 你生日的时候,我送你一辆自行车。 224 | Deal with them. 解决他们。 225 | She squeezed the juice from many oranges. 她用了許多柳橙來榨汁。 226 | I've never seen that guy before. 我從沒看過那個人。 227 | It's obvious why his stomach hurts. 他為什麼會胃痛的原因很明顯。 228 | I wish I had a better memory. 但願我有好一點的記憶力。 229 | I don't feel like studying English today. 我今天不想学习英语。 230 | Sorry to have kept you waiting. 對不起讓你一直等。 231 | I don't think that she will come. 我不認為她會來。 232 | No student went to the party. 没有学生去参加派对。 233 | The roof was damaged by the storm. 屋頂被暴風雨損壞了。 234 | Today our artificial satellites are revolving around the earth. 今天我们的人造卫星正在环绕地球运转。 235 | He teaches English to his friends. 他對他的朋友教英語。 236 | Have you turned in your report? 你交報告了嗎? 237 | He wants a book to read. 他想找本書來讀。 238 | Sorry it took me so long to write to you. 對不起, 我過了這麼長的時間才回你的信。 239 | They forgot to lock the door. 
他们忘了锁门。 240 | Have you answered that letter yet? 你回信了嗎? 241 | Tom told me he had a right to see it. 汤姆告诉我他有权看它。 242 | She really does like animals. 她确实喜欢动物。 243 | What's the climate there like? 那里的气候怎么样? 244 | I like children. That's why I became a teacher. 我喜欢孩子。这就是为什么我成为了教师。 245 | Here are some pictures. 這裡有一些圖片。 246 | May I have a glass of milk, please? 请问能给我一杯牛奶吗? 247 | I'll be at home in the morning. 我早上會在家。 248 | Will you show me your passport, please? 能否请您给我看一下您的护照? 249 | My father died of lung cancer. 我的父親死於肺癌。 250 | How did you enjoy the concert? 你有多喜歡這場音樂會? 251 | This is exactly what I wanted. 我想要的就是这个。 252 | That's my problem. 這是我的問題。 253 | Please let me go. 請允許我去。 254 | I've missed another chance. 我又失去了一次机会。 255 | I have a cough and a little fever. 我咳嗽,还有点发烧。 256 | I managed to get there in time. 我設法及時到那裡。 257 | The lock is broken. 锁坏了。 258 | Half of these apples are rotten. 這些蘋果的其中一半都爛了。 259 | I changed trains at Tokyo Station. 我在東京站換火車。 260 | She's in the garden planting roses. 她在花园里种玫瑰。 261 | Tom can't tell Mary his real feelings. 汤姆不能把他的真实感受告诉玛丽。 262 | The guests are in the kitchen. 客人們在廚房裏。 263 | Would you like some of those pictures? 你想要那些照片中的一些嗎? 264 | I appreciate your cooperation. 我感謝您的合作。 265 | I like swimming, but I don't like to swim here. 我喜欢游泳,但我不想在这里游。 266 | I don't mind if it's a little cold. 稍微冷一点没关系。 267 | The weatherman says there is a storm on the way. 气象学家说会有暴风雨。 268 | How about a drink after the game? 比賽結束後喝一杯怎麼樣? 269 | After you have read it, give the book back to me. 在你讀完後,把書還給我。 270 | I have a new car. 我有辆新车。 271 | I graduated from high school last year. 去年我從高中畢業了。 272 | Keep going straight. 繼續直行。 273 | Smoking is prohibited. 禁止吸烟。 274 | Get them before they get us. 先下手为强,后下手遭殃。 275 | Tom went into the kitchen and poured himself a cup of coffee. 汤姆进了厨房,给自己倒了杯咖啡。 276 | I think that's awful. 我看那糟透了。 277 | Orange juice, please. 柳橙汁,麻煩你。 278 | I continued singing. 我繼續唱歌。 279 | They rescued the boy from drowning. 他們救了這個落水的男孩。 280 | Tom is on the go day and night. 汤姆日夜兼程。 281 | You must be careful of the traffic when you cross the street. 横穿马路时你要留心。 282 | He plays golf two or three times a month. 他一个月玩两到三次高尔夫。 283 | Tom slept until noon. 汤姆睡到中午。 284 | He doesn't know how to swim. 他不会游泳。 285 | He'll do whatever you ask him to. 他會做任何你要求他做的事。 286 | Whether you like it or not doesn't matter. 你喜不喜欢没关系。 287 | You have no right to interfere in other people's affairs. 你沒有干涉他人事務的權力。 288 | I'll alert Tom. 我会警告汤姆。 289 | No problem. 没关系。 290 | You may take either the big box or the small one. 你可以拿大的盒子或是小的盒子。 291 | Do you know where Tom is waiting for us? 你知道汤姆在哪里等我们吗? 292 | I'm doing my homework. 我正在做我的作业。 293 | Nature is full of mysteries. 自然充满了神秘。 294 | You're barking up the wrong tree. 你白費力氣了。 295 | I'm not letting Tom do this by himself. 我不会让汤姆独自去做。 296 | She would often go to the theater when she was in London. 她在伦敦的时候,曾经常去剧院。 297 | We'll need a head hunting agency to find the right man for this executive position. 我們需要人力仲介公司幫我們找到合適的人來擔任這個管理職位。 298 | Can anyone believe you? 誰會相信你? 299 | He is making great progress in English. 他的英語有很大的進步。 300 | My father has been engaged in foreign trade for many years. 我父亲经营外贸多年。 301 | She has lost weight. 她的体重减轻了。 302 | I'm tired of your complaints. 我已經厭倦了你的投訴。 303 | I don't want to have an operation. 我不想接受手術。 304 | I like this color as well. 我也喜歡這顏色。 305 | I need to try. 我需要尝试。 306 | He left his umbrella on the bus. 他把雨伞忘在公交车上了。 307 | My aunt made me a new skirt. 我阿姨做了一條新裙子給我。 308 | She used to go to the movies on Sundays. 
她從前會在星期天去看電影。 309 | Everyone knew Tom was the one who did it. 大家都知道是汤姆做的 310 | You might not like this beer at first. It's an acquired taste. 剛開始的時候你可能不會喜歡這個啤酒。這是需要多次品嚐去習慣它的口味。 311 | I am interested in music. 我對音樂有興趣。 312 | I don't get what you mean. 我不明白你的意思。 313 | Where in Turkey do you live? 你在土耳其哪儿生活? 314 | She showed us a beautiful hat. 她給我們看了一頂漂亮的帽子。 315 | He didn't come to the last meeting. 他最後一場會議沒來。 316 | It can be dangerous. 它有危险。 317 | I have no intention of asking him. 我不想問他。 318 | I agreed to the proposal. 我同意這項建議。 319 | It is like looking for a needle in a haystack. 这好比大海捞针。 320 | Summer has ended. 夏天已經結束。 321 | It's a very sad story. 这是一个非常悲伤的故事。 322 | We're out of stock now. 我们现在缺乏库存。 323 | Turn off the television. I can't concentrate. 把電視關掉。我無法專心。 324 | We need some more coffee. 我們需要多一點咖啡。 325 | Most boys like baseball. 大部分男生喜欢棒球。 326 | You should have told me a long time ago. 很久以前你就應該告訴我的。 327 | Swimming is good for your health. 游泳對你的健康很好。 328 | This is Uncle Tom's farm. 這是湯姆叔叔的農場。 329 | "Pass me the salt, please." "Here you are." “请把盐递给我。”“拿着。” 330 | You are a student. 你是學生。 331 | May I ask how old you are? 请问您老人家高寿? 332 | I've finished watering the flowers. 我已經澆完花了。 333 | Tom wanted Mary to say that she loved him. 汤姆想让玛丽说爱他。 334 | Please answer this question for me. 請回答我這個問題。 335 | Does she know your telephone number? 她知道您的電話號碼嗎? 336 | The picture looks better at a distance. 这幅画远看更好。 337 | I invited them to the party. 我邀請了他們參加派對。 338 | He returned from China. 他从中国回来了。 339 | The accident seemed to have something to do with the heavy snow. 事故似乎和厚厚的积雪有关。 340 | Do we have anything I can snack on? 我們有任何我可以當零食吃的東西嗎? 341 | I think his opinion is very important. 我認為他的意見非常重要。 342 | I am looking forward to Christmas. 我期待聖誕節的到來。 343 | Do you like to cook Japanese foods? 你喜歡煮日本料理嗎? 344 | Why should it be different? 为什么它应该要变得不一样? 345 | Computers are certainly playing an important role in our life, whether we like it or not. 无论我们是否喜欢电脑,它在我们的生活中始终起着重要的作用。 346 | Tom was all worn out. 湯姆完全筋疲力盡了。 347 | All the boys in class worked hard. 課堂上所有的男生都很用功。 348 | Studying abroad is very common now. 現在出國留學是很常見的了。 349 | Tom burst into tears. 汤姆泪流满面。 350 | Why do I have to do that? 我为什么一定要那么做? 351 | That bicycle is too small for you. 那輛腳踏車對你來說太小了。 352 | Do you know how to recover a deleted file? 你知道怎么恢复已删除的文件吗? 353 | Keep the dog out. 别让狗进来。 354 | I don't like novels without heroes. 我不喜欢没有英雄的小说。 355 | Is there a zoo in the park? 公园里有动物园吗? 356 | Tom suggested that I change the lock on my door. 汤姆建议我换我的门锁。 357 | I am very, very sorry. 我非常,非常抱歉。 358 | The best hairdressers are gay. 最好的理发师是同性恋。 359 | I love elderberry juice. 我愛接骨木果汁。 360 | She studied English in the morning. 她上午學習英語。 361 | The death penalty had been done away with in many states in the USA. 死刑在美国的很多州都被废除了。 362 | I understand perfectly. 我完全明白。 363 | What do you think of him? 你覺得他怎麼樣? 364 | Tom is an extraordinary man. 汤姆是个不寻常的人。 365 | The phone is ringing. 電話正在響。 366 | Could you check the tire pressure? 你能檢查一下這個輪胎的氣壓嗎? 367 | The charge for a front row seats is 5 dollars. 第一排的座位5美元。 368 | Obey your teachers. 要听老师的话。 369 | No words can express her deep sorrow. 她的悲伤无法言喻。 370 | His house was small and old. 他的房子又小又旧。 371 | You must get up a little earlier. 你该早一点起床。 372 | Please show your ticket. 请出示您的票子。 373 | He took a week off. 他休了一周的假。 374 | Could you please not smoke in this room? 请问你能不在房间里吸烟吗? 375 | I'm sorry for the late response. 我很抱歉回复晚了。 376 | My brother and I are in the same class. 
我的兄弟和我在一个班级。 377 | There are about forty students in her class. 她班上大约有40个学生。 378 | I really want to know what's going on. 我真想知道发生了什么。 379 | The shell of an egg is easily broken. 蛋壳容易破碎。 380 | You've got a lot of guts. 你膽子很大。 381 | It's said that she loves him. 據說她愛他。 382 | I didn't like it. 我没有喜欢过。 383 | What do you have the first period? 你第一節課上什麼? 384 | He is now either in Rome or in Paris. 他不是在羅馬,就是在巴黎。 385 | He is a doctor by profession. 他的职业是医生。 386 | I'd like to be a guitarist. 我想要成为吉他手。 387 | Starting tomorrow, it's going to snow for a week. 雪从明天开始下,一直持续一个星期。 388 | My bicycle has a flat tire. 我的腳踏車輪胎沒氣了。 389 | Of course she can speak English. 她當然會講英語。 390 | Take things a little more seriously. 對事情比較正經嚴肅。 391 | Could I have a pillow and blanket? 給我一個枕頭和毛毯好嗎? 392 | Get out of here. 離開這裡。 393 | Stir the soup. 搅一下汤。 394 | We're almost broke. 我們快破產了。 395 | How many apples do you want? 你要多少個蘋果? 396 | There's a telephone in my room. 我的房間裡有一支電話。 397 | What prevented you from coming earlier? 為什麼你不能早點來? 398 | I don't need your help. 我不需要你的幫助。 399 | I'm at the airport now. 我现在在机场。 400 | Haven't we met before? 我们以前没见过吗? 401 | That's my final answer. 这是我的最终回答。 402 | I'm tired of her complaints. 我厭倦了她的抱怨。 403 | You must keep quiet for a few days. 你该安静几天。 404 | He asked her where her mother was. 他問她她的母親在哪裡。 405 | This law is applicable to all cases. 此法適用於所有情況。 406 | I want to go with you. 我想和你一起去。 407 | It's healthy to breathe deeply. 深呼吸有益健康。 408 | When she saw that they had no schools, she started one. 当她发现他们还没有学校,她就办了一个。 409 | I'm sure you'll love what we have on the menu tonight. 我肯定你会喜欢我们今晚的菜肴。 410 | Have you taken your medicine yet? 您已经吃过药了吗? 411 | The couple is walking hand in hand. 這對夫妻手牽手走路。 412 | There was a car accident yesterday. 昨天發生了一場車禍。 413 | Life is like a box of chocolates. 生活就像一盒巧克力。 414 | If she was displeased, she never showed it. 如果她不高興, 她從來不表現出來。 415 | I saw him running. 我看見了他跑步。 416 | Stick a stamp on the envelope. 把郵票貼在信封上。 417 | Do you want to know why I quit? 您想知道为什么我要离开吗? 418 | Who was in charge of today's party? 誰負責今天的派對? 419 | Is there an app for that? 有它的应用吗? 420 | If a sick person folds one thousand paper cranes, her wish will come true. 如果一個病人折一千隻紙鶴, 她的願望就會成真。 421 | Never forget to put out the fire. 永遠不要忘記關火。 422 | Do you plan to stay long? 你打算長時間停留嗎? 423 | I haven't seen Tom since 1988. 我從1988年起就沒有看過湯姆了。 424 | We made it out of there. 我們從那裡逃了出來。 425 | My father works at the factory. 我父亲在工厂工作。 426 | I usually go to bed before ten. 我通常在十點前上床睡覺。 427 | What is my room number? 我的房间号是多少? 428 | The man got away from the city. 這名男子逃離了這個城市。 429 | I don't blame you for doing that. 我不怪你那樣做。 430 | I missed you. 我想你。 431 | I'll give you anything that you want. 我會給你任何你想要的東西。 432 | I know the girl playing tennis. 我認識這個打網球的女孩。 433 | Thank you for inviting me to dinner. 谢谢你邀请我吃饭。 434 | She greeted us with a smile. 她面帶微笑向我們打招呼。 435 | Entering the house, I tripped over the mat. 进家门后,我被垫子绊倒了。 436 | If we knew what we were doing, it wouldn't be called research, would it? 如果我们知道我们在做什么,那么这不能称之为研究,是吗? 437 | The bookcase is level with the table. 书架和桌子齐平。 438 | There's too much salt in this soup. 这汤里盐放多了。 439 | You're an optimist. 你是个乐观主义者。 440 | The box is too heavy to carry. 這個箱子太重了無法攜帶。 441 | Tom was there this morning. 汤姆早上去过那里。 442 | I don't understand what he said. 我不明白他说的话。 443 | I treated her as my own daughter. 我把她當成是我自己的女兒一樣對待。 444 | We haven't finished eating the watermelon yet. 我們還沒吃完這個西瓜。 445 | He did not know what to say. 他不知道说什么好。 446 | Have you heard from him? 你收到他的音訊了嗎? 
447 | I'll give you a little tip. 我会给你些提示。 448 | The oranges in this bag are rotten. 這個袋子裡的柳橙都爛了。 449 | There is no point in pretending to be sick. 装病是没用的。 450 | He wrote a book about a jungle adventure. 他写了一本关于丛林冒险的书。 451 | Both you and I are men. 你和我都是男人。 452 | The boy made fun of the girl. 這個男孩取笑了這個女孩。 453 | Second semester has ended. 第二學期結束了。 454 | Maybe you could draw me a picture. 你可以给我画张画。 455 | She cut the apple with a knife. 她用刀子切蘋果。 456 | This house belongs to my uncle. 這棟房子屬於我的叔叔。 457 | That car belongs in a museum. 那辆车属于一家博物馆。 458 | You can borrow my car anytime. 你隨時可以借用我的車。 459 | He is one of my neighbors. 他是我的一個鄰居。 460 | I carried the box on my shoulder. 我把盒子扛在肩上。 461 | Is this snake safe to touch? 摸這條蛇安全嗎? 462 | They told me it was your fault. 他们告诉我这是你的错。 463 | Everybody knows that he is honest. 大家都知道他是誠實的。 464 | I just don't want to lose you. 我只是不想失去你。 465 | I told them to send me another ticket. 我請他們再寄給我一張票。 466 | I saw it in the newspaper. 我在报纸上看到了它。 467 | Tom finally understood it. 汤姆最终明白了。 468 | The teacher pointed her finger at me and asked me to come with her. 教师用手指指着我,要我跟她走。 469 | Have you ever broken your glasses? 你摔坏过你的眼镜吗? 470 | Don't shout at me. 別對著我吼。 471 | News of the recent blast is all over the radio. 收音機廣播充斥著有關最近爆炸的新聞。 472 | Don't you miss anything? 难道你不想念什么吗? 473 | They were not listening to music. 他們沒在聽音樂。 474 | I do not drink coffee. 我不喝咖啡。 475 | Everybody wants to sit beside her. 大家都想坐在她旁边。 476 | You may think those shoes are in fashion, but they aren't. 你可能認為那些鞋子很時髦, 但是他們不是。 477 | He picked flowers for her. 他为她摘了些花。 478 | I can do it in a week. 我可以在一週內做。 479 | You're not satisfied, are you? 你并不满意,对吧? 480 | She weeded the garden. 她给花园除了草。 481 | Afraid of hurting his feelings, I didn't tell him the truth. 怕傷害了他的感情,我沒有告訴他真相。 482 | Tom is a hick. 汤姆是个乡巴佬。 483 | I don't want to miss my train. 我不想错过我的火车。 484 | I want to go to Seattle. 我想去西雅图。 485 | Tom has forgotten how to do that. 汤姆忘了怎么做。 486 | The army forced him to resign. 军队强迫他辞职。 487 | How about going to see a movie tonight? 今晚看电影怎么样? 488 | I need it ASAP. 我尽快需要。 489 | I'm in a bad mood today. 我今天的心情不好。 490 | In the winter, many older people slip on ice and fall down. 在冬天,很多老人在冰上滑倒。 491 | Something might have happened to her. 她可能出什么事了。 492 | Can I see that one? 我能看那個嗎? 493 | There's enough time for a quick snack. 有足夠的時間很快地吃一下點心。 494 | I stayed at home last night. 我昨晚待在家裡。 495 | I don't have the address now. 我沒有現在的地址。 496 | Do you know me? 你还认识我吗? 497 | I remember the first time. 我记得第一次。 498 | I don't think he'll say yes. 我不認為他會說好。 499 | Tom doesn't really love Mary. 汤姆不是真的爱玛丽。 500 | Money talks. 金钱万能。 501 | He hid himself behind the door. 他把自己藏在門後面。 502 | We imported meat from Argentina. 我们从阿根廷进口了肉类。 503 | Christmas is soon. 聖誕節快到了。 504 | I'm in love with her. 我愛上她了。 505 | I ran away in a hurry. 我趕快跑走了。 506 | With the weather getting worse, the departure was put off. 由于天气变差,出发延迟了。 507 | Tom's house has a nice garden. 湯姆的住宅有一個不錯的花園。 508 | I was invited to their wedding. 我被邀請參加他們的婚禮。 509 | I lost. 我迷失了。 510 | I quit smoking two years ago. 我兩年前戒菸了。 511 | Apart from some fruit, he hasn't eaten anything. 除了水果,他什么都没吃。 512 | Why do you need this money? 你為什麼需要這筆錢? 513 | These fireworks are spectacular! 这些焰火真壮观! 514 | He wants more. 他想要更多。 515 | Passengers should board the train now. 乘客应该现在下火车了。 516 | When was this university founded? 这所大学是什么时候建的? 517 | She dived into the swimming pool. 她跳入了游泳池。 518 | You should bring your passport to the bank. 你应该带护照去银行。 519 | I like oranges better than apples. 
我喜歡橘子勝過蘋果。 520 | I got a farewell present from everyone. 每個人都送了我一份歡送禮物。 521 | Which tooth hurts? 哪顆牙痛? 522 | Tom didn't have enough money to pay the rent. 汤姆没有足够的钱付租金。 523 | I'm thirsty. 我渴了。 524 | I don't have a prejudice against foreign workers. 我对外籍员工没有偏见。 525 | I see her sweeping the room. 我看见她在打扫房间。 526 | Are you angry? 您生气了吗? 527 | Please pass me the butter. 请把黄油递给我。 528 | I can't do any more than this. 我無法再做下去了。 529 | Let's take the children to the zoo. 讓我們帶孩子們去動物園。 530 | I majored in chemistry at the university. 我在大學主修化學。 531 | He broke the window on purpose. 他故意打破了窗戶。 532 | I always study hard. 我總是用功讀書。 533 | The talks will last three days. 这场谈话将要持续三天。 534 | Please don't tell your parents this. 请不要告诉你父母。 535 | Dog is man's best friend. 狗是人类最好的朋友。 536 | We'll begin work soon. 我們立即開始施工。 537 | In a way you are right, but I still have doubts. 在某种程度上你是对的,但我还是有疑问。 538 | I wrote this book. 我写了这本书。 539 | He told the students to be quiet. 他告訴了學生要安靜。 540 | Didn't you lock up your car? 你没有把你的车上锁吗? 541 | What is the name of this river? 這條河叫什麼名字? 542 | She had died before I arrived. 她在我到達之前去世了。 543 | Take good care of yourself. 照顾好你自己。 544 | My underpants are wet. 我的內褲是濕的。 545 | I don't need you or anybody else. 我不需要你或别的人。 546 | He knows who they are. 他知道他们是谁。 547 | Tom can't afford this. 汤姆买不起这个东西。 548 | Tom didn't want to disappoint Mary. 汤姆不想让玛丽失望。 549 | It was very kind of you to lend me an umbrella. 你借给我伞真好。 550 | He put on the red jacket. 他穿上了紅色的外套。 551 | He offered his help to us. 他想我们提供了帮助。 552 | The moon is the earth's only satellite. 月球是地球唯一的卫星。 553 | April showers bring May flowers. 四月春雨,五月花。 554 | I went to sleep about 10 o'clock. 我在10點左右去睡覺。 555 | I went to a park this morning. 今天早上我去了公园。 556 | What's your name? 您叫什么名字? 557 | Are you afraid of Tom? 你會怕Tom嗎? 558 | We have less than five minutes to evacuate the whole building. 我们有不到五分钟来疏散整栋楼的人。 559 | She has a real knack for getting people to do what she wants. 她真的有本事让别人做她想做的事。 560 | Talking during a concert is rude. 在音樂會中說話是不禮貌的。 561 | You can hold my hand. 你能握我的手。 562 | He will arrive in Paris tomorrow. 他明天將抵達巴黎。 563 | This conference is very important. Don't miss it. 這場會議很重要,不要錯過了! 564 | Her success made her the target of jealousy. 她的成功让她成为嫉妒心的靶子。 565 | He plays the guitar very well. 他吉他彈得很好。 566 | I am disgusted with him. 我厌恶他。 567 | It is a pity that you cannot come to the party. 很遗憾您不能来派对。 568 | You should quit smoking. 你應該戒菸。 569 | What is learned in the cradle is carried to the tomb. 幼年時學的東西,一輩子不會忘記。 570 | He's eating an apple. 他正吃著一個蘋果。 571 | She is always losing her handkerchief. 她老是弄丢她的手帕。 572 | Please return the book by tomorrow. 請明天前還書。 573 | I'm proud of you. 我以你為榮。 574 | She spends her leisure time making dolls. 她利用空闲时间做布娃娃。 575 | Many kinds of birds live in Japan. 許多種鳥類住在日本。 576 | I wrote a letter to my mother. 我寫了一封信給我的母親。 577 | You and I have the same idea. 你和我有相同的想法。 578 | She should have been more careful. 她本來應該更小心的。 579 | I hope everything goes well. 我希望万事如意。 580 | I wish I could talk to you. 我希望能与你谈话。 581 | I would rather not go there alone. 我寧可不要單獨去那裡。 582 | You talk too much. 你說太多了。 583 | You must keep your eyes open. 你得留意一下。 584 | Nobody tried to help Tom. 没人试图帮汤姆。 585 | You two should get married. 你们两个应该结婚。 586 | Tom is probably pretty rich by now. 汤姆现在可能很有钱。 587 | How many does he want? 他要多少? 588 | Thank you for drawing a bird for me. 謝謝你為我畫鳥。 589 | I'm looking for a part-time job. 我正在找一份兼職的工作。 590 | I can read German, but I can't speak it. 我能看懂德语,但不会说。 591 | Something must be wrong with the machinery. 
這個機器一定有什麼地方不對勁。 592 | I want beef, too. "我也要牛肉。" 593 | I'd rather stay than go. 我寧願待在這裡而不去。 594 | I had a tennis match with him. 我曾與他的比賽網球。 595 | He's a comedian. 他是喜剧演员。 596 | India is a developing country. 印度是发展中国家。 597 | He is old enough to drive. 他夠開車的年紀了。 598 | I arrived in London. 我到達倫敦了。 599 | I saw five men. 我看到了五個男人。 600 | I got up early in the morning. 我早上很早起床。 601 | I hate chemistry. 我讨厌化学。 602 | May I ask you some more questions? 我可以問你一些問題嗎? 603 | I don't understand. 不明白。 604 | May I say something? 我可以說些什麼嗎? 605 | What is this? 這是什麼啊? 606 | I am going to Hawaii next year. 我明年要去夏威夷。 607 | Don't throw rocks into the river. 不要往河裡扔石頭。 608 | I've had enough of your lying. 我听够你的谎话了。 609 | We had an early lunch at school. 我們在學校有一個早午餐。 610 | The road is too narrow for cars. 這條路太窄汽車無法通行。 611 | I do not agree with you at all. 我完全不赞成你的意见。 612 | Are you saying you don't want to go to Tom's party? 你的意思是你不想去汤姆的派对? 613 | Tom knew when Mary would be arriving. 汤姆知道玛丽什么时候来。 614 | Pride goes before a fall. 骄傲使人落后。 615 | Why did you turn down his offer? 你為什麼拒绝了他的提議? 616 | Japan is a rich country. 日本是个富有的国家。 617 | I can't stand that noise. 我不能忍受那個噪音。 618 | You'd better go. 你最好走。 619 | Turn off the TV. 关闭电视机 620 | I don't want to work. 他不爱劳动。 621 | I hope to see you. 我希望能见到你。 622 | Tom got a late start this morning. 汤姆今早出发晚了。 623 | He is not from Hokkaido. 他不是來自北海道。 624 | What subjects are you taking at school? 你在學校裡讀哪些科目? 625 | Get Tom. 找到汤姆。 626 | What is the total amount of money you spent? 你一共花了多少钱? 627 | He likes taking care of the garden. 他喜歡照顧花園。 628 | Her red dress made her stand out. 她的紅色禮服使她引人注目。 629 | At last, she solved the problem. 最後,她解決了這個問題。 630 | He made many excuses for being late. 他為遲到找了很多的藉口。 631 | Will you all be here tomorrow? 明天你一整天都會在這裡嗎? 632 | She gives me a nasty look every time she sees me. 她每次看見我都給露出厭惡的眼神。 633 | Close the door when you leave. 出去的时候把门关上。 634 | They passed by her house yesterday. 他們昨天路過她家。 635 | Excuse me, I'm lost. 不好意思,我迷路了。 636 | This is what I can do for you. 这是我能为您做的。 637 | We gladly accept your offer. 我们很高兴接受你的提议。 638 | I want to drink something cold. 我想喝冷飲。 639 | You look tired. 你看起來很疲倦。 640 | I'll go shopping tomorrow. 我明天要去购物。 641 | I would like to know her name. 我想知道她的名字。 642 | My father has five siblings. 我父親有五個兄弟姐妹。 643 | She glanced briefly at the newspaper. 她很快地瞟了一眼報紙。 644 | The parking lot is free of charge. 停车场是免费的。 645 | I will pick you up around six. 我會在六點鐘左右接你。 646 | We don't have any other choice. 我们别无选择。 647 | The sun rose over the horizon. 太陽升出了地平線。 648 | Do you feel tired? 你覺得累嗎? 649 | A green carpet will not go with this blue curtain. 绿色的毯子和这条蓝色的帘子不配。 650 | Maybe you'll succeed. 也許你會成功。 651 | I feel very sorry for your sister. 我對你姐姐感到非常遺憾。 652 | That red dress suited her. 那件紅色的洋裝適合她。 653 | You need a car if you live in the suburbs. 要住在郊区的话,汽车是必需的。 654 | How come you didn't say anything? 你為什麼都不說話? 655 | I meet him at the club. 我在俱樂部見到他。 656 | Today was fun. 今天很有趣。 657 | I don't know how to go there. 我不知道怎么去那儿。 658 | Give us two knives and four forks, please. 请给我们两把刀和四把叉。 659 | She made him a new suit. 她為他做了一套新衣服。 660 | I want to see your older sister. 我想見你姐姐。 661 | Her argument was not based on facts. 她的观点没有基于事实。 662 | The cost of living has risen. 生活費升高了。 663 | Don't show your face around here again. 你不要再出現在這裡了。 664 | I want you to stay here until I get back. 我想你待在这里直到我回来。 665 | Do you know what I mean? 你知道我的意思嗎? 666 | The accident's only survivor was a baby. 这场灾难的唯一幸存者是一个婴儿。 667 | It's just what I wanted. 
我想要的就是这个。 668 | How about playing chess tonight? 今晚下棋怎麼樣? 669 | Actions speak louder than words. 行動勝於雄辯。 670 | Tom is waiting for everyone to leave. 汤姆等着每个人都离开。 671 | I don't mind hot weather. 我不在乎炎熱的天氣。 672 | He is always losing his umbrella. 他總是搞丟了他的傘。 673 | My birthday falls on Sunday. 我的生日在星期日。 674 | He plays baseball every day. 他每天都打垒球。 675 | Hokkaido is very far, isn't it? 北海道很遠,不是嗎? 676 | She's neither rich nor famous. 她既没钱,也不出名。 677 | I went skiing at Zao last winter. 我去年冬天去藏王滑雪。 678 | Please show me the menu. 請給我菜單。 679 | Give it to them. 把它给他们。 680 | I can't stand this heat. 我受不了這麼熱。 681 | Does this bus go to the beach? 這輛公車去海灘嗎? 682 | Please show me around. 請帶我到處看看。 683 | Take as many peaches as you like. 你想拿多少桃子就拿多少。 684 | I heard the song sung in French. 我听过这首歌的法语版。 685 | She's absent because she's sick. 她不在是因为病了。 686 | The problem was very difficult. 這個問題非常困難。 687 | I live in this neighborhood. 我住在這附近。 688 | You're a real friend. 你是個真正的朋友。 689 | The robot went out of control. 這個機器人失控了。 690 | If it happened to Tom, it could happen to you. 如果那發生在Tom身上,也可能會發生在你身上。 691 | He goes to school on foot. 他步行上学。 692 | I just talked to the person in charge. 我刚跟主管人谈过。 693 | He's an excellent brain surgeon. 他是一個優秀的腦外科醫生。 694 | Under no circumstances must you leave the room. 在任何情況下你都不能離開這個房間。 695 | Please call me up between seven and eight. 请在7点到8点钟之间打电话给我。 696 | They became friends in elementary school. 他們在小學時就是朋友了。 697 | They live across the river. 他們住在河對面。 698 | He is alone. 他獨自一人。 699 | He is always at home on Mondays. 他星期一總是在家。 700 | Give me some milk, too. 也給我一些牛奶。 701 | He made friends with her in America. 他與她在美國成為朋友。 702 | Are you planning to take part in the meeting? 你们准备参加会议吗? 703 | I'm sure that it'll be fun. 我确定它将会很有趣 704 | You should pay more attention to his warnings. 你應該多注意他的警告。 705 | His mother looks young for her age. 他媽媽看起來比實際年齡年輕。 706 | It happened at a quarter past eleven. 它發生在十一點一刻。 707 | I know that it is highly unlikely that anyone knows me. 我知道有人认识我的可能性微乎其微。 708 | We kept quiet. 我们保持了沉默。 709 | What's she doing? 她在做什麼? 710 | This is not a sentence. 這不是一個句子。 711 | Cover your head when you are in the sun. 當你在陽光下的時候,遮住你的頭。 712 | She believes that he is innocent. 她相信他是無辜的。 713 | The taxi has arrived. 出租车到了。 714 | Why did you paint the bench red? 为什么你把长凳漆成红色了? 715 | I'd like a window seat, please. 請給我靠窗口的位子。 716 | She always gets her own way. 她總是隨心所欲。 717 | He rushed out of the office. 他急忙出了办公室。 718 | Lansing is the state capital of Michigan. 蘭辛是密西根州的首府。 719 | He has no interest in politics. 他對政治沒有興趣。 720 | What I'm about to say is strictly between you and me. 我要说的只能是你知我知。 721 | He thanked me for coming. 他感謝我的到來。 722 | I threw a stone at the bird. 我扔了块石头打向鸟儿。 723 | I didn't get along with her. 我沒有和她相處過。 724 | We didn't break in. 我们没有打断。 725 | How did you draw this picture? 你怎么画这幅画? 726 | That's too bad. 那太糟糕了。 727 | He lives somewhere about here. 他住在這附近某個地方。 728 | The deadline is approaching. 期限近了。 729 | He was caught cheating in the exam. 他在考試中作弊時被抓到了。 730 | It is imperative for you to act at once. 您必须马上行动。 731 | I want to see him very much. 我非常想见到他。 732 | He likes to cook for his family. 他喜歡為家人做飯。 733 | I'm currently a teacher at this school. 我现在在这所学校任教。 734 | I was surprised at his strong resemblance to his father. 他像极了他父亲,这让我震惊。 735 | Her mother always accompanies her. 她母亲一直陪着她。 736 | We set a trap to catch a fox. 我们设了个陷阱来抓狐狸。 737 | Do I have to stay in the hospital? 我必須留在醫院嗎? 738 | Eighty percent of all information on computers around the world is in English. 
全世界百分之八十電腦上的資訊都是用英語寫的。 739 | She really wants to go. 她特别想去。 740 | Can I speak to the head nurse? 我能跟護士長說話嗎? 741 | He didn't show up at the party. 他沒有在派對上出現。 742 | Please wrap it like a Christmas present. 請把它包裝得像一個聖誕禮物。 743 | Alcohol consumption is increasing every year. 酒的消费每年都在上升。 744 | Monopoly is a popular game for families to play. 大富翁是一個家庭玩的熱門遊戲。 745 | Don't beat around the bush. 不要拐彎抹角。 746 | I have eight brothers and sisters. 我有八個兄弟姐妹。 747 | Hurry, and you will catch the train. 快一点,你就能赶上火车了。 748 | No one voted against it. 没有人投反对票。 749 | She asked him questions. 她問了他問題。 750 | I am quite all right now. 我一切都很好。 751 | I can play Chopin. 我會彈蕭邦。 752 | He was wounded in the war. 他是在战争中受伤的。 753 | I don't like the taste of onions. 我不喜歡洋蔥的味道。 754 | He cannot afford to buy a car, much less a house. 他买不起一辆汽车,更不要说一套房子了。 755 | Don't hesitate to ask questions. 不要猶豫去問問題。 756 | Aren't you Tom? 你不是湯姆嗎? 757 | He's stronger than you. 他比你強壯。 758 | Your assistance is indispensable for us. 您的帮助对我们来说是必不可少的。 759 | Why am I still here? 为什么我还在这里? 760 | Watching TV is a passive activity. 看電視是一種被動的活動。 761 | What have you come here for? 你为什么来这儿? 762 | I want to ask you some questions about Tom. 我想问你一些关于汤姆的问题。 763 | Would you pass the salt, please? 請你把鹽遞過來好嗎? 764 | There is a television in the room. 房里有个电视机。 765 | Could you please speak a little bit more slowly? 能否請你說慢一點? 766 | I forgot the date of the meeting. 我忘了會議的日期。 767 | What is love? 愛是什麼? 768 | My legs still hurt. 我的腿还是很痛。 769 | Between you and me, I don't like our new team captain. 我就只告诉你,我不喜欢我们的新队长。 770 | Are you really going to London to study? 你要去伦敦读书是真的吗? 771 | He has already had lunch. 他已经吃过午饭了。 772 | I owe him 1,000 dollars. 我欠他1000美元。 773 | I don't have to clean my room. 我不用打扫房间。 774 | He is not an American. 他不是美国人。 775 | Turn off the gas. 把煤气关了! 776 | I have met him before. 我以前见过他。 777 | It's not that easy to learn a new language after fifty. 五十岁以后学一门新的语言不是那么容易。 778 | It's impossible to get there by noon. 中午到達那裡是不可能的。 779 | The teacher demonstrated the idea with an experiment. 这位老师用试验论证了这个想法。 780 | Everyone noticed. 所有人都注意到了。 781 | He plays tennis very well. 他打网球打得很好。 782 | That's good, isn't it? 那太好了,不是嗎? 783 | Get me a ticket, please. 請給我一張票。 784 | You only gave me fifty cents. 你只給了我五十美分。 785 | That's an interesting idea. 那是个有趣的主意。 786 | I clapped my hands. 我拍手。 787 | You could've answered that question. 你本可以回答那问题。 788 | What's your marital status? 能告诉我你的婚姻状况吗? 789 | I do not remember seeing the letter, but perhaps I read it. 我不記得我看過這封信, 但或許我讀過它。 790 | She might know that we are here. 她或许知道我们在这儿。 791 | I was ashamed of my behavior. 我对自己的行为感到羞愧。 792 | He hasn't answered my letter yet. 他还没回我的信。 793 | This writer is Russian. 这个作家是俄罗斯人。 794 | The cat got through the hedge. 貓從樹籬穿過去。 795 | Will it take long to recover? 復原需要花很長的時間嗎? 796 | I envied his new house. 我羨慕他的新房子。 797 | When does that start? 什么时候开始? 798 | She doesn't have any enemies. 她没有敌人。 799 | Winds from the sea are moist. 从海洋吹来的风感觉湿漉漉的。 800 | Do you have these shoes in my size? 你們這款鞋子有我的尺寸嗎? 801 | I wish you'd go. 我希望你去。 802 | I will stay until tomorrow. 我会待到明天。 803 | I'll prove it to you. 我会给你证明的。 804 | He kept his promise and helped his brothers. 他履行了他的承诺,并且帮助了他的兄弟。 805 | Where in Canada are you from? 你來自加拿大的哪裡? 806 | How about some more roast beef? 再多一些烤牛肉怎麼樣? 807 | Have you ever seen Tokyo Tower? 你曾看過東京鐵塔嗎? 808 | My mother is a good woman. 我媽媽是個好女人。 809 | I was unable to go to his birthday party. 我那时没法去他的生日派对。 810 | Are you planning to help them? 你打算幫助他們嗎? 
811 | What time will you have breakfast? 你什麼時候吃早餐? 812 | Why me? 为什么是我? 813 | This book is beautifully illustrated. 这本书有精美的插图。 814 | The fox and the bear lived together. 這隻狐狸和這隻熊一起生活了。 815 | Do you think I'm crazy? 你认为我疯了吗? 816 | Read it once more, please. 請再讀一次。 817 | It looks like a duck. 它看起來像一隻鴨子。 818 | Are you looking for work? 你是在找工作吗? 819 | My father walks. 我爸爸走路。 820 | Hopefully, we'll enjoy our China trip. 希望我們會喜歡我們的中國之旅。 821 | Why don't you go home? 你为什么不回家呢? 822 | I'm going to check. 我正要去签到。 823 | Tom's uncle keeps a lot of sheep. 湯姆的叔叔養了很多羊。 824 | I'll treat you. 我請你。 825 | Could you take this, please? 請你拿這個好嗎? 826 | He arrived after I had left. 我走之后他到达了。 827 | I need someone to talk with. 我得找人商量一下。 828 | Skip it. 不管它。 829 | Would you like some salad? 你要來點兒沙拉嗎? 830 | I sometimes watch TV. 我有時看電視。 831 | I'm full. 我吃飽了。 832 | He is a learned man. 他是个有教养的人。 833 | I was very busy last week. 上星期我非常地忙。 834 | I will wait. 我會等。 835 | It's too dark to play baseball now. 現在太暗無法打棒球。 836 | Children like fruit juice. 孩子們喜歡果汁。 837 | She wouldn't let up until I agreed to go to the movies with her. 她不愿松开我,直到我同意和她去电影院。 838 | He asked me a question. 他問了我一個問題。 839 | Tom bought a mobile phone. 汤姆买了个手机。 840 | I hope people are satisfied. 我希望人们满意。 841 | My father retired at the age of 65. 我的父亲65岁的时候退休了。 842 | The poor people were at the mercy of the cruel dictator. 可憐的人民任憑殘暴的獨裁者處置。 843 | It's clear that there's a rather strong disagreement between the two. 很明顯的是這兩者之間有很強烈的分歧。 844 | That's no big deal. 那没什么大不了的。 845 | What she said did not make sense. 她說的話沒有道理。 846 | He is good at playing tennis. 他打网球打得很好。 847 | I recognized some of the tunes that Tom played. 我认出了些汤姆演奏的调子。 848 | I lived in Japan three years ago. 我三年前住在日本。 849 | Yesterday I spent the whole day working. 昨天我一整天都在工作。 850 | It's not something anyone can do. 这不是任何人都能做的事。 851 | Tom doesn't have a fever. 汤姆没有发烧。 852 | "Will he pass the examination?" "I am afraid not." "他會通過考試嗎?" "我怕是不會。" 853 | I'd like a glass of water, please. 我要一杯水,謝謝。 854 | They fell into the conversation immediately. 他们很快就聊起来了。 855 | What did you do last night? 你昨天晚上做什麼? 856 | He knows how to swim. 他会游泳。 857 | They got married and settled near Boston. 他們結了婚並定居在波士頓附近。 858 | I've heard that name somewhere before. 我在别处听过那个名字。 859 | I'll be in my office from ten tomorrow. 我明天十点起会在办公室里。 860 | Does he go to school on foot or by bicycle? 他走路去学校还是骑车去学校? 861 | I haven't started anything yet. 我还没开始做任何事。 862 | He often takes me for a drive. 他常常載我去兜風。 863 | How much money do you have? 你有多少錢? 864 | You were at my wedding. 你出席了我的婚礼。 865 | There is a good chance that gasoline prices will go up. 油价上涨的可能性很高。 866 | My plan is to buy a car. 我打算買輛車。 867 | He is a famous baseball player. 他是一位著名的棒球選手。 868 | He told me to wash my face. 他叫我洗脸。 869 | Thank you for your invitation. 感謝您的邀請。 870 | He went to New York on Monday. 他星期一去了紐約。 871 | Be quiet, or the baby will wake up. 安靜,否則嬰兒會醒來。 872 | English is easy to learn. 英語簡單易學。 873 | I just wish we could leave this horrible place. 我只是希望我们能离开这个可怕的地方。 874 | I have absolute trust in you. 我絕對信任你。 875 | She couldn't answer the question. 她無法回答這個問題。 876 | I sometimes still think about her. 有時候,我還是會想起她。 877 | He has been very busy this week. 他這個星期一直很忙碌。 878 | Everybody is immune to smallpox nowadays. 現今每個人都對天花免疫了。 879 | We should call the police. 我们该报警。 880 | Your proposal is worthy of being considered. 你的提议值得考虑。 881 | She looks unhappy. 她看起來不快樂。 882 | The taxi picked up two passengers. 這輛計程車載了兩名乘客。 883 | I've always wanted to climb Mt. Fuji. 
我一直想登富士山。 884 | They greeted me with a smile. 他們面帶微笑向我打招呼。 885 | My father has a restaurant. 我父亲有家餐馆。 886 | I don't expect anything from you. 我不指望从你身上得到什么。 887 | He has been to Hokkaido. 他曾去過北海道。 888 | Tom gave me a pen. 湯姆給了我一枝筆。 889 | I go to church every day. 我每天上教堂。 890 | She's dieting. 她在节食中。 891 | He likes watching TV. 他喜歡看電視。 892 | Tom should've already discussed that with you. 汤姆应该已经跟你讨论过那事了。 893 | I've finished reading the book. 我看完了这本书。 894 | Our school's principal is very old. 我校校长很老了。 895 | You'll get used to it. 你会习惯的。 896 | Tom's house has three bedrooms. Tom的房子有三個房間。 897 | He's a tall boy. 他是一个高大的男孩。 898 | I can't believe you are eating something the doctor has told you repeatedly you shouldn't eat. 真的不敢相信你在吃医生叮嘱不要食用的东西。 899 | You don't need to work on Sundays. 星期天的時候,你不用工作。 900 | Don't throw a stone at the dog. 不要对狗丢石头。 901 | He makes the most of his opportunities. 他充分利用他的機會。 902 | Black suits you. 黑色很衬你。 903 | Give me something to write with. 給我些可以寫字的東西。 904 | They must have made a mistake. 他們一定是犯錯了。 905 | Would you like to drink anything? 你想喝點什麼嗎? 906 | How much sugar do you use? 你用多少糖? 907 | She showed the visitor her baby. 她给客人看了她的宝宝。 908 | You have to be patient. 你必須有耐心。 909 | Could I borrow a pencil? 我能借支铅笔吗? 910 | All plants need water and light. 所有的植物都需要陽光和水。 911 | One of your buttons has come off. 你的一個按鈕脫落了。 912 | You know too much. 你知道得太多了。 913 | We didn't know what to do next. 我們不知道下一步要做什麼。 914 | This is not important. 這個不重要。 915 | He died of lung cancer. 他死於肺癌。 916 | Have you had your eyesight checked recently? 你最近检查视力了吗? 917 | What's the weight of your suitcase? 你的行李多重? 918 | What happened to you last night? 昨晚你發生了什麼事? 919 | I don't smoke weed. 我不吸大麻。 920 | I asked her out on a date. 我請她出去約會。 921 | I don't know if I'll have time to do it. 我不知道我是否有时间做。 922 | Excuse me for interrupting you. 對不起,打擾你了。 923 | Do you want to be rich? 你想致富嗎? 924 | It's on the sofa. 它在沙發上。 925 | I knew what Tom was doing. 我知道汤姆在做什么。 926 | I respect the elderly. 我尊敬长辈。 927 | A bookstore in that location wouldn't make enough money to survive. 那個地點的書店無法賺足夠的錢生存下去。 928 | I know the problem. 我知道问题。 929 | Where did you see her? 你在哪儿看到了她? 930 | Will you switch seats with me? 您愿意跟我换座位吗? 931 | She has been absent since last Wednesday. 从上周三起,她一直缺席。 932 | Tom hugged Mary. 汤姆拥抱了玛丽。 933 | My father made me a delicious lunch. 我父親為我做了一頓美味的午餐。 934 | The wind calmed down. 风停了。 935 | Be careful not to drive the wrong way on a one-way street. 小心不要在单行道逆向行驶。 936 | A plastic dish will melt on the stove. 塑料盘子在烤箱里会化的。 937 | It cost him 50 dollars to rent a car in Hawaii. 他花了50美元在夏威夷租了一辆汽车。 938 | You work as hard as he did at your age. 你跟他在你這個年紀時一樣努力工作。 939 | The United States is abundant in natural resources. 美国的自然资源很丰富。 940 | Please write with a pen. 请用钢笔写。 941 | I want Tom to win. 我想让汤姆赢。 942 | Many of the immigrants changed their names. 许多移民改了名字。 943 | French is my mother tongue. 法语是我的母语。 944 | My brother takes care of our dog. 我弟弟照顧我們的狗。 945 | Give some meat to the dog. 給這隻狗一些肉。 946 | It's a pleasure to have you with us again. 真高兴你又跟我们在一起了。 947 | We generally drink tea after a meal. 我們通常飯後喝茶。 948 | I really need a drink now. 我现在真需要来杯喝的。 949 | Look at that tall building. 看那棟高樓。 950 | He and his sisters are currently living in Tokyo. 他和他的姐妹们目前都住在东京。 951 | I've lost my pen. 我弄丟了我的筆。 952 | Those are my trousers. 那些都是我的褲子。 953 | Don't be cruel to animals. 不要虐待动物。 954 | Mt. Fuji is Japan's tallest mountain. 富士山是日本最高的山。 955 | Forewarned is forearmed. 凡事要預先準備好。 956 | What prevented you from coming earlier? 
什麼阻止你早點來了? 957 | I have a high fever. 我烧得很厉害。 958 | Our school is near the station. 我們學校在車站的附近。 959 | None of these buses go to Shinjuku. 這些巴士中沒有一輛去新宿。 960 | I've got things under control. 我控制住了。 961 | It's 3:30. 3点半了。 962 | That child has few friends. 那孩子沒有什麼朋友。 963 | Being a good conversationalist does not just mean being a good speaker of English. 作為一個良好的交談者,並不只意味著作一個英語說得好的說話者。 964 | My parents have gone to the airport to see my uncle off. 我父母去机场送我叔叔了。 965 | Shortly after the accident, the police came. 事故發生後不久,警察來了。 966 | Not everything can be bought with money. 不是所有的東西都可以用金錢買到。 967 | Professional writers do not have a regular income. 專職作家沒有固定的收入。 968 | I saw a strange woman there. 我看见一位陌生女人在那儿。 969 | You have to speak English here. 你在這裡必須說英語。 970 | What did you major in at college? 你大學時主修什麼? 971 | Tom probably knew who I was. 汤姆可能知道我是谁。 972 | You are her daughters. 你是她的女兒。 973 | Stand up. 起立。 974 | Classes begin next week. 课程下周开始。 975 | I am afraid of death. 我怕死。 976 | Please do not open the windows. 请不要开窗。 977 | Write it down before you forget it. 在你忘記之前把它寫下來。 978 | I have not heard from her recently. 我最近沒有收到她的信。 979 | Don't open that. 别打开那个。 980 | Rice is sold by the kilogram. 米以公斤為單位來出售。 981 | I know what you did. 我知道你做了什么。 982 | I'd like seats on the first floor. 我想要一樓的座位。 983 | Please send me a letter. 請寄信給我。 984 | We go to the same school. 我們上同一所學校。 985 | I asked for a table over there. 我在那裡要了一張桌子。 986 | She stayed in the house all day. 她整天待在房子裡。 987 | What time do you leave for school? 你幾點鐘去學校? 988 | Tom didn't understand what the teacher said. 汤姆没明白老师说了什么。 989 | What souvenir do you think she would like most? 你觉得她最想要什么纪念品? 990 | She has an eye for beauty. 她有一雙美麗的眼睛。 991 | They were carelessly unaware of the danger. 他们粗心大意,还没意识到危险。 992 | Let me know when you need me again. 再需要我就告诉我。 993 | I don't want to be friends with you. 我不想跟你交朋友。 994 | I'm old enough to live by myself. 我年紀夠大了可以自己一個人住。 995 | Does the letter need to be written in English? 這封信需要用英文寫嗎? 996 | I should be happy. 我該高興。 997 | Rome wasn't built in a day. 罗马不是一天建成的。 998 | A little walk will give you a good appetite for breakfast. 散散步將會給你很好的食慾吃早餐。 999 | You should talk directly to Tom. 你应该直接跟汤姆说。 1000 | Are you able to play organ? 你能演奏管风琴吗? 1001 | -------------------------------------------------------------------------------- /nmt/en-fr/_about.txt: -------------------------------------------------------------------------------- 1 | ** Info ** 2 | 3 | Check for newest version here: 4 | http://www.manythings.org/anki/ 5 | Date of this file: 6 | 2018-10-27 7 | 8 | This data is from the sentences_detailed.csv file from tatoeba.org. 9 | http://tatoeba.org/files/downloads/sentences_detailed.csv 10 | 11 | 12 | 13 | ** Terms of Use ** 14 | 15 | See the terms of use. 16 | These files have been released under the same license as the 17 | source. 18 | 19 | http://tatoeba.org/eng/terms_of_use 20 | http://creativecommons.org/licenses/by/2.0 21 | 22 | Attribution: www.manythings.org/anki and tatoeba.org 23 | 24 | 25 | 26 | ** Warnings ** 27 | 28 | The data from the Tatoeba Project contains errors. 29 | 30 | To lower the number of errors you are likely to see, only 31 | sentences by native speakers and proofread sentences have 32 | been included. 33 | 34 | For the non-English language, I made these (possibly wrong) 35 | assumptions. 36 | Assumption 1: Sentences written by native speakers can be 37 | trusted. 38 | Assumption 2: Contributors to the Tatoeba Project are honest 39 | about what their native language is. 
40 | 41 | For English, I used the sentences that I have proofread 42 | and thought were OK. 43 | Of course, I may have missed a few errors. 44 | 45 | 46 | 47 | ** Downloading Anki ** 48 | 49 | See http://ankisrs.net/ 50 | 51 | 52 | 53 | ** Importing into Anki ** 54 | 55 | Information is at http://ankisrs.net/docs/manual.html#importing 56 | 57 | Of particular interest may be the section about "duplicates" at http://ankisrs.net/docs/manual.html#duplicates-and-updating. 58 | You can choose: 59 | 1. not to allow duplicates (alternate translations) as cards. 60 | 2. to allow duplicates (alternate translations) as cards. 61 | --------------------------------------------------------------------------------
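In the raw manythings.org/Tatoeba exports, each record is a single line with the English sentence and its translation separated by a tab (tabs render as plain spaces in the dump above, and the "N |" numbers are only file line indices). Below is a minimal loading sketch for these files under that assumption; load_pairs is an illustrative helper and does not appear in the repository's notebooks.

# Minimal sketch: read (english, chinese) pairs from a tab-separated
# Tatoeba/manythings.org export such as nmt/en-cn/train_mini.txt.
# Assumes "english<TAB>chinese[<TAB>attribution]" per line; load_pairs
# is illustrative, not a function taken from this repository.
def load_pairs(path):
    pairs = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip("\n").split("\t")
            if len(parts) >= 2:  # skip blank or malformed lines
                pairs.append((parts[0], parts[1]))
    return pairs

if __name__ == "__main__":
    pairs = load_pairs("nmt/en-cn/train_mini.txt")
    print(len(pairs), "pairs; first:", pairs[0])

Keeping only the first two tab-separated fields also discards any trailing attribution column (newer Tatoeba exports carry a CC-BY credit there), which is normally what you want before tokenization.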