└── seq2seq-rnn-attn.ipynb /seq2seq-rnn-attn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pdb\n", 10 | "from fastai.text import *" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "name": "stdout", 20 | "output_type": "stream", 21 | "text": [ 22 | "fastai version: 1.0.53.dev\n" 23 | ] 24 | } 25 | ], 26 | "source": [ 27 | "print(f'fastai version: {__version__}')" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "torch.cuda.set_device(1)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 4, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "[PosixPath('/home/wgilliam/.fastai/data/giga-fren/giga-fren.release2.fixed.fr'),\n", 48 | " PosixPath('/home/wgilliam/.fastai/data/giga-fren/giga-fren.release2.fixed.en')]" 49 | ] 50 | }, 51 | "execution_count": 4, 52 | "metadata": {}, 53 | "output_type": "execute_result" 54 | } 55 | ], 56 | "source": [ 57 | "PATH = Path('data/translate')\n", 58 | "PATH.mkdir(parents=True, exist_ok=True)\n", 59 | "\n", 60 | "DATA_PATH = untar_data(URLs.MT_ENG_FRA)\n", 61 | "DATA_PATH.ls()" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 5, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "folder = 'giga-fren.release2.fixed'\n", 71 | "en_folder = DATA_PATH/f'{folder}.en'\n", 72 | "fr_folder = DATA_PATH/f'{folder}.fr'" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## Prepare data\n", 80 | "\n", 81 | "*only need to run through the section 1x to build the .csv file*" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "re_eq = re.compile('^(Wh[^?.!]+\\?)')\n", 91 | "re_fq = re.compile('^([^?.!]+\\?)')\n", 92 | "\n", 93 | "lines = ((re_eq.search(eq), re_fq.search(fq)) \n", 94 | " for eq, fq in zip(open(en_folder, encoding='utf-8'), open(fr_folder, encoding='utf-8')))\n", 95 | "\n", 96 | "qs = [ {'english_text': e.group(), 'french_text': f.group()} for e, f in lines if e and f ]" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "qs[:5]" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "df = pd.DataFrame(qs)\n", 115 | "df.head()" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "df.to_csv(PATH/'english_french_translate.csv', index=False)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "## Prepare data for training" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 6, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "np.random.seed(42)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 7, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "name": "stdout", 150 | "output_type": "stream", 151 | "text": [ 152 | "52331\n" 153 | ] 154 | }, 155 | { 156 | "data": { 157 | "text/html": [ 158 | "
\n", 159 | "\n", 172 | "\n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | "
english_textfrench_text
0What is light ?Qu’est-ce que la lumière?
1Who are we?Où sommes-nous?
2Where did we come from?D'où venons-nous?
3What would we do without it?Que ferions-nous sans elle ?
4What is the absolute location (latitude and lo...Quelle sont les coordonnées (latitude et longi...
\n", 208 | "
" 209 | ], 210 | "text/plain": [ 211 | " english_text \\\n", 212 | "0 What is light ? \n", 213 | "1 Who are we? \n", 214 | "2 Where did we come from? \n", 215 | "3 What would we do without it? \n", 216 | "4 What is the absolute location (latitude and lo... \n", 217 | "\n", 218 | " french_text \n", 219 | "0 Qu’est-ce que la lumière? \n", 220 | "1 Où sommes-nous? \n", 221 | "2 D'où venons-nous? \n", 222 | "3 Que ferions-nous sans elle ? \n", 223 | "4 Quelle sont les coordonnées (latitude et longi... " 224 | ] 225 | }, 226 | "metadata": {}, 227 | "output_type": "display_data" 228 | } 229 | ], 230 | "source": [ 231 | "df = pd.read_csv(PATH/'english_french_translate.csv')\n", 232 | "\n", 233 | "print(len(df))\n", 234 | "display(df.head())" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "**Why am I restricting my datasets to the mean length of French or corresponding English translation?**\n", 242 | "\n", 243 | "Including the longer documents tend to require much more training, consume more memory, and don't necessarily work all that great for models such as we're building here." 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 8, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "MAX_SEQ_LEN = np.max([df.english_text.str.len().mean(), df.french_text.str.len().mean()])" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 9, 258 | "metadata": {}, 259 | "outputs": [ 260 | { 261 | "data": { 262 | "text/html": [ 263 | "
\n", 264 | "\n", 277 | "\n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | "
english_textfrench_text
0What is light ?Qu’est-ce que la lumière?
1Who are we?Où sommes-nous?
2Where did we come from?D'où venons-nous?
3What would we do without it?Que ferions-nous sans elle ?
4What is the major aboriginal group on Vancouve...Quel est le groupe autochtone principal sur l’...
\n", 313 | "
" 314 | ], 315 | "text/plain": [ 316 | " english_text \\\n", 317 | "0 What is light ? \n", 318 | "1 Who are we? \n", 319 | "2 Where did we come from? \n", 320 | "3 What would we do without it? \n", 321 | "4 What is the major aboriginal group on Vancouve... \n", 322 | "\n", 323 | " french_text \n", 324 | "0 Qu’est-ce que la lumière? \n", 325 | "1 Où sommes-nous? \n", 326 | "2 D'où venons-nous? \n", 327 | "3 Que ferions-nous sans elle ? \n", 328 | "4 Quel est le groupe autochtone principal sur l’... " 329 | ] 330 | }, 331 | "execution_count": 9, 332 | "metadata": {}, 333 | "output_type": "execute_result" 334 | } 335 | ], 336 | "source": [ 337 | "df = df.loc[(df.english_text.str.len() < MAX_SEQ_LEN) & (df.french_text.str.len() < MAX_SEQ_LEN)]\n", 338 | "df.reset_index(inplace=True, drop=True)\n", 339 | "df.head()" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 10, 345 | "metadata": {}, 346 | "outputs": [ 347 | { 348 | "data": { 349 | "text/plain": [ 350 | "(25522, 6381)" 351 | ] 352 | }, 353 | "execution_count": 10, 354 | "metadata": {}, 355 | "output_type": "execute_result" 356 | } 357 | ], 358 | "source": [ 359 | "train_df = df.sample(frac=0.8, random_state=42)\n", 360 | "valid_df = df.iloc[~df.index.isin(train_df.index)]\n", 361 | "len(train_df), len(valid_df)" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "### Custom DataBlock API code" 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "metadata": {}, 374 | "source": [ 375 | "**Why am I creating my own collate function?**\n", 376 | "\n", 377 | "Two reasons. First, I want to be able to pad both the inputs and targets (the default `pad_collate` function for `TextList` instances only takes are of the inputs). Second, I wanted to set things up so I could include my targets in training for \"teacher forcing\" as well as the actual, non-padded, lenghts of each document along with their corresponding masks for masking.\n", 378 | "\n", 379 | "Writing your own `collate_fn` is a nice way to pass in whatever you need/want to your model's `forward()` function." 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 11, 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "def seq2seq_pad_collate(samples:BatchSamples, pad_idx:int=1, pad_first:bool=False, \n", 389 | " include_targets=True, include_lengths=True, include_masks=True,\n", 390 | " backwards:bool=False) -> Tuple[LongTensor, LongTensor]:\n", 391 | " \n", 392 | " \"Function that collect samples and adds padding. Flips token order if needed\"\n", 393 | " \n", 394 | " samples = to_data(samples)\n", 395 | " samples.sort(key=lambda x: len(x[0]), reverse=True)\n", 396 | " \n", 397 | " x_lens = [len(s[0]) for s in samples]\n", 398 | " x_max_len = max(x_lens)\n", 399 | " x_res = torch.zeros(len(samples), x_max_len).long() + pad_idx\n", 400 | " \n", 401 | " y_lens = [len(s[1]) for s in samples]\n", 402 | " y_max_len = max(y_lens)\n", 403 | " y_res = torch.zeros(len(samples), y_max_len).long() + pad_idx\n", 404 | " \n", 405 | " if backwards: pad_first = not pad_first\n", 406 | " \n", 407 | " for i,s in enumerate(samples):\n", 408 | " if pad_first: \n", 409 | " x_res[i,-len(s[0]):] = LongTensor(s[0])\n", 410 | " y_res[i,-len(s[1]):] = LongTensor(s[1])\n", 411 | " else: \n", 412 | " x_res[i,:len(s[0]):] = LongTensor(s[0])\n", 413 | " y_res[i,:len(s[1]):] = LongTensor(s[1])\n", 414 | " \n", 415 | " if backwards: res = res.flip(1)\n", 416 | " \n", 417 | " x = [x_res]\n", 418 | " if (include_targets): x += [y_res.clone()]\n", 419 | " if (include_lengths): x += [torch.tensor(x_lens), torch.tensor(y_lens)]\n", 420 | " if (include_masks): x += [x_res != pad_idx, y_res != pad_idx]\n", 421 | " \n", 422 | " return x, y_res" 423 | ] 424 | }, 425 | { 426 | "cell_type": "markdown", 427 | "metadata": {}, 428 | "source": [ 429 | "**Why am I creating a custom DataBunch class?**\n", 430 | "\n", 431 | "Because the default class for `TextList` doesn't allow me to override the `collate_fn`. Everything else pretty much comes right from the framework." 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": 12, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [ 440 | "class Seq2SeqDataBunch(DataBunch):\n", 441 | "\n", 442 | " @classmethod\n", 443 | " def create(cls, train_ds, valid_ds, test_ds=None, \n", 444 | " path:PathOrStr='.', bs:int=32, val_bs:int=None, pad_idx=1, pad_first=False, dl_tfms=None, \n", 445 | " device:torch.device=None, no_check:bool=False, backwards:bool=False, **dl_kwargs) -> DataBunch:\n", 446 | " \n", 447 | " \"\"\"Function that transform the `datasets` in a `DataBunch` for classification. \n", 448 | " Passes `**dl_kwargs` on to `DataLoader()`\"\"\"\n", 449 | " \n", 450 | " datasets = cls._init_ds(train_ds, valid_ds, test_ds)\n", 451 | " val_bs = ifnone(val_bs, bs)\n", 452 | " \n", 453 | " collate_fn = partial(seq2seq_pad_collate, pad_idx=pad_idx, pad_first=pad_first, backwards=backwards)\n", 454 | " \n", 455 | " train_sampler = SortishSampler(datasets[0].x, key=lambda t: len(datasets[0][t][0].data), bs=bs//2)\n", 456 | " train_dl = DataLoader(datasets[0], batch_size=bs, sampler=train_sampler, drop_last=True, **dl_kwargs)\n", 457 | " \n", 458 | " dataloaders = [train_dl]\n", 459 | " for ds in datasets[1:]:\n", 460 | " lengths = [len(t) for t in ds.x.items]\n", 461 | " sampler = SortSampler(ds.x, key=lengths.__getitem__)\n", 462 | " dataloaders.append(DataLoader(ds, batch_size=val_bs, sampler=sampler, **dl_kwargs))\n", 463 | " return cls(*dataloaders, path=path, device=device, dl_tfms=dl_tfms, collate_fn=collate_fn, no_check=no_check)" 464 | ] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "metadata": {}, 469 | "source": [ 470 | "Below is one of the reason I like the DataBlock API. I can inherit all the goodness form `TextList` while just overriding the `DataBunch` class. Very nice!" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 13, 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "class Seq2SeqTextList(TextList):\n", 480 | " _bunch = Seq2SeqDataBunch" 481 | ] 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "metadata": {}, 486 | "source": [ 487 | "### Build our DataBunch" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": 14, 493 | "metadata": {}, 494 | "outputs": [], 495 | "source": [ 496 | "bs = 64\n", 497 | "val_bs = 128\n", 498 | "\n", 499 | "tok_pre_rules = [fix_html, spec_add_spaces, rm_useless_spaces]\n", 500 | "tok_post_rules = []" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": 15, 506 | "metadata": {}, 507 | "outputs": [], 508 | "source": [ 509 | "en_tok = Tokenizer(lang='en', pre_rules=tok_pre_rules, post_rules=tok_post_rules)\n", 510 | "fr_tok = Tokenizer(lang='fr', pre_rules=tok_pre_rules, post_rules=tok_post_rules)" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": 16, 516 | "metadata": {}, 517 | "outputs": [], 518 | "source": [ 519 | "en_procs = [TokenizeProcessor(tokenizer=en_tok, include_bos=True, include_eos=True), \n", 520 | " NumericalizeProcessor(min_freq=1)]\n", 521 | "\n", 522 | "fr_procs = [TokenizeProcessor(tokenizer=fr_tok,include_bos=False, include_eos=True), \n", 523 | " NumericalizeProcessor(min_freq=1)]" 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": 17, 529 | "metadata": {}, 530 | "outputs": [], 531 | "source": [ 532 | "en_train_il = Seq2SeqTextList.from_df(train_df, path=PATH, cols=['english_text'], processor=en_procs).process()\n", 533 | "\n", 534 | "en_valid_il = Seq2SeqTextList.from_df(valid_df, path=PATH, cols=['english_text'], \n", 535 | " processor=en_train_il.processor).process()" 536 | ] 537 | }, 538 | { 539 | "cell_type": "code", 540 | "execution_count": 18, 541 | "metadata": {}, 542 | "outputs": [], 543 | "source": [ 544 | "fr_train_il = Seq2SeqTextList.from_df(train_df, path=PATH, cols=['french_text'], processor=fr_procs).process()\n", 545 | "\n", 546 | "fr_valid_il = Seq2SeqTextList.from_df(valid_df, path=PATH, cols=['french_text'], \n", 547 | " processor=fr_train_il.processor).process()" 548 | ] 549 | }, 550 | { 551 | "cell_type": "markdown", 552 | "metadata": {}, 553 | "source": [ 554 | "Remember: A `LabelList` is a PyTorch `Dataset`" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": 19, 560 | "metadata": {}, 561 | "outputs": [], 562 | "source": [ 563 | "trn_ll = LabelList(fr_train_il, en_train_il)\n", 564 | "val_ll = LabelList(fr_valid_il, en_valid_il)\n", 565 | "\n", 566 | "lls = LabelLists(PATH, train=trn_ll, valid=val_ll)" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": 20, 572 | "metadata": { 573 | "scrolled": true 574 | }, 575 | "outputs": [ 576 | { 577 | "data": { 578 | "text/plain": [ 579 | "(15972, 15972, 13416, 13416)" 580 | ] 581 | }, 582 | "execution_count": 20, 583 | "metadata": {}, 584 | "output_type": "execute_result" 585 | } 586 | ], 587 | "source": [ 588 | "len(lls.train.x.vocab.itos), len(lls.valid.x.vocab.itos), len(lls.train.y.vocab.itos), len(lls.valid.y.vocab.itos)" 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": 21, 594 | "metadata": { 595 | "scrolled": false 596 | }, 597 | "outputs": [ 598 | { 599 | "data": { 600 | "text/plain": [ 601 | "Text Quelle est la situation actuelle de la DGRH ? xxeos" 602 | ] 603 | }, 604 | "execution_count": 21, 605 | "metadata": {}, 606 | "output_type": "execute_result" 607 | } 608 | ], 609 | "source": [ 610 | "lls.train.x[0]" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": 22, 616 | "metadata": { 617 | "scrolled": true 618 | }, 619 | "outputs": [ 620 | { 621 | "data": { 622 | "text/plain": [ 623 | "Text xxbos What is the current state of affairs in the HRB ? xxeos" 624 | ] 625 | }, 626 | "execution_count": 22, 627 | "metadata": {}, 628 | "output_type": "execute_result" 629 | } 630 | ], 631 | "source": [ 632 | "lls.train.y[0]" 633 | ] 634 | }, 635 | { 636 | "cell_type": "code", 637 | "execution_count": 23, 638 | "metadata": {}, 639 | "outputs": [ 640 | { 641 | "name": "stdout", 642 | "output_type": "stream", 643 | "text": [ 644 | "['xxunk', 'xxpad', 'xxbos', 'xxeos', 'xxfld', 'xxmaj', 'xxup', 'xxrep', 'xxwrep', '?']\n", 645 | "['xxunk', 'xxpad', 'xxbos', 'xxeos', 'xxfld', 'xxmaj', 'xxup', 'xxrep', 'xxwrep', '?']\n" 646 | ] 647 | } 648 | ], 649 | "source": [ 650 | "print(list(lls.train.x.vocab.itos[:10]))\n", 651 | "print(list(lls.train.y.vocab.itos[:10]))" 652 | ] 653 | }, 654 | { 655 | "cell_type": "markdown", 656 | "metadata": {}, 657 | "source": [ 658 | "### Creating a DataBunch via the API" 659 | ] 660 | }, 661 | { 662 | "cell_type": "code", 663 | "execution_count": 24, 664 | "metadata": {}, 665 | "outputs": [], 666 | "source": [ 667 | "data = lls.databunch(bs=bs, val_bs=val_bs)" 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "execution_count": 25, 673 | "metadata": { 674 | "scrolled": true 675 | }, 676 | "outputs": [ 677 | { 678 | "data": { 679 | "text/plain": [ 680 | "(torch.Size([64, 57]),\n", 681 | " torch.Size([64, 25]),\n", 682 | " torch.Size([64]),\n", 683 | " torch.Size([64]),\n", 684 | " torch.Size([64, 57]),\n", 685 | " torch.Size([64, 25]),\n", 686 | " torch.Size([64, 25]))" 687 | ] 688 | }, 689 | "execution_count": 25, 690 | "metadata": {}, 691 | "output_type": "execute_result" 692 | } 693 | ], 694 | "source": [ 695 | "b = next(iter(data.train_dl))\n", 696 | "b[0][0].shape, b[0][1].shape, b[0][2].shape, b[0][3].shape, b[0][4].shape, b[0][5].shape, b[1].shape" 697 | ] 698 | }, 699 | { 700 | "cell_type": "markdown", 701 | "metadata": {}, 702 | "source": [ 703 | "### Creating DataBunch via DataLoaders" 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": null, 709 | "metadata": {}, 710 | "outputs": [], 711 | "source": [ 712 | "train_sampler = SortishSampler(lls.train.x, key=lambda t: len(lls.train[t][0].data), bs=bs//2)\n", 713 | "valid_sampler = SortSampler(lls.valid.x, key=lambda t: len(lls.valid[t][0].data))" 714 | ] 715 | }, 716 | { 717 | "cell_type": "code", 718 | "execution_count": null, 719 | "metadata": {}, 720 | "outputs": [], 721 | "source": [ 722 | "train_dl = DataLoader(lls.train, batch_size=bs, sampler=train_sampler, drop_last=True)\n", 723 | "valid_dl = DataLoader(lls.valid, batch_size=val_bs, sampler=valid_sampler)" 724 | ] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": null, 729 | "metadata": {}, 730 | "outputs": [], 731 | "source": [ 732 | "data = DataBunch(train_dl=train_dl, valid_dl=valid_dl, collate_fn=seq2seq_pad_collate)" 733 | ] 734 | }, 735 | { 736 | "cell_type": "code", 737 | "execution_count": null, 738 | "metadata": {}, 739 | "outputs": [], 740 | "source": [ 741 | "b = next(iter(data.train_dl))\n", 742 | "b[0][0].shape, b[0][1].shape, b[0][2].shape, b[0][3].shape, b[0][4].shape, b[0][5].shape, b[1].shape" 743 | ] 744 | }, 745 | { 746 | "cell_type": "markdown", 747 | "metadata": {}, 748 | "source": [ 749 | "## Training time" 750 | ] 751 | }, 752 | { 753 | "cell_type": "code", 754 | "execution_count": 26, 755 | "metadata": {}, 756 | "outputs": [], 757 | "source": [ 758 | "def seq2seq_loss(inputs, targets):\n", 759 | " #pdb.set_trace()\n", 760 | " src_bs, src_seq_len, nc = inputs.shape\n", 761 | " trg_bs, trg_seq_len = targets.shape\n", 762 | " \n", 763 | " if trg_seq_len > src_seq_len: inputs = F.pad(inputs, (0,0,0,trg_seq_len - src_seq_len, 0,0))\n", 764 | "\n", 765 | " inputs = inputs[:, :trg_seq_len]\n", 766 | " return F.cross_entropy(inputs.contiguous().view(-1, nc), targets.contiguous().view(-1))" 767 | ] 768 | }, 769 | { 770 | "cell_type": "code", 771 | "execution_count": 27, 772 | "metadata": {}, 773 | "outputs": [], 774 | "source": [ 775 | "class BasicSeq2SeqRnn(nn.Module):\n", 776 | " def __init__(self, src_vocab_sz, src_emb_dim, trg_vocab_sz, trg_emb_dim, n_hidden=256, n_layers=2,\n", 777 | " max_trg_sl=255, trg_PAD_tok_idx=1, trg_BOS_tok_idx=2, trg_EOS_tok_idx=3):\n", 778 | " super().__init__()\n", 779 | " \n", 780 | " self.n_layers, self.n_hidden, self.max_trg_sl = n_layers, n_hidden, max_trg_sl\n", 781 | " \n", 782 | " self.trg_PAD_tok_idx = trg_PAD_tok_idx\n", 783 | " self.trg_BOS_tok_idx = trg_BOS_tok_idx\n", 784 | " self.trg_EOS_tok_idx = trg_EOS_tok_idx\n", 785 | " \n", 786 | " # setup the encoder\n", 787 | " self.enc_emb = nn.Embedding(src_vocab_sz, src_emb_dim)\n", 788 | " self.enc_emb_drop = nn.Dropout(0.15)\n", 789 | " self.enc_rnn = nn.GRU(src_emb_dim, n_hidden, num_layers=n_layers, dropout=0.25, \n", 790 | " batch_first=True, bidirectional=True)\n", 791 | " self.enc_out = nn.Linear(n_hidden*2, trg_emb_dim, bias=False)\n", 792 | " \n", 793 | " # setup the decoder\n", 794 | " self.dec_emb = nn.Embedding(trg_vocab_sz, trg_emb_dim)\n", 795 | " self.dec_rnn = nn.GRU(trg_emb_dim, trg_emb_dim, num_layers=n_layers, dropout=0.1, batch_first=True)\n", 796 | " \n", 797 | " self.dec_out_drop = nn.Dropout(0.15)\n", 798 | " self.dec_out = nn.Linear(trg_emb_dim, trg_vocab_sz, bias=False)\n", 799 | " self.dec_out.weight.data = self.dec_emb.weight.data\n", 800 | " \n", 801 | " def forward(self, inputs, targets=None, \n", 802 | " src_lengths=None, trg_lengths=None, src_mask=None, trg_mask=None):\n", 803 | " \n", 804 | " bs, seq_len = inputs.shape\n", 805 | " h = self.init_hidden(bs).to(device=inputs.device)\n", 806 | " \n", 807 | " enc_emb = self.enc_emb_drop(self.enc_emb(inputs))\n", 808 | " enc_outputs, h = self.enc_rnn(enc_emb, h)\n", 809 | " h = h.view(2,2,bs,-1).permute(0,2,1,3).contiguous().view(2,bs,-1)\n", 810 | " h = self.enc_out(h)\n", 811 | " \n", 812 | " trg_max_seq = targets.shape[1] if not targets is None else self.max_trg_sl\n", 813 | " dec_inp = torch.Tensor(bs).fill_(self.trg_BOS_tok_idx).long().to(device=inputs.device)\n", 814 | " \n", 815 | " res = []\n", 816 | " for i in range(trg_max_seq):\n", 817 | " dec_emb = self.dec_emb(dec_inp).unsqueeze(1)\n", 818 | " dec_outputs, h = self.dec_rnn(dec_emb, h)\n", 819 | " dec_out = self.dec_out(self.dec_out_drop(dec_outputs.squeeze()))\n", 820 | " \n", 821 | " res.append(dec_out)\n", 822 | " dec_inp = dec_out.data.max(1)[1]\n", 823 | " \n", 824 | " if ((dec_inp == self.trg_PAD_tok_idx).all()): break\n", 825 | " if (not targets is None): dec_inp = targets[:, i] \n", 826 | " \n", 827 | " res = torch.stack(res, dim=1)\n", 828 | " return res\n", 829 | " \n", 830 | " def init_hidden(self, bs):\n", 831 | " return torch.zeros(self.n_layers*2, bs, self.n_hidden) # e.g., (2,64,256)\n", 832 | " \n", 833 | " def reset(self):\n", 834 | " pass" 835 | ] 836 | }, 837 | { 838 | "cell_type": "code", 839 | "execution_count": 28, 840 | "metadata": {}, 841 | "outputs": [], 842 | "source": [ 843 | "from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence" 844 | ] 845 | }, 846 | { 847 | "cell_type": "code", 848 | "execution_count": 29, 849 | "metadata": {}, 850 | "outputs": [], 851 | "source": [ 852 | "class AttnSeq2SeqRnn(nn.Module):\n", 853 | " def __init__(self, src_vocab_sz, src_emb_dim, trg_vocab_sz, trg_emb_dim, n_hidden=256, n_layers=2,\n", 854 | " max_trg_sl=255, trg_PAD_tok_idx=1, trg_BOS_tok_idx=2, trg_EOS_tok_idx=3):\n", 855 | " super().__init__()\n", 856 | " \n", 857 | " self.n_layers, self.n_hidden, self.max_trg_sl = n_layers, n_hidden, max_trg_sl\n", 858 | " \n", 859 | " self.trg_PAD_tok_idx = trg_PAD_tok_idx\n", 860 | " self.trg_BOS_tok_idx = trg_BOS_tok_idx\n", 861 | " self.trg_EOS_tok_idx = trg_EOS_tok_idx\n", 862 | " \n", 863 | " # setup the encoder\n", 864 | " self.enc_emb = nn.Embedding(src_vocab_sz, src_emb_dim)\n", 865 | " self.enc_emb_drop = nn.Dropout(0.15)\n", 866 | " self.enc_rnn = nn.GRU(src_emb_dim, n_hidden, num_layers=n_layers, dropout=0.25, \n", 867 | " batch_first=True, bidirectional=True)\n", 868 | " self.enc_out = nn.Linear(n_hidden*2, trg_emb_dim, bias=False)\n", 869 | " \n", 870 | " # setup the decoder\n", 871 | " self.dec_emb = nn.Embedding(trg_vocab_sz, trg_emb_dim)\n", 872 | " self.dec_rnn = nn.GRU(trg_emb_dim + 2*n_hidden, trg_emb_dim, num_layers=n_layers, dropout=0.1, \n", 873 | " batch_first=True)\n", 874 | " \n", 875 | " self.dec_out_drop = nn.Dropout(0.15)\n", 876 | " self.dec_out = nn.Linear(trg_emb_dim, trg_vocab_sz, bias=False)\n", 877 | " self.dec_out.weight.data = self.dec_emb.weight.data\n", 878 | " \n", 879 | " # setup attention\n", 880 | " self.key_layer = nn.Linear(n_hidden*2, n_hidden, bias=False)\n", 881 | " self.query_layer = nn.Linear(trg_emb_dim, n_hidden, bias=False)\n", 882 | " self.energy_layer = nn.Linear(n_hidden, 1, bias=False)\n", 883 | " \n", 884 | " def forward(self, inputs, targets=None, \n", 885 | " src_lengths=None, trg_lengths=None, src_mask=None, trg_mask=None, return_attns=False):\n", 886 | " \n", 887 | " bs, seq_len = inputs.shape\n", 888 | " h = self.init_hidden(bs).to(device=inputs.device)\n", 889 | " \n", 890 | " enc_emb = self.enc_emb_drop(self.enc_emb(inputs))\n", 891 | " packed = pack_padded_sequence(enc_emb, src_lengths, batch_first=True)\n", 892 | " \n", 893 | " enc_outputs, h = self.enc_rnn(packed, h)\n", 894 | " enc_outputs, _ = pad_packed_sequence(enc_outputs, batch_first=True)\n", 895 | " \n", 896 | " h = h.view(self.n_layers,2,bs,-1).permute(0,2,1,3).contiguous().view(self.n_layers,bs,-1)\n", 897 | " h = self.enc_out(h)\n", 898 | " \n", 899 | " trg_max_seq = targets.shape[1] if not targets is None else self.max_trg_sl\n", 900 | " dec_inp = torch.Tensor(bs).fill_(self.trg_BOS_tok_idx).long().to(device=inputs.device)\n", 901 | " \n", 902 | " attn_key = self.key_layer(enc_outputs) # [64, 12, 512] => [64, 12, 256]\n", 903 | " \n", 904 | " res = []\n", 905 | " attns = []\n", 906 | " for i in range(trg_max_seq):\n", 907 | " dec_emb = self.dec_emb(dec_inp).unsqueeze(1)\n", 908 | " \n", 909 | " # attn\n", 910 | " attn_query = self.query_layer(h[-1].unsqueeze(1)) # [2, 64, 300] => [64, 1, 256]\n", 911 | " attn_scores = self.energy_layer(torch.tanh(attn_query + attn_key)) # => [64, 12, 1]\n", 912 | " attn_scores = attn_scores.squeeze(2).unsqueeze(1) # => [64, 1, 12]\n", 913 | " \n", 914 | " attn_scores.masked_fill_(src_mask.unsqueeze(1) == 0, -float('inf'))\n", 915 | " \n", 916 | " attn_probs = F.softmax(attn_scores, dim=-1)\n", 917 | " attns.append(attn_probs.squeeze())\n", 918 | " \n", 919 | " context = torch.bmm(attn_probs, enc_outputs) # [64, 1, 12] * [64, 12, 512] => [64, 1, 512]\n", 920 | " \n", 921 | " rnn_inp = torch.cat([dec_emb, context], dim=2)\n", 922 | " dec_outputs, h = self.dec_rnn(rnn_inp, h)\n", 923 | " dec_out = self.dec_out(self.dec_out_drop(dec_outputs.squeeze()))\n", 924 | " \n", 925 | " res.append(dec_out)\n", 926 | " dec_inp = dec_out.data.max(1)[1]\n", 927 | " \n", 928 | " if ((dec_inp == self.trg_PAD_tok_idx).all()): break\n", 929 | " if (not targets is None): dec_inp = targets[:, i] \n", 930 | " \n", 931 | " res = torch.stack(res, dim=1)\n", 932 | " \n", 933 | " if (return_attns): return res, torch.stack(attns, dim=1)\n", 934 | " return res\n", 935 | " \n", 936 | " def init_hidden(self, bs):\n", 937 | " return torch.zeros(self.n_layers*2, bs, self.n_hidden) # e.g., (2,64,256)\n", 938 | " \n", 939 | " def reset(self):\n", 940 | " pass" 941 | ] 942 | }, 943 | { 944 | "cell_type": "code", 945 | "execution_count": 30, 946 | "metadata": {}, 947 | "outputs": [], 948 | "source": [ 949 | "nh, nl = 512, 2\n", 950 | "src_emb_dim, trg_em_dim = 300, 300\n", 951 | "\n", 952 | "src_vocab_sz = len(data.valid_ds.x.vocab.itos)\n", 953 | "trg_vocab_sz = len(data.valid_ds.y.vocab.itos)\n", 954 | "\n", 955 | "trg_BOS_idx = data.valid_ds.y.vocab.stoi['xxbos']\n", 956 | "trg_EOS_idx = data.valid_ds.y.vocab.stoi['xxeos']\n", 957 | "trg_PAD_idx = data.valid_ds.y.vocab.stoi['xxpad']" 958 | ] 959 | }, 960 | { 961 | "cell_type": "markdown", 962 | "metadata": {}, 963 | "source": [ 964 | "Another method of implementing \"Teacher Forcing\" via fastai is through callbacks. I'm not using it here since I created my own `collate_fn` method above, but here's a link to a forum post discussing it for your consideration: [teacher forcing via callbacks](https://forums.fast.ai/t/teacher-forcing/29415/3)." 965 | ] 966 | }, 967 | { 968 | "cell_type": "code", 969 | "execution_count": 31, 970 | "metadata": {}, 971 | "outputs": [], 972 | "source": [ 973 | "# one method to include targets for teacher forcing is to include it via a callback ... we assume the model\n", 974 | "# has a \"self.targets\" attribute for such use\n", 975 | "class TeacherForcingCallback(LearnerCallback):\n", 976 | " learn:Learner\n", 977 | " \n", 978 | " def on_batch_begin(self, train, **kwargs):\n", 979 | " learn.model.targets = kwargs['last_target']" 980 | ] 981 | }, 982 | { 983 | "cell_type": "code", 984 | "execution_count": 32, 985 | "metadata": {}, 986 | "outputs": [], 987 | "source": [ 988 | "model = AttnSeq2SeqRnn(src_vocab_sz, src_emb_dim, trg_vocab_sz, trg_em_dim, n_hidden=nh, n_layers=nl,\n", 989 | " trg_PAD_tok_idx=trg_PAD_idx, trg_BOS_tok_idx=trg_BOS_idx, trg_EOS_tok_idx=trg_EOS_idx)\n", 990 | "\n", 991 | "learn = None, gc.collect()\n", 992 | "learn = Learner(data, model)\n", 993 | "learn.loss_func = seq2seq_loss\n", 994 | "\n", 995 | "# learn.callbacks.append(TeacherForcingCallback(learn))" 996 | ] 997 | }, 998 | { 999 | "cell_type": "code", 1000 | "execution_count": 33, 1001 | "metadata": {}, 1002 | "outputs": [ 1003 | { 1004 | "data": { 1005 | "text/html": [], 1006 | "text/plain": [ 1007 | "" 1008 | ] 1009 | }, 1010 | "metadata": {}, 1011 | "output_type": "display_data" 1012 | }, 1013 | { 1014 | "name": "stdout", 1015 | "output_type": "stream", 1016 | "text": [ 1017 | "LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.\n" 1018 | ] 1019 | }, 1020 | { 1021 | "data": { 1022 | "image/png": "\n", 1023 | "text/plain": [ 1024 | "
" 1025 | ] 1026 | }, 1027 | "metadata": { 1028 | "needs_background": "light" 1029 | }, 1030 | "output_type": "display_data" 1031 | } 1032 | ], 1033 | "source": [ 1034 | "learn.lr_find(start_lr=1e-8, end_lr=1e-1)\n", 1035 | "learn.recorder.plot()" 1036 | ] 1037 | }, 1038 | { 1039 | "cell_type": "code", 1040 | "execution_count": 34, 1041 | "metadata": {}, 1042 | "outputs": [ 1043 | { 1044 | "data": { 1045 | "text/html": [ 1046 | "Total time: 02:29

\n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | "
epochtrain_lossvalid_losstime
02.8130182.58887600:29
12.3802122.29359000:29
22.0254932.05396700:29
31.6506231.91096100:30
41.3074001.91540800:29
" 1088 | ], 1089 | "text/plain": [ 1090 | "" 1091 | ] 1092 | }, 1093 | "metadata": {}, 1094 | "output_type": "display_data" 1095 | } 1096 | ], 1097 | "source": [ 1098 | "learn.fit_one_cycle(5, max_lr=3e-3, moms=(0.8,0.99))" 1099 | ] 1100 | }, 1101 | { 1102 | "cell_type": "code", 1103 | "execution_count": 35, 1104 | "metadata": {}, 1105 | "outputs": [], 1106 | "source": [ 1107 | "it = iter(data.valid_dl)" 1108 | ] 1109 | }, 1110 | { 1111 | "cell_type": "code", 1112 | "execution_count": 36, 1113 | "metadata": {}, 1114 | "outputs": [ 1115 | { 1116 | "data": { 1117 | "text/plain": [ 1118 | "(torch.Size([128, 30]), torch.Size([128, 26]), torch.Size([128, 26, 13416]))" 1119 | ] 1120 | }, 1121 | "execution_count": 36, 1122 | "metadata": {}, 1123 | "output_type": "execute_result" 1124 | } 1125 | ], 1126 | "source": [ 1127 | "x,y = next(it)\n", 1128 | "learn.model.eval()\n", 1129 | "probs, attns = learn.model(x[0], x[1], x[2], x[3], x[4], x[5], return_attns=True)\n", 1130 | "x[0].shape, y.shape, probs.shape" 1131 | ] 1132 | }, 1133 | { 1134 | "cell_type": "code", 1135 | "execution_count": 37, 1136 | "metadata": {}, 1137 | "outputs": [ 1138 | { 1139 | "data": { 1140 | "text/plain": [ 1141 | "torch.Size([128, 26, 30])" 1142 | ] 1143 | }, 1144 | "execution_count": 37, 1145 | "metadata": {}, 1146 | "output_type": "execute_result" 1147 | } 1148 | ], 1149 | "source": [ 1150 | "attns.shape" 1151 | ] 1152 | }, 1153 | { 1154 | "cell_type": "code", 1155 | "execution_count": 38, 1156 | "metadata": {}, 1157 | "outputs": [ 1158 | { 1159 | "data": { 1160 | "text/plain": [ 1161 | "torch.Size([128, 26])" 1162 | ] 1163 | }, 1164 | "execution_count": 38, 1165 | "metadata": {}, 1166 | "output_type": "execute_result" 1167 | } 1168 | ], 1169 | "source": [ 1170 | "preds = probs.max(2)[1]\n", 1171 | "preds.shape" 1172 | ] 1173 | }, 1174 | { 1175 | "cell_type": "code", 1176 | "execution_count": 39, 1177 | "metadata": {}, 1178 | "outputs": [ 1179 | { 1180 | "name": "stdout", 1181 | "output_type": "stream", 1182 | "text": [ 1183 | "source: Qu' est -ce qui rend la vie agréable à _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ ces jours - ci ? xxeos\n", 1184 | "actual: xxbos What 's good about living in _ _ _ _ _ _ _ _ _ _ these days ? xxeos \n", 1185 | "pred: xxbos What is in about this in these _ _ _ _ _ _ _ _ _ _ _ _ xxeos \n", 1186 | "\n", 1187 | "source: Quels   sont   les   deux   signaux   à   bras   uniquement   pour   les   xxunk   à   xxunk   ? xxeos \n", 1188 | "actual: xxbos Which   2   hand   signals   are   for   xxunk   xxunk   only ? xxeos \n", 1189 | "pred: xxbos What                     the the ? ? be ? be xxeos \n", 1190 | "\n", 1191 | "source: Quel est le lien entre : [ nom du répondant ] [ ( xxunk , âge ) ] et : [ nom du répondant ] ? xxeos \n", 1192 | "actual: xxbos What is the relationship of : [ respondent name ] [ ( Sex , Age ) ] to : [ respondent name ] ? xxeos\n", 1193 | "pred: xxbos What is the name between the the between between of ? ] ] ] and ) of ? the the ? ] ? ? xxeos\n", 1194 | "\n", 1195 | "source: Si le travailleur est debout , l’ objet à soulever xxunk - t - il moins de 23 kg ( 51 xxunk ) ? xxeos \n", 1196 | "actual: xxbos When standing , is the weight of the object lifted less than 23 kg / 51 xxunk ? xxeos \n", 1197 | "pred: xxbos What is does does the purpose of a following ( ? ? a ? ? ? ? ? xxeos \n", 1198 | "\n", 1199 | "source: Qu’ en sera - t - il de ma vie , de nos vies , dans un avenir proche et à long terme ? xxeos \n", 1200 | "actual: xxbos What will be my life , our lives , in the foreseeable future and in the long term ? xxeos \n", 1201 | "pred: xxbos What will be our , , in , in in the future and and the a next - ? xxeos \n", 1202 | "\n", 1203 | "source: Qu' arrive - t - il si l' auteur est sans le xxunk et si j' ai fabriqué le disque pour lui ? xxeos \n", 1204 | "actual: xxbos What happens if the guy is xxunk or if I manufactured the record for him ? xxeos \n", 1205 | "pred: xxbos What if if I I is not and not I not ? documents ? the ? xxeos \n", 1206 | "\n", 1207 | "source: Quand ils ont xxunk l’ école et s’ en vont à l’ université , qu’ est -ce que ça leur donne ? xxeos \n", 1208 | "actual: xxbos When they leave school and they do go to university , where are they at ? xxeos \n", 1209 | "pred: xxbos When , , , , what what to ? what ? and ? there ? ? xxeos \n", 1210 | "\n", 1211 | "source: Quand ils ont xxunk l' école et s' en vont à l' université , qu' est -ce que ça leur donne ? xxeos \n", 1212 | "actual: xxbos When they leave school and they do go to university , where are they at ? xxeos \n", 1213 | "pred: xxbos When , , , , what what to ? what ? and ? there ? ? xxeos \n", 1214 | "\n", 1215 | "source: Quel rôle le thème de la « traçabilité » joue - t - il dans les Etats membres à cet égard ? xxeos \n", 1216 | "actual: xxbos What role does \" xxunk \" play in this regard in member states ? xxeos \n", 1217 | "pred: xxbos What role does the play play play in the regard ? the ? ? xxeos \n", 1218 | "\n", 1219 | "source: Qu' est -ce que c' est que ça vient à voir que ça xxunk 1 000 000 ou 200 000 000 ? xxeos \n", 1220 | "actual: xxbos What difference does it make whether it xxunk xxunk or xxunk ? xxeos \n", 1221 | "pred: xxbos What does would it might that or or or and do do xxeos \n", 1222 | "\n", 1223 | "source: Qu’ a - t - on donné à Horus qui l’ xxunk à la dignité de roi de toute l’ Égypte ? xxeos \n", 1224 | "actual: xxbos What was Horus given that elevated him to the position of king of all Egypt ? xxeos \n", 1225 | "pred: xxbos What was the to to the can to find best of the ? the ? ? xxeos \n", 1226 | "\n", 1227 | "source: Que se passe -t -il dans le cas de mon enfant qui a 25 ans et qui xxunk à temps plein ? xxeos \n", 1228 | "actual: xxbos What about my child who is 25 and in school full time ? xxeos \n", 1229 | "pred: xxbos What if my child 's has required year the a ? ? ? xxeos \n", 1230 | "\n", 1231 | "source: Où est -ce que l' entretien devrait se dérouler ( à la maison , au bureau , dans un café ) ? xxeos \n", 1232 | "actual: xxbos Where should the interview be held ? xxeos \n", 1233 | "pred: xxbos Where should the I be in in xxeos \n", 1234 | "\n", 1235 | "source: Que voulez - vous dire par « décomprimer les fichiers » à l' étape d ) de la question 6 ? xxeos \n", 1236 | "actual: xxbos What do you mean by \" expand the files \" in step d of Question 6 ? xxeos \n", 1237 | "pred: xxbos What do you mean by \" building \" \" \" of the to \" the ? ? xxeos \n", 1238 | "\n", 1239 | "source: Qu’ arrive - t - il aux fonds qui sont reçus pour le projet et qui ne sont pas utilisés ? xxeos \n", 1240 | "actual: xxbos What happens to project funds that are received but not used ? xxeos \n", 1241 | "pred: xxbos What happens if the if for the people and and and ? xxeos \n", 1242 | "\n", 1243 | "source: Qu’ est -ce qui est exigé , qu’ est -ce qui est xxunk , qu’ est -ce qui est privilégié ? xxeos \n", 1244 | "actual: xxbos What is required , what is desired , what is preferred ? xxeos \n", 1245 | "pred: xxbos What is it , what is the ? for is it ? xxeos \n", 1246 | "\n", 1247 | "source: Quel pourcentage s' attend à ne pas avoir d' enfants , à en avoir un , deux , ou plus ? xxeos \n", 1248 | "actual: xxbos What percentage expect to have no children , one child , two children or more than two ? xxeos \n", 1249 | "pred: xxbos What percentage of of be , a or or or or or or or not groups ? ? xxeos \n", 1250 | "\n", 1251 | "source: Qu' adviendra - t - il de ma pension de la fonction publique si je n' ai pas de survivants ? xxeos \n", 1252 | "actual: xxbos What happens to my Public Service Superannuation Pension if I have no survivors ? xxeos \n", 1253 | "pred: xxbos What will if my my Service if if if you not not documents ? xxeos \n", 1254 | "\n", 1255 | "source: Comment appelle - t - on ce qu’ on peut voir , qui ressemble à des xxunk ( condensation ) ? xxeos \n", 1256 | "actual: xxbos What is this appearance of xxunk called ( condensation ) ? xxeos \n", 1257 | "pred: xxbos What is it ( ( the , , life ) ? xxeos \n", 1258 | "\n", 1259 | "source: Quand devrait - il être déposé ( après 3 , 5 ou 7 ans ou à d' autres moments ) ? xxeos \n", 1260 | "actual: xxbos When would it be due ( after 3 / 5 / 7 years ; other ) ? xxeos \n", 1261 | "pred: xxbos When should you be the to or , , years years years years ? ? ? ? xxeos \n", 1262 | "\n", 1263 | "source: Qu’ arrive - t - il si j’ oublie quelque chose à bord de l’ avion ou à l’ aéroport ? xxeos \n", 1264 | "actual: xxbos What if I forget something on the airplane or in the airport ? xxeos \n", 1265 | "pred: xxbos What happens I am to to the or or the the Canadian ? xxeos \n", 1266 | "\n", 1267 | "source: Quelle est le véritable sens de xxunk ( « xxunk à bord » ou « fret à bord » ) ? xxeos \n", 1268 | "actual: xxbos What does xxunk ( for \" free on board \" or \" freight on board \" ) actually mean ? xxeos \n", 1269 | "pred: xxbos What is the \" \" or or ) \" ) ) \" ) \" the \" ? ? ? ? xxeos \n", 1270 | "\n", 1271 | "source: Qu’ arrive - t - il si je m’ xxunk , mais que je ne peux pas participer au programme ? xxeos \n", 1272 | "actual: xxbos What if I register and can not attend ? xxeos \n", 1273 | "pred: xxbos What if I do to how I participate the xxeos \n", 1274 | "\n", 1275 | "source: Qu' arrive - t - il à ma pension si je retourne 
", 1276 | " à l' emploi de la fonction publique ? xxeos \n", 1277 | "actual: xxbos What happens to my pension if I rejoin the Public Service ? xxeos \n", 1278 | "pred: xxbos What if if my Public if I am from Public Service ? xxeos \n", 1279 | "\n", 1280 | "source: Qu' arrive - t - il si les enfants ne sont pas assis dans un siège d' xxunk pour enfants ? xxeos \n", 1281 | "actual: xxbos What is the safest seat for my child ? xxeos \n", 1282 | "pred: xxbos What if the if if if a children ? xxeos \n", 1283 | "\n", 1284 | "source: Qu’ arrivera - t - il si le xxunk de l’ Atlantique est ajouté à la liste de la LEP ? xxeos \n", 1285 | "actual: xxbos What will happen if Atlantic xxunk is added to the SARA List ? xxeos \n", 1286 | "pred: xxbos What will happen if the Line is added to the SARA List ? xxeos \n", 1287 | "\n", 1288 | "source: Qu' arrive - t - il si l' affichage ne contient pas les détails et ressemble à ce qui suit ? xxeos \n", 1289 | "actual: xxbos What happens if there are no details xxunk and your window looks like this ? xxeos \n", 1290 | "pred: xxbos What happens if the is no most in and the program ? ? ? ? xxeos \n", 1291 | "\n", 1292 | "source: Qu' est -ce que la carte M / A / N [ M / A / N Map ] ? xxeos \n", 1293 | "actual: xxbos What is the M / A / xxunk Map ? xxeos \n", 1294 | "pred: xxbos What is / / / / or or use use xxeos \n", 1295 | "\n", 1296 | "source: Qu' en serait - il si ce projet de loi traitait non pas des sénateurs , mais des juges ? xxeos \n", 1297 | "actual: xxbos What if this bill xxunk not of senators , but of judges ? xxeos \n", 1298 | "pred: xxbos What about the bill is be be the ? if ? the ? xxeos \n", 1299 | "\n", 1300 | "source: Quel organisme serait le plus apte dans ce rôle , Téléfilm , la CCC , la BDC , EDC ? xxeos \n", 1301 | "actual: xxbos What agency would make the most sense - Telefilm , CCC , BDC , EDC ? xxeos \n", 1302 | "pred: xxbos What is do you a more effective of disease in in in in in in ? xxeos \n", 1303 | "\n", 1304 | "source: Qu' entend - on par « données - échantillon ( 20 % ) » dans le contexte du Recensement ? xxeos \n", 1305 | "actual: xxbos What does 20 % sample data mean in the Census ? xxeos \n", 1306 | "pred: xxbos What is a in in in in ? the context of xxeos \n", 1307 | "\n", 1308 | "source: Que dois - je faire si je n' ai pas accès au site Web de déclaration ( xxunk ) ? xxeos \n", 1309 | "actual: xxbos What do I do if I can not access the electronic data reporting ( EDR ) web site ? xxeos \n", 1310 | "pred: xxbos What do I do if I have not have to tax amount ? ? criterion ) ? ? ? xxeos \n", 1311 | "\n", 1312 | "source: Pourquoi R & D pour la défense Canada donne - t - elle tant de place à la collaboration ? xxeos \n", 1313 | "actual: xxbos Why does Defence R&D Canada place so much importance on collaboration ? xxeos \n", 1314 | "pred: xxbos Why did Canada Canada Canada mean to much ? ? ? ? xxeos \n", 1315 | "\n", 1316 | "source: Qu’ entend - on par « xxunk acceptable d’ études , de formation et / ou d’ expérience » ? xxeos \n", 1317 | "actual: xxbos What is \" an acceptable combination of education , training and / or experience \" ? xxeos \n", 1318 | "pred: xxbos What is a training training , , training or or and or or why ? ? xxeos \n", 1319 | "\n", 1320 | "source: Quels sont les objectifs gà © nà © xxunk du Canada dans les nà © gociations sur l' AGCS ? xxeos \n", 1321 | "actual: xxbos What are Canada 's objectives in the GATS negotiations ? xxeos \n", 1322 | "pred: xxbos What are the objectives objectives for the application ? ? xxeos \n", 1323 | "\n", 1324 | "source: Qu’ a - t - il besoin d’ apprendre pour être capable d’ utiliser la langue à ces fins ? xxeos \n", 1325 | "actual: xxbos What will they need to learn in order to [ do what they want ] ? xxeos \n", 1326 | "pred: xxbos What do be need to be to the to be these ] ] ] to ? xxeos \n", 1327 | "\n", 1328 | "source: Que peut - il arriver à une tortue luth si elle mange un ballon ou un sac d' épicerie ? xxeos \n", 1329 | "actual: xxbos What can happen to a leatherback turtle if it eats a balloon or a grocery bag ? xxeos \n", 1330 | "pred: xxbos What if be if a if if if a is or priority or priority priority ? ? xxeos \n", 1331 | "\n", 1332 | "source: Quel rôle l’ enseignant se donne - t - il dans la mise en œuvre d’ une telle démarche ? xxeos \n", 1333 | "actual: xxbos What is the teacher ’s role in pursuing such aims ? xxeos \n", 1334 | "pred: xxbos What role the role of role in the ? a ? xxeos \n", 1335 | "\n", 1336 | "source: < B > xxunk : < / B > Quand le service CASCADE sera - t - il disponible ? xxeos \n", 1337 | "actual: xxbos When will the CASCADE service be available ? xxeos \n", 1338 | "pred: xxbos What will the new be be available ? xxeos \n", 1339 | "\n", 1340 | "source: Qu' en pensez - vous et comment le Canada contribuera - t - il au renouvellement de l' OTAN ? xxeos \n", 1341 | "actual: xxbos What are your views on this and how will Canada contribute to NATO 's renewal ? xxeos \n", 1342 | "pred: xxbos What is the opinion on the and how do they be to the ? ? ? xxeos \n", 1343 | "\n", 1344 | "source: Pourquoi le père pense - t - il que Jimmy est xxunk quand l’ ours entre dans leur camp ? xxeos \n", 1345 | "actual: xxbos Why does Father think Jimmy is xxunk when the bear enters their camp ? xxeos \n", 1346 | "pred: xxbos Why does the when the in the in I EU ? ? ? ? xxeos \n", 1347 | "\n", 1348 | "source: En quoi l' emploi s' est - il accru au sein du groupe cible par rapport au groupe témoin ? xxeos \n", 1349 | "actual: xxbos What is the increase in employment of the target group compared to the control group ? xxeos \n", 1350 | "pred: xxbos What is the group for the for the group group in in the exhibition ? ? xxeos \n", 1351 | "\n", 1352 | "source: À qui dois - je m' adresser si j' ai une plainte en matière de droits de la personne ? xxeos \n", 1353 | "actual: xxbos Who should I contact if I have a human rights complaint ? xxeos \n", 1354 | "pred: xxbos Who do I contact if I have a person rights ? ? xxeos \n", 1355 | "\n", 1356 | "source: À titre d’ xxunk , d’ un point de vue éthique , que diriez - vous à ces gens ? xxeos \n", 1357 | "actual: xxbos Who do you think is right ? xxeos \n", 1358 | "pred: xxbos What do you think of a to xxeos \n", 1359 | "\n", 1360 | "source: Que se passe -t -il quand le père boit Quels sont les effets de l' alcool sur le fœtus ? xxeos \n", 1361 | "actual: xxbos What Happens When the Father Drinks ? xxeos \n", 1362 | "pred: xxbos What happens when the shall of the xxeos \n", 1363 | "\n", 1364 | "source: • Le profil Bath : qu' est -ce que c' est et pourquoi devrais - je m' y intéresser ? xxeos \n", 1365 | "actual: xxbos What is it and why should I care ? xxeos \n", 1366 | "pred: xxbos What are the , why is I be ? xxeos \n", 1367 | "\n", 1368 | "source: Qu’ arrive - t - il si je ne xxunk pas ma décision d’ ici le 1er décembre 2005 ? xxeos \n", 1369 | "actual: xxbos What happens if I do n’t submit my decision by December 1 , 2005 ? xxeos \n", 1370 | "pred: xxbos What happens if I do n't find my options ? the ? ? ? ? xxeos \n", 1371 | "\n", 1372 | "source: Que risque - t - il d’ arriver si on a pas pris soin de xxunk les droits ? xxeos \n", 1373 | "actual: xxbos What might the consequences of not clearing rights be ? xxeos \n", 1374 | "pred: xxbos What happens happen general if all if if if if xxeos \n", 1375 | "\n", 1376 | "source: Par exemple , sous quel angle abordera - t - on la question des arts dans cette étude ? xxeos \n", 1377 | "actual: xxbos Where , for instance , do the arts xxunk into the study ? xxeos \n", 1378 | "pred: xxbos Where , in the , is you engagement , I the new ? xxeos \n", 1379 | "\n", 1380 | "source: Qu’ est -ce qui en est la cause ( manque de chercheurs , manque de fonds … ) ? xxeos \n", 1381 | "actual: xxbos Why ? xxeos \n", 1382 | "pred: xxbos What is xxeos \n", 1383 | "\n", 1384 | "source: Pourquoi l' ARC ne m' a - t - elle pas envoyé mon remboursement de TPS / TVH ? xxeos \n", 1385 | "actual: xxbos Why is the CRA holding my GST / HST refund ? xxeos \n", 1386 | "pred: xxbos Why does the GST GST GST GST / HST credit ? xxeos \n", 1387 | "\n", 1388 | "source: Qu' arrive - t - il à la rupture d' un mariage ou d' une union de fait ? xxeos \n", 1389 | "actual: xxbos What would happen if there was a breakdown of a marriage or a common - law partnership ? xxeos \n", 1390 | "pred: xxbos What happens happen if a is a permit or a joint or a Corporate ? Corporate ? ? xxeos \n", 1391 | "\n", 1392 | "source: Que voulons - nous réaliser , à long terme , en ce qui concerne la gestion des pêches ? xxeos \n", 1393 | "actual: xxbos What do we want to achieve in fisheries management over the long term ? xxeos \n", 1394 | "pred: xxbos What do we want to know in the , of the next - ? xxeos \n", 1395 | "\n", 1396 | "source: Qu' arrive t il si ma banque n' est pas disposée à m' offrir un fonds de roulement ? xxeos \n", 1397 | "actual: xxbos What if my bank is not willing to provide Working Capital ? xxeos \n", 1398 | "pred: xxbos What happens my child is not a to be a ? ? xxeos \n", 1399 | "\n", 1400 | "source: Quelle expérience a -t -on du travail et de la vie dans les sites de mise en oeuvre ? xxeos \n", 1401 | "actual: xxbos What is the experience of working and living at the implementation sites ? xxeos \n", 1402 | "pred: xxbos What was the difference and the and the in the time of ? xxeos \n", 1403 | "\n", 1404 | "source: Que va - t - il se passer quand j' arrête ou je xxunk l' usage du tabac ? xxeos \n", 1405 | "actual: xxbos What 's going to happen when I quit or cut back ? xxeos \n", 1406 | "pred: xxbos What happens when when happen when I do ? capture ? ? xxeos \n", 1407 | "\n", 1408 | "source: Qu’ est -ce qui aurait pu être fait durant l’ évaluation pour vous mettre plus à l’ aise ? xxeos \n", 1409 | "actual: xxbos What could have been done in the evaluation to make it more comfortable for you ? xxeos \n", 1410 | "pred: xxbos What would have been done to this most ? address the ? ? ? ? ? xxeos \n", 1411 | "\n", 1412 | "source: Pourquoi est -ce entre 1997 et 2003 que le coût des soins de santé a augmenté le plus ? xxeos \n", 1413 | "actual: xxbos Why the larger increase in health care costs between 1997 and 2003 ? xxeos \n", 1414 | "pred: xxbos Why is the - - the care and ? the and the ? xxeos \n", 1415 | "\n", 1416 | "source: Quels sont les pays qui ont adhéré à l’ UE depuis la création de la xxunk en 1998 ? xxeos \n", 1417 | "actual: xxbos Which countries have joined the EU since the ECB was established in 1998 ? xxeos \n", 1418 | "pred: xxbos Which countries have been the EU in the EU of ? ? the ? xxeos \n", 1419 | "\n", 1420 | "source: Qui dà © termine qu' une espèce est  « en pà © ril  » au Canada ? xxeos \n", 1421 | "actual: xxbos Who decides which species are \" at risk \" in Canada ? xxeos \n", 1422 | "pred: xxbos Who is a a in in in risk \" ? the ? xxeos \n", 1423 | "\n", 1424 | "source: Pourquoi ces policiers étaient - ils présents dans le « fer à cheval » cette soirée - là ? xxeos \n", 1425 | "actual: xxbos Why were these police officers present at the \" horseshoe \" on that evening ? xxeos \n", 1426 | "pred: xxbos Why were these \" that that in the present present \" ? the ? ? xxeos \n", 1427 | "\n", 1428 | "source: Qu' est -ce que les xxunk aux doigts du roi xxunk xxunk ont à voir avec les trains ? xxeos \n", 1429 | "actual: xxbos What do King xxunk xxunk 's xxunk xxunk have to do with trains ? xxeos \n", 1430 | "pred: xxbos What do the have with have production with be been be with the ? xxeos \n", 1431 | "\n", 1432 | "source: Quelle idée , opinion ou point de vue cette chanson exprime - t - elle sur cette question ? xxeos \n", 1433 | "actual: xxbos What insight , opinion or point of view does the song express about this issue ? xxeos \n", 1434 | "pred: xxbos What opinion , opinion , opinion of the does this Commission have about this issue ? xxeos \n", 1435 | "\n", 1436 | "source: Et dans l' ensemble , quelle opinion croyez - vous qu' elle avait de l' xxunk - maître ? xxeos \n", 1437 | "actual: xxbos What do you think over - all , what her opinion was of the xxunk ? xxeos \n", 1438 | "pred: xxbos What about you think about the the the in was ’s of the the best use xxeos \n", 1439 | "\n", 1440 | "source: Que s’ est il passé au cours de ces sept années et xxunk pour que cela se produise ? xxeos \n", 1441 | "actual: xxbos What occurred in those 7 - 1 / 2 years to let this happen ? xxeos \n", 1442 | "pred: xxbos What happened do this have and these and these and ? this ? ? ? xxeos \n", 1443 | "\n", 1444 | "source: Que faut - il faire en cas de contact xxunk direct avec des xxunk à base de BPC ? xxeos \n", 1445 | "actual: xxbos What would happen if I accidentally came into direct contact with a PCB fluid ? xxeos \n", 1446 | "pred: xxbos What if be to I do to to the a ? the risk Weather ? xxeos \n", 1447 | "\n", 1448 | "source: Pour quelles raisons utilise - t - on ce terme plutôt que celui d ' « égalité » ? xxeos \n", 1449 | "actual: xxbos What is the reason for the use of the concept of \" equity \" rather than of \" equality \" ? xxeos \n", 1450 | "pred: xxbos What is the term for the key of the \" of the not \" ? ? the the under \" ? xxeos \n", 1451 | "\n", 1452 | "source: Qu' arrive - il si je ne xxunk pas d' alcool , de drogues ou d' autres substances ? xxeos \n", 1453 | "actual: xxbos What if I do n't use alcohol or illicit drugs ? xxeos \n", 1454 | "pred: xxbos What if I do n't have or or a ? ? xxeos \n", 1455 | "\n", 1456 | "source: xxunk , pourquoi as - tu de longues xxunk et une grosse bosse de muscles sur le dos ? xxeos \n", 1457 | "actual: xxbos Why do xxunk have such big xxunk and xxunk xxunk muscles ? xxeos \n", 1458 | "pred: xxbos What do you you you a and and why institutions be ? xxeos \n", 1459 | "\n", 1460 | "source: A -t -on élaboré un plan pour les biens essentiels à la mission ayant un cycle de vie ? xxeos \n", 1461 | "actual: xxbos What progress has been made to identify these assets and their operational cost ? xxeos \n", 1462 | "pred: xxbos What is are been made to date - ? ? how country ? ? xxeos \n", 1463 | "\n", 1464 | "source: Comment faire si je perds ou si j' oublie mon numéro d' xxunk et mon mot de passe ? xxeos \n", 1465 | "actual: xxbos What if I forget or xxunk my User ID and password ? xxeos \n", 1466 | "pred: xxbos What if I do my my my child ? ? my ? xxeos \n", 1467 | "\n", 1468 | "source: Pourquoi recueille - t - on si peu d' information sur les enfants de moins de 15 ans ? xxeos \n", 1469 | "actual: xxbos Why is there so little information collected about people under 15 ? xxeos \n", 1470 | "pred: xxbos Why are Information so information Information on ? children ? the - xxeos \n", 1471 | "\n", 1472 | "source: Sommes - nous capables - en tant qu' êtres humains - de vivre en paix avec les autres ? xxeos \n", 1473 | "actual: xxbos What does it mean to live in xxunk with nature ? xxeos \n", 1474 | "pred: xxbos What are it do to be in the other a of xxeos \n", 1475 | "\n", 1476 | "source: Quelle a à © tà © la dà © xxunk du Tribunal du commerce international à ce sujet ? xxeos \n", 1477 | "actual: xxbos What was the CIT ’s ruling on this ? xxeos \n", 1478 | "pred: xxbos What was the last of last of this issue xxeos \n", 1479 | "\n", 1480 | "source: Comment cela xxunk - t - il la capacité de l’ océan à agir comme puits de xxunk ? xxeos \n", 1481 | "actual: xxbos Which plankton species are at risk ? xxeos \n", 1482 | "pred: xxbos What is is is to risk of xxeos \n", 1483 | "\n", 1484 | "source: Pourquoi ne pas , d' entrée de jeu , soumettre de bons projets de lois à la Chambre ? xxeos \n", 1485 | "actual: xxbos Why not just bring good legislation to the House in the first place ? xxeos \n", 1486 | "pred: xxbos Why not human human to to to the old of the safety of ? xxeos \n", 1487 | "\n", 1488 | "source: Par le choix de ces couleurs , sur quoi l' artiste a -t -il voulu mettre l' accent ? xxeos \n", 1489 | "actual: xxbos What has been emphasized using these colours ? xxeos \n", 1490 | "pred: xxbos What is been the on the on ? xxeos \n", 1491 | "\n", 1492 | "source: Quel est le nombre total de personnes de 0 à 14 ans , de sexes masculin et féminin ? xxeos \n", 1493 | "actual: xxbos What is the total number of males and females 15–19 years of age ? xxeos \n", 1494 | "pred: xxbos What is the total total of age and age years years ? age ? xxeos \n", 1495 | "\n", 1496 | "source: Que peut - on et doit - on faire en premier lieu pour éviter le problème des réfugiés ? xxeos \n", 1497 | "actual: xxbos What can and should be done to stop people becoming refugees in the first place ? xxeos \n", 1498 | "pred: xxbos What can be do be done to avoid the with the ? the next place ? xxeos \n", 1499 | "\n", 1500 | "source: Selon vous , quel impact le xxunk a -t -il sur l' industrie du porc au Manitoba ? xxeos \n", 1501 | "actual: xxbos What do you see as the impact of xxunk on the xxunk industry in Manitoba ? xxeos \n", 1502 | "pred: xxbos What do you see as the impact on the on the term program ? the ? xxeos \n", 1503 | "\n", 1504 | "source: Qu’ arrivera - t - il si le gouvernement ne change pas pendant que la population change ? xxeos \n", 1505 | "actual: xxbos What will happen if government does not change while the population does ? xxeos \n", 1506 | "pred: xxbos What will happen if the do not change in the population ? not xxeos \n", 1507 | "\n", 1508 | "source: Avez - vous d’ autres idées qui pourraient nous aider à mieux utiliser le budget du DPP ? xxeos \n", 1509 | "actual: xxbos What other ideas do you have to make maximum use of the PLR budget ? xxeos \n", 1510 | "pred: xxbos What other ideas do you have to use the use of the budget budget ? xxeos \n", 1511 | "\n", 1512 | "source: Qu’ est‑ce qui est exigé , qu’ est‑ce qui est xxunk , qu’ est‑ce qui est privilégié ? xxeos \n", 1513 | "actual: xxbos What is required , what is desired , what is preferred ? xxeos \n", 1514 | "pred: xxbos What is the to if is the ? if is the ? xxeos \n", 1515 | "\n", 1516 | "source: Quel est le rôle du CCI s' il n' y a pas de décision financière à prendre ? xxeos \n", 1517 | "actual: xxbos What is the role of the IAB if there are no funding decisions to be made ? xxeos \n", 1518 | "pred: xxbos What is the role of the program that I is not decision ? ? the ? ? xxeos \n", 1519 | "\n", 1520 | "source: Quand les régimes enregistrés d' épargne - retraite ( REER ) doivent - ils venir à échéance ? xxeos \n", 1521 | "actual: xxbos When do you have to mature your registered retirement savings plan ( RRSP ) ? xxeos \n", 1522 | "pred: xxbos When do the think to pay a GATS ? ? ? ? up ) ? xxeos \n", 1523 | "\n", 1524 | "source: Où se trouve mon « revenu total » selon la ligne 150 de ma déclaration de revenus ? xxeos \n", 1525 | "actual: xxbos Where do I find the \" total income \" amount I reported on line 150 of my tax return ? xxeos \n", 1526 | "pred: xxbos Where do I find my income my \" \" amount \" entered to my ? ? my tax return ? xxeos \n", 1527 | "\n", 1528 | "source: Que se passe -t -il si vous avez produit le formulaire xxunk ou xxunk ( xxunk ) ? xxeos \n", 1529 | "actual: xxbos What if you filed Form xxunk or xxunk ( Seniors ) ? xxeos \n", 1530 | "pred: xxbos What happens you have the had profit profit I jeopardy ) ? xxeos \n", 1531 | "\n", 1532 | "source: Pourquoi la banque n’ a - t - elle pas retenu assez d’ impôt à la source ? xxeos \n", 1533 | "actual: xxbos Why did the bank not withhold enough tax ? xxeos \n", 1534 | "pred: xxbos Why does n’t not not not the ? ? xxeos \n", 1535 | "\n", 1536 | "source: Pourquoi la banque n' a - t - elle pas retenu assez d' impôt à la source ? xxeos \n", 1537 | "actual: xxbos Why did the bank not withhold enough tax ? xxeos \n", 1538 | "pred: xxbos Why does n't not not take the to ? xxeos \n", 1539 | "\n", 1540 | "source: Qu’ est -ce qui a bien marché et qu’ est -ce qui n’ a pas bien marché ? xxeos \n", 1541 | "actual: xxbos What has worked well , what has n’t worked well ? xxeos \n", 1542 | "pred: xxbos What has been had ? what is been been and ? xxeos \n", 1543 | "\n", 1544 | "source: Quel est le nom du ( ou des ) xxunk ) que vous preniez à cette époque ? xxeos \n", 1545 | "actual: xxbos What was the name of the medicine ( s ) you were taking at that time ? xxeos \n", 1546 | "pred: xxbos What is the name of the name - or ) or that that this this , ? xxeos \n", 1547 | "\n", 1548 | "source: Pourquoi le mercure représente - t - il un danger pour la santé et pour l' environnement ? xxeos \n", 1549 | "actual: xxbos Why is mercury considered a health and environmental problem ? xxeos \n", 1550 | "pred: xxbos Why is the a for health and how project ? xxeos \n", 1551 | "\n", 1552 | "source: Qu' est -ce que le thimérosal et pourquoi l' utilise - t - on dans les vaccins ? xxeos \n", 1553 | "actual: xxbos What is thimerosal and why is it used in vaccines ? xxeos \n", 1554 | "pred: xxbos What is self and why is it used ? ? ? xxeos \n", 1555 | "\n", 1556 | "source: Quel sera le volume ou la charge de travail ( W ) de la salle de radiographie ? xxeos \n", 1557 | "actual: xxbos What will be / is the workload ( xxunk ) of the x - ray unit ? xxeos \n", 1558 | "pred: xxbos What will the the ( the appropriate of placement ) ? the Program ? use ? ? xxeos \n", 1559 | "\n", 1560 | "source: Quels changements ou xxunk ( s’ il y a lieu ) ont été apportés à vos activités ? xxeos \n", 1561 | "actual: xxbos What changes or course xxunk , if any , have been made to your activities ? xxeos \n", 1562 | "pred: xxbos What changes has made has you if any , would been made to improve change ? xxeos \n", 1563 | "\n", 1564 | "source: À qui demanderez - vous de l’ aide et quel type d’ aide pouvez - vous demander ? xxeos \n", 1565 | "actual: xxbos Who will you ask for help and what type of help can you ask for ? xxeos \n", 1566 | "pred: xxbos Who can you ask for help and what type of help ? you ask ? ? xxeos \n", 1567 | "\n", 1568 | "source: Que leur demanderez - vous de faire : vous distraire , vous encourager , faire une marche ? xxeos \n", 1569 | "actual: xxbos What will you ask them to do - distract you , encourage you , go for a walk ? xxeos \n", 1570 | "pred: xxbos What do you do to to do that what - ? in ? ? in ? ? ? ? xxeos \n", 1571 | "\n", 1572 | "source: Comment expliquer qu' il y ait eu autant de crises d' endettement en si peu de temps ? xxeos \n", 1573 | "actual: xxbos Why have there been so many debt crises in such a short time ? xxeos \n", 1574 | "pred: xxbos What is there so so many audits ? ? the a situation - ? xxeos \n", 1575 | "\n", 1576 | "source: Que se passera - t - il si je ne peux pas trouver d’ endroit où aller ? xxeos \n", 1577 | "actual: xxbos What if I ca n’t make it xxunk ? xxeos \n", 1578 | "pred: xxbos What happens I do n't find the ? ? xxeos \n", 1579 | "\n", 1580 | "source: Quels sont les symptômes de la MCJ xxunk et comment la maladie évolue - t - elle ? xxeos \n", 1581 | "actual: xxbos What are the symptoms and disease course of xxunk CJD ? xxeos \n", 1582 | "pred: xxbos What are the symptoms of how of of the infection and xxeos \n", 1583 | "\n", 1584 | "source: Quels effets la marihuana peut - elle avoir sur la santé ( Santé Canada , 1998 ) ? xxeos \n", 1585 | "actual: xxbos What are the potential health effects of marijuana ( Health Canada , 1998 ) ? xxeos \n", 1586 | "pred: xxbos What can the health health effects of Health ( Health Canada ) and , ? xxeos \n", 1587 | "\n", 1588 | "source: Qu' est -ce qui fait en sorte qu' un virus de l' influenza de type A change ? xxeos \n", 1589 | "actual: xxbos What causes an influenza A virus to change ? xxeos \n", 1590 | "pred: xxbos What is of influenza virus or ? be ? xxeos \n", 1591 | "\n", 1592 | "source: Pourquoi le Canada a -t -il prescrit une mise à jour de la Loi sur la quarantaine ? xxeos \n", 1593 | "actual: xxbos Why did Canada xxunk an updated xxunk Act ? xxeos \n", 1594 | "pred: xxbos Why has Canada have the Yukon illegal illegal ? xxeos \n", 1595 | "\n", 1596 | "source: Qu’ arrivera - t - il si je ne prends pas tous mes médicaments contre la tuberculose ? xxeos \n", 1597 | "actual: xxbos What if I do n’t take all my TB drugs ? xxeos \n", 1598 | "pred: xxbos What if I do n’t become my of reason ? ? xxeos \n", 1599 | "\n", 1600 | "source: Qu’ arrive - t - il si une personne n’ xxunk pas à une ou plusieurs séances ? xxeos \n", 1601 | "actual: xxbos What if a person does not attend a session(s ) ? xxeos \n", 1602 | "pred: xxbos What happens a person is n’t or a or or ? xxeos \n", 1603 | "\n", 1604 | "source: • Que pourrait - il se produire si la xxunk d’ une personne au volant s’ xxunk ? xxeos \n", 1605 | "actual: xxbos What might happen if a driver of a car had his or her xxunk \" fall xxunk \" ? xxeos \n", 1606 | "pred: xxbos What if the if the person is the person - ? ? a ? ? ? ? ? ? xxeos \n", 1607 | "\n", 1608 | "source: Que faire si je n' ai pas d' ordonnance du tribunal ou d' accord écrit entre conjoints ? xxeos \n", 1609 | "actual: xxbos What if I do not have a court order or written spousal agreement ? xxeos \n", 1610 | "pred: xxbos What if I do n't or with common or or organization ? ? ? xxeos \n", 1611 | "\n", 1612 | "source: Qu’ est -ce que ces arts vous disent sur la culture des auteurs qui les ont créés ? xxeos \n", 1613 | "actual: xxbos What do these arts tell you about the culture of the xxunk who created them ? xxeos \n", 1614 | "pred: xxbos What do you you have you about the culture of the stakeholders Resource From ? ? xxeos \n", 1615 | "\n", 1616 | "source: Qu’ advient - il si les taxes ne sont pas payées ou ne sont pas xxunk payées ? xxeos \n", 1617 | "actual: xxbos What happens if the fees are not paid or not paid in full ? xxeos \n", 1618 | "pred: xxbos What if if the are are not available or the ? ? the - xxeos \n", 1619 | "\n", 1620 | "source: Où retrouve t on cette observation dans le Rapport sur l’ état du parc ou au forum ? xxeos \n", 1621 | "actual: xxbos Where is this in the State of the Park Report or at the Forum ? xxeos \n", 1622 | "pred: xxbos Where does the located the status for the public or for international the following of xxeos \n", 1623 | "\n", 1624 | "source: Dans ce cas , qui d' autre que la Syrie et l' Iran en xxunk les bénéfices ? xxeos \n", 1625 | "actual: xxbos Who but Syria and Iran would benefit from that ? xxeos \n", 1626 | "pred: xxbos Who else place in what in it from the ? xxeos \n", 1627 | "\n", 1628 | "source: Qui décide désormais de ce qui est xxunk et de ce qui est illicite dans l' xxunk ? xxeos \n", 1629 | "actual: xxbos Who now decides what is legitimate and what is illicit in Islam ? xxeos \n", 1630 | "pred: xxbos Who decides in this is this and what is this ? the ? xxeos \n", 1631 | "\n", 1632 | "source: Que pouvez - vous faire d' autre pour vos clients que la concurrence ne fait pas déjà ? xxeos \n", 1633 | "actual: xxbos What can you do for your customers that your competition is n't already doing ? xxeos \n", 1634 | "pred: xxbos What else you do to your retain do you lives is there ? ? ? xxeos \n", 1635 | "\n", 1636 | "source: Pour quelle raison la glace a -t -elle commencé à xxunk il y a 21 000 ans ? xxeos \n", 1637 | "actual: xxbos Why did the ice begin to xxunk away xxunk years ago ? xxeos \n", 1638 | "pred: xxbos Why has the patient did years obtain live ? ? ? ? xxeos \n", 1639 | "\n", 1640 | "source: Quel est le rôle du Centre des opérations du gouvernement ( COG ) lors d’ une urgence ? xxeos \n", 1641 | "actual: xxbos What is the role of the Government Operations Centre ( GOC ) during an emergency ? xxeos \n", 1642 | "pred: xxbos What is the role of the government of ( ( government ) ? the project ? xxeos \n", 1643 | "\n", 1644 | "source: Qu' arrivera - t - il à ceux qui xxunk une loi quelconque relative à la chasse ? xxeos \n", 1645 | "actual: xxbos What will happen to me if I break any of the hunting laws ? xxeos \n", 1646 | "pred: xxbos What will happen to the to the make the the the EU ? ? xxeos \n", 1647 | "\n", 1648 | "source: Qu' est -ce que le Programme d' aide aux xxunk victimes de la sécheresse ( xxunk ) ? xxeos \n", 1649 | "actual: xxbos What is the xxunk xxunk Assistance Program ( xxunk ) ? xxeos \n", 1650 | "pred: xxbos What is the National Program Program Program ( burden ) ? xxeos \n", 1651 | "\n", 1652 | "source: Qu' en est-il de l' importation ou de l' exportation des BPC en vue de leur élimination ? xxeos \n", 1653 | "actual: xxbos What about importing or exporting PCBs for disposal ? xxeos \n", 1654 | "pred: xxbos What about the that company ? ? ? ? xxeos \n", 1655 | "\n", 1656 | "source: xxunk xxunk Que faire si on n' arrive plus à accéder à un dossier pour le modifier ? xxeos \n", 1657 | "actual: xxbos What do I do if I can no longer access a xxunk to change it ? xxeos \n", 1658 | "pred: xxbos What can I do if I need n't a - to Agency to be ? ? xxeos \n", 1659 | "\n" 1660 | ] 1661 | }, 1662 | { 1663 | "name": "stdout", 1664 | "output_type": "stream", 1665 | "text": [ 1666 | "source: Que se produit - il si la technologie n' est pas disponible ou n' est pas adoptée ? xxeos \n", 1667 | "actual: xxbos What happens if the technology is not available or implemented ? xxeos \n", 1668 | "pred: xxbos What if if the is is not available or the ? xxeos \n", 1669 | "\n", 1670 | "source: Pourquoi envoyer de l’ argent à Bruxelles , pour qu’ il nous soit xxunk par la suite ? xxeos \n", 1671 | "actual: xxbos Why send the money to xxunk , and then get it back again ? xxeos \n", 1672 | "pred: xxbos Why are the future , we we what what ? the ? ? ? xxeos \n", 1673 | "\n", 1674 | "source: Pourquoi l' information météorologique n' a - t - elle pas été mise à jour xxunk xxunk ? xxeos \n", 1675 | "actual: xxbos Why has n't the weather information been updated xxunk xxunk ? xxeos \n", 1676 | "pred: xxbos Why was n’t been information been been made ? ? ? xxeos \n", 1677 | "\n", 1678 | "source: Et xxunk tout , pourquoi ´ ` ` ´ ´ ˆ mettre la province a l’ xxunk ? xxeos \n", 1679 | "actual: xxbos Why are they xxunk the province anyway ? xxeos \n", 1680 | "pred: xxbos Why , the the , parties ? ? xxeos \n", 1681 | "\n", 1682 | "source: Que dois - je faire si j’ ai plus de poissons que ma limite quotidienne à bord ? xxeos \n", 1683 | "actual: xxbos What must I do if I have more than my daily limit on board ? xxeos \n", 1684 | "pred: xxbos What do I do if I need to than my project project ? a ? xxeos \n", 1685 | "\n", 1686 | "source: L’ absence de guerre signifie - t - elle que nous sommes en paix avec les autres ? xxeos \n", 1687 | "actual: xxbos What does it mean to live in xxunk with nature ? xxeos \n", 1688 | "pred: xxbos What does it mean to be in the ? the ? xxeos \n", 1689 | "\n", 1690 | "source: Que faudra - t - il pour les intégrer à l’ équipe de collaboration en santé mentale ? xxeos \n", 1691 | "actual: xxbos What will it take to include them in the collaborative mental health team ? xxeos \n", 1692 | "pred: xxbos What does happen take to be in in the next of health ? ? xxeos \n", 1693 | "\n", 1694 | "source: Qu’ est -ce qu’ une étude des systèmes d’ emploi et que comprend - t - elle ? xxeos \n", 1695 | "actual: xxbos What is an employment systems review and what does it entail ? xxeos \n", 1696 | "pred: xxbos What is a geographical and and and what is it like ? xxeos \n", 1697 | "\n", 1698 | "source: Quel genre d’ amende dois - je payer si on me surprend à boire de la bière ? xxeos \n", 1699 | "actual: xxbos What kind of fine would be given if I am caught with a bottle of beer ? xxeos \n", 1700 | "pred: xxbos What do of I do I paid to I were in ? the visa ? the ? xxeos \n", 1701 | "\n" 1702 | ] 1703 | } 1704 | ], 1705 | "source": [ 1706 | "en_vocab = learn.data.valid_ds.y.vocab\n", 1707 | "fr_vocab = learn.data.valid_ds.x.vocab\n", 1708 | "\n", 1709 | "for i in range(preds.shape[0]):\n", 1710 | " print(f'source: {fr_vocab.textify(x[0][i]).replace(\"xxpad\", \"\")}')\n", 1711 | " print(f'actual: {en_vocab.textify(y[i]).replace(\"xxpad\", \"\")}')\n", 1712 | " print(f'pred: {en_vocab.textify(preds[i]).replace(\"xxpad\", \"\")}\\n')" 1713 | ] 1714 | }, 1715 | { 1716 | "cell_type": "code", 1717 | "execution_count": 40, 1718 | "metadata": {}, 1719 | "outputs": [], 1720 | "source": [ 1721 | "def plot_attns(src, trg, scores):\n", 1722 | " fig, ax = plt.subplots()\n", 1723 | " heatmap = ax.pcolor(scores, cmap='viridis')\n", 1724 | " \n", 1725 | " ax.set_xticklabels(trg, minor=False, rotation='vertical')\n", 1726 | " ax.set_yticklabels(src, minor=False)\n", 1727 | " \n", 1728 | " # put the major ticks at the middel of each cell and the x-ticks on top\n", 1729 | " ax.xaxis.tick_top()\n", 1730 | " ax.set_xticks(np.arange(scores.shape[1]) + 0.5, minor=False)\n", 1731 | " ax.set_yticks(np.arange(scores.shape[0]) + 0.5, minor=False)\n", 1732 | " ax.invert_yaxis()\n", 1733 | " \n", 1734 | " plt.colorbar(heatmap)\n", 1735 | " plt.show()" 1736 | ] 1737 | }, 1738 | { 1739 | "cell_type": "code", 1740 | "execution_count": 41, 1741 | "metadata": {}, 1742 | "outputs": [ 1743 | { 1744 | "data": { 1745 | "image/png": "\n", 1746 | "text/plain": [ 1747 | "

" 1748 | ] 1749 | }, 1750 | "metadata": { 1751 | "needs_background": "light" 1752 | }, 1753 | "output_type": "display_data" 1754 | } 1755 | ], 1756 | "source": [ 1757 | "idx = 5\n", 1758 | "src = [ fr_vocab.itos[i] for i in x[0][idx] ]\n", 1759 | "trg = [ en_vocab.itos[i] for i in preds[idx] ]\n", 1760 | "\n", 1761 | "plot_attns(src[:15], trg[:15], attns[idx,:15,:15])" 1762 | ] 1763 | }, 1764 | { 1765 | "cell_type": "code", 1766 | "execution_count": null, 1767 | "metadata": {}, 1768 | "outputs": [], 1769 | "source": [] 1770 | }, 1771 | { 1772 | "cell_type": "code", 1773 | "execution_count": null, 1774 | "metadata": {}, 1775 | "outputs": [], 1776 | "source": [] 1777 | } 1778 | ], 1779 | "metadata": { 1780 | "kernelspec": { 1781 | "display_name": "Python 3", 1782 | "language": "python", 1783 | "name": "python3" 1784 | }, 1785 | "language_info": { 1786 | "codemirror_mode": { 1787 | "name": "ipython", 1788 | "version": 3 1789 | }, 1790 | "file_extension": ".py", 1791 | "mimetype": "text/x-python", 1792 | "name": "python", 1793 | "nbconvert_exporter": "python", 1794 | "pygments_lexer": "ipython3", 1795 | "version": "3.7.2" 1796 | } 1797 | }, 1798 | "nbformat": 4, 1799 | "nbformat_minor": 2 1800 | } 1801 | --------------------------------------------------------------------------------