├── .texpadtmp
│   ├── publications.aux
│   ├── thesis.aux
│   ├── thesis.bbl
│   ├── thesis.blg
│   ├── thesis.dvi
│   ├── thesis.lof
│   ├── thesis.log
│   ├── thesis.lot
│   ├── thesis.out
│   ├── thesis.synctex.gz
│   └── thesis.toc
├── MACPdef.tex
├── Makefile
├── README.md
├── ack.tex
├── acl.bst
├── chapters
│   ├── background.tex
│   ├── conclusion.tex
│   ├── copynet.tex
│   ├── intro.tex
│   ├── meta.tex
│   ├── nat.tex
│   ├── seg.tex
│   ├── simultrans.tex
│   ├── trainable.tex
│   └── ulr.tex
├── declare.tex
├── dedication.tex
├── defense_v0.12.ppt
├── figs
│   ├── background
│   │   ├── s2s_att.pdf
│   │   ├── transformer.pdf
│   │   └── transformer.png
│   ├── copynet
│   │   ├── Picture1.pdf
│   │   ├── all_ds1.pdf
│   │   ├── all_sum1.pdf
│   │   ├── all_sum1a.pdf
│   │   ├── all_sum2a.pdf
│   │   ├── all_sum2b.pdf
│   │   ├── all_sum3a.pdf
│   │   ├── all_sum3b.pdf
│   │   ├── ds-exp-r2.pdf
│   │   ├── ds-exp-rX.pdf
│   │   ├── ds-exp.pdf
│   │   ├── ds.pdf
│   │   ├── example.pdf
│   │   ├── intro.pdf
│   │   ├── model-b.pdf
│   │   ├── model-c.pdf
│   │   ├── model-x.pdf
│   │   ├── model.pdf
│   │   ├── oov.pdf
│   │   ├── summary.pdf
│   │   ├── summary2.pdf
│   │   ├── summary3.pdf
│   │   ├── summary4.pdf
│   │   ├── summaryX.pdf
│   │   ├── summaryXX.pdf
│   │   ├── summaryXY.pdf
│   │   ├── summaryXY2.pdf
│   │   ├── summaryXYZ.pdf
│   │   ├── syn1.pdf
│   │   └── syn2.pdf
│   ├── hku-eps-converted-to.pdf
│   ├── hku.eps
│   ├── intro
│   │   ├── corpus_size.png
│   │   ├── nmt-history.png
│   │   ├── real_time.jpg
│   │   └── tower_of_babel.jpg
│   ├── meta
│   │   ├── curve.pdf
│   │   ├── fi-en.pdf
│   │   ├── framework.pdf
│   │   ├── illust.pdf
│   │   ├── lv-en.pdf
│   │   ├── ro-en.pdf
│   │   ├── support_size.pdf
│   │   └── tr-en.pdf
│   ├── nat
│   │   ├── NAT-overall.pdf
│   │   ├── NAT-sample-bleu.pdf
│   │   ├── NAT-sub.pdf
│   │   ├── NAT_exp.pdf
│   │   ├── ar_nmt.pdf
│   │   ├── examples1.pdf
│   │   ├── fertility_example.pdf
│   │   ├── fertility_example2.pdf
│   │   ├── learning_curve.pdf
│   │   └── length_vs_latency.pdf
│   ├── seg
│   │   ├── bleu_retrieved.pdf
│   │   ├── cropped_depth.pdf
│   │   ├── example-a.pdf
│   │   ├── example-b.pdf
│   │   ├── example-c.pdf
│   │   ├── example-c2.pdf
│   │   ├── example-d.pdf
│   │   ├── example-d2.pdf
│   │   ├── example-e.pdf
│   │   ├── example-f.pdf
│   │   ├── example-g.pdf
│   │   ├── example-g2.pdf
│   │   ├── example-g3.pdf
│   │   ├── example_dist_train.pdf
│   │   ├── framework1.pdf
│   │   ├── framework2.pdf
│   │   ├── fuzzy_improv.pdf
│   │   ├── fuzzy_vs_num_pair.pdf
│   │   ├── nmt1.pdf
│   │   ├── num_pair_vs_per.pdf
│   │   ├── picture.pdf
│   │   └── search.pdf
│   ├── simultrans
│   │   ├── SNMT.pdf
│   │   ├── crop.pdf
│   │   ├── cropped_k.pdf
│   │   ├── cropped_kkkk.pdf
│   │   ├── decoding.pdf
│   │   ├── example1.pdf
│   │   ├── example_c.pdf
│   │   ├── example_c0.pdf
│   │   ├── example_c2.pdf
│   │   ├── greedy.pdf
│   │   ├── lr_deen_DELAY_vs_BLEU.pdf
│   │   ├── lr_ende_DELAY_vs_BLEU.pdf
│   │   ├── lr_enru_BLEU.pdf
│   │   ├── lr_enru_DELAY.pdf
│   │   ├── lr_enru_DELAY_vs_BLEU.pdf
│   │   ├── lr_enru_WAIT.pdf
│   │   ├── lr_enru_WAIT_vs_BLEU.pdf
│   │   ├── lr_ruen_BLEU.pdf
│   │   ├── lr_ruen_DELAY.pdf
│   │   ├── lr_ruen_DELAY_vs_BLEU.pdf
│   │   ├── lr_ruen_WAIT.pdf
│   │   ├── sim2.pdf
│   │   └── simultaneous.pdf
│   ├── trainable
│   │   ├── Concept.pdf
│   │   ├── beam_bleu.pdf
│   │   ├── beam_perplexity.pdf
│   │   ├── example.pdf
│   │   ├── framework.pdf
│   │   ├── greedy_bleu.pdf
│   │   ├── greedy_perplexity.pdf
│   │   ├── lr_curve.pdf
│   │   ├── onestep.pdf
│   │   ├── seq1.pdf
│   │   ├── seq2.pdf
│   │   ├── seq3.pdf
│   │   └── test.pdf
│   └── ulr
│       ├── cat.jpg
│       ├── data_matters.pdf
│       ├── data_matters2.pdf
│       ├── effect_of_a.pdf
│       ├── examples1.pdf
│       ├── finetuning.png
│       ├── missing.pdf
│       ├── missing2.pdf
│       ├── model2x.pdf
│       ├── model3x.pdf
│       ├── multi.pdf
│       ├── preprojection.pdf
│       ├── size.pdf
│       ├── vis.pdf
│       └── vis2.pdf
├── fitch.sty
├── macros.tex
├── preface.tex
├── publications.tex
├── ref.bib
├── std-macros.tex
├── std.tex
├── submit_form1.pdf
├── suthesis.sty
├── thesis.2.0.pdf
├── thesis.aux
├── thesis.bbl
├── thesis.blg
├── thesis.dvi
├── thesis.fdb_latexmk
├── thesis.fls
├── thesis.lof
├── thesis.log
├── thesis.lot
├── thesis.out
├── thesis.pdf
├── thesis.tex
├── thesis.toc
├── thesis_final_JIATAO.pdf
└── tikz-dependency.sty

/MACPdef.tex:
--------------------------------------------------------------------------------
% Miguel A. Carreira-Perpinan's LaTeX macros.
% Time-stamp: <07/02/08 02:40:12 miguel>

% Definitions that require only standard LaTeX2e things

% Letters used for matrices and vectors (boldface), for functions (roman), etc.
\newcommand{\A}{\ensuremath{\mathbf{A}}}
\newcommand{\B}{\ensuremath{\mathbf{B}}}
\newcommand{\C}{\ensuremath{\mathbf{C}}}
%\newcommand{\D}{\ensuremath{\mathbf{D}}}
%\newcommand{\E}{\ensuremath{\mathbf{E}}}
\newcommand{\F}{\ensuremath{\mathbf{F}}}
\newcommand{\G}{\ensuremath{\mathbf{G}}}
\newcommand{\HH}{\ensuremath{\mathbf{H}}}
\newcommand{\I}{\ensuremath{\mathbf{I}}}
\newcommand{\J}{\ensuremath{\mathbf{J}}}
\newcommand{\K}{\ensuremath{\mathbf{K}}}
\newcommand{\LL}{\ensuremath{\mathbf{L}}}
\newcommand{\M}{\ensuremath{\mathbf{M}}}
\newcommand{\N}{\ensuremath{\mathbf{N}}}
\newcommand{\OO}{\ensuremath{\mathbf{O}}}
\newcommand{\PP}{\ensuremath{\mathbf{P}}}
\newcommand{\Q}{\ensuremath{\mathbf{Q}}}
%\newcommand{\R}{\ensuremath{\mathbf{R}}} % PROSPER defines \R
\newcommand{\RR}{\ensuremath{\mathbf{R}}}
\renewcommand{\SS}{\ensuremath{\mathbf{S}}}
%\newcommand{\T}{\ensuremath{\mathbf{T}}}
\newcommand{\U}{\ensuremath{\mathbf{U}}}
\newcommand{\V}{\ensuremath{\mathbf{V}}}
\newcommand{\W}{\ensuremath{\mathbf{W}}}
\newcommand{\X}{\ensuremath{\mathbf{X}}}
\newcommand{\Y}{\ensuremath{\mathbf{Y}}}
%\newcommand{\Z}{\ensuremath{\mathbf{Z}}}
\renewcommand{\aa}{\ensuremath{\mathbf{a}}}
\renewcommand{\b}{\ensuremath{\mathbf{b}}}
\renewcommand{\c}{\ensuremath{\mathbf{c}}}
\newcommand{\dd}{\ensuremath{\mathbf{d}}}
\newcommand{\e}{\ensuremath{\mathbf{e}}}
\newcommand{\f}{\ensuremath{\mathbf{f}}}
\newcommand{\g}{\ensuremath{\mathbf{g}}}
\newcommand{\h}{\ensuremath{\mathbf{h}}}
%\newcommand{\bk}{\ensuremath{\mathbf{k}}}
%newcommand{\bl}{\ensuremath{\mathbf{l}}}
\newcommand{\m}{\ensuremath{\mathbf{m}}}
\newcommand{\n}{\ensuremath{\mathbf{n}}}
%\newcommand{\p}{\ensuremath{\mathbf{p}}}
\newcommand{\q}{\ensuremath{\mathbf{q}}}
\newcommand{\rr}{\ensuremath{\mathbf{r}}}
\newcommand{\sss}{\ensuremath{\mathbf{s}}} % TIPA defines \s and LaTeX \ss!
\renewcommand{\t}{\ensuremath{\mathbf{t}}}
\newcommand{\uu}{\ensuremath{\mathbf{u}}}
\newcommand{\vv}{\ensuremath{\mathbf{v}}}
%\renewcommand{\v}{\ensuremath{\mathbf{v}}}
%\newcommand{\w}{\ensuremath{\mathbf{w}}}
\newcommand{\x}{\ensuremath{\mathbf{x}}}
\newcommand{\y}{\ensuremath{\mathbf{y}}}
\newcommand{\z}{\ensuremath{\mathbf{z}}}
\newcommand{\0}{\ensuremath{\mathbf{0}}}

\newcommand{\bdelta}{\ensuremath{\boldsymbol{\delta}}}
\newcommand{\binfty}{\ensuremath{\boldsymbol{\infty}}}
\newcommand{\bkappa}{\ensuremath{\boldsymbol{\kappa}}}
\newcommand{\blambda}{\ensuremath{\boldsymbol{\lambda}}}

\newcommand{\bDelta}{\ensuremath{\boldsymbol{\Delta}}}
\newcommand{\bLambda}{\ensuremath{\boldsymbol{\Lambda}}}
\newcommand{\bPhi}{\ensuremath{\boldsymbol{\Phi}}}
\newcommand{\bPi}{\ensuremath{\boldsymbol{\Pi}}}
\newcommand{\bPsi}{\ensuremath{\boldsymbol{\Psi}}}
\newcommand{\bSigma}{\ensuremath{\boldsymbol{\Sigma}}}
\newcommand{\bTheta}{\ensuremath{\boldsymbol{\Theta}}}
\newcommand{\bXi}{\ensuremath{\boldsymbol{\Xi}}}
\newcommand{\bUpsilon}{\ensuremath{\boldsymbol{\Upsilon}}}

% Blackboard bold
\newcommand{\bbC}{\ensuremath{\mathbb{C}}}
\newcommand{\bbH}{\ensuremath{\mathbb{H}}}
\newcommand{\bbN}{\ensuremath{\mathbb{N}}}
\newcommand{\bbR}{\ensuremath{\mathbb{R}}}
\newcommand{\bbS}{\ensuremath{\mathbb{S}}}
\newcommand{\bbZ}{\ensuremath{\mathbb{Z}}}

% Calligraphic
\newcommand{\calA}{\ensuremath{\mathcal{A}}}
\newcommand{\calB}{\ensuremath{\mathcal{B}}}
\newcommand{\calC}{\ensuremath{\mathcal{C}}}
\newcommand{\calD}{\ensuremath{\mathcal{D}}}
\newcommand{\calE}{\ensuremath{\mathcal{E}}}
\newcommand{\calF}{\ensuremath{\mathcal{F}}}
\newcommand{\calG}{\ensuremath{\mathcal{G}}}
\newcommand{\calH}{\ensuremath{\mathcal{H}}}
\newcommand{\calI}{\ensuremath{\mathcal{I}}}
\newcommand{\calJ}{\ensuremath{\mathcal{J}}}
\newcommand{\calL}{\ensuremath{\mathcal{L}}}
\newcommand{\calM}{\ensuremath{\mathcal{M}}}
\newcommand{\calN}{\ensuremath{\mathcal{N}}}
\newcommand{\calO}{\ensuremath{\mathcal{O}}}
\newcommand{\calP}{\ensuremath{\mathcal{P}}}
\newcommand{\calR}{\ensuremath{\mathcal{R}}}
\newcommand{\calS}{\ensuremath{\mathcal{S}}}
\newcommand{\calT}{\ensuremath{\mathcal{T}}}
\newcommand{\calU}{\ensuremath{\mathcal{U}}}
\newcommand{\calV}{\ensuremath{\mathcal{V}}}
\newcommand{\calW}{\ensuremath{\mathcal{W}}}
\newcommand{\calX}{\ensuremath{\mathcal{X}}}
\newcommand{\calY}{\ensuremath{\mathcal{Y}}}

% Relational operators
% \bydef puts ``def'' over the equals sign and means ``is by definition
% equal to''. Another possibility is to use the \triangleq symbol.
\newcommand{\bydef}{\stackrel{\mathrm{\scriptscriptstyle def}}{=}}
% \simbydef puts ``def'' over the ~ sign and means ``is by definition
% distributed as''.
\newcommand{\simbydef}{\stackrel{\mathrm{\scriptscriptstyle def}}{\sim}}
\newcommand{\proptobydef}{\stackrel{\mathrm{\scriptscriptstyle def}}{\propto}}

% Other functions
\newcommand{\abs}[1]{\left\lvert#1\right\rvert}
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}

% Left superscript
% (from http://www.maths.univ-rennes1.fr/~edix/sgahtml/typesetting_rules.html)
\newcommand{\leftexp}[2]{{\vphantom{#2}}^{#1}{#2}}

% Box of text (for pictures, tables, etc.), without frames. Arguments:
% #1: separation between rows of text as a multiple of LaTeX's default
% (optional, default 1).
% #2: position (one of t, b, c).
% #3: format (one of l, r, c or even p{2cm}).
% #4: text, which can include line breaks (\\).
\newcommand{\caja}[4][1]{{%
\renewcommand{\arraystretch}{#1}%
\begin{tabular}[#2]{@{}#3@{}}%
#4%
\end{tabular}%
}}

% The amsart class has its own \keywords command, in which case we
% don't replace it.
\providecommand{\keywords}[1]{\vspace*{0.5\baselineskip}\par\noindent\textbf{Keywords:} #1}

% The logos (AmSLaTeX, BibTeX, etc.) are defined in texnames.sty.

% List environments
%
% "simplelist": useful for compact lists. Use it instead of "itemize".
% Optional argument: the bullet (default \triangleright). Other nice
% bullets are available in the Zapf Dingbats font (e.g. \ding{166} in
% the pifont package).
%
\newenvironment{simplelist}[1][$\triangleright$]%
{%
\begin{list}{#1}{
\vspace{-\topsep}
\vspace{-\partopsep}
\setlength{\itemindent}{0cm}
\setlength{\rightmargin}{0cm}
\setlength{\listparindent}{0cm}
\settowidth{\labelwidth}{#1}
\setlength{\leftmargin}{\labelwidth}
\addtolength{\leftmargin}{\labelsep}
\setlength{\itemsep}{0cm}
%\setlength{\leftmargin}{0cm}
%\setlength{\labelwidth}{0cm}
}%
}%
{%
\end{list}
\vspace{-\topsep}
\vspace{-\partopsep}
}

%
% enumthm: an enumerated list intended to appear in theorems and not
% to be nested. Each item is numbered in Roman numerals in parentheses.
% A suitable way to refer to an item is:
% Theorem~\ref{thm:pithagoras}(\ref{en:triangle}) states...
\newenvironment{enumthm}%
{\begin{enumerate}%
\renewcommand{\theenumi}{\roman{enumi}}%
\renewcommand{\labelenumi}{(\theenumi)}}%
{\end{enumerate}}

% Commands for the hyperref package (they also require the url package).
% In all cases the typesetting style is tt by default but can be changed
% with the \urlstyle command, e.g. \urlstyle{tt}.
%
% \MACPhref[text]{link} typesets text (default = link) in tt and
% hyperlinks to link, e.g. \MACPhref{http://www.dcs.shef.ac.uk/~miguel}.
\newcommand{\MACPhref}[2][\DefaultOpt]{\def\DefaultOpt{#2}%
\href{#2}{\url{#1}}}
%
% \MACPmailto[text]{email} typesets text (default = email) in tt and
% hyperlinks to email as mailto:email, e.g. \MACPmailto{miguel@dcs.shef.ac.uk}.
\newcommand{\MACPmailto}[2][\DefaultOpt]{\def\DefaultOpt{#2}%
\href{mailto:#2}{\url{#1}}}

% Hyphenation
\hyphenation{elec-tro-pa-la-tog-ra-phy}

% From the amsbook.cls file version 2.04
\hyphenation{acad-e-my acad-e-mies af-ter-thought anom-aly anom-alies
an-ti-deriv-a-tive an-tin-o-my an-tin-o-mies apoth-e-o-ses
apoth-e-o-sis ap-pen-dix ar-che-typ-al as-sign-a-ble as-sist-ant-ship
as-ymp-tot-ic asyn-chro-nous at-trib-uted at-trib-ut-able bank-rupt
bank-rupt-cy bi-dif-fer-en-tial blue-print busier busiest
cat-a-stroph-ic cat-a-stroph-i-cally con-gress cross-hatched data-base
de-fin-i-tive de-riv-a-tive dis-trib-ute dri-ver dri-vers eco-nom-ics
econ-o-mist elit-ist equi-vari-ant ex-quis-ite ex-tra-or-di-nary
flow-chart for-mi-da-ble forth-right friv-o-lous ge-o-des-ic
ge-o-det-ic geo-met-ric griev-ance griev-ous griev-ous-ly
hexa-dec-i-mal ho-lo-no-my ho-mo-thetic ideals idio-syn-crasy
in-fin-ite-ly in-fin-i-tes-i-mal ir-rev-o-ca-ble key-stroke
lam-en-ta-ble light-weight mal-a-prop-ism man-u-script mar-gin-al
meta-bol-ic me-tab-o-lism meta-lan-guage me-trop-o-lis
met-ro-pol-i-tan mi-nut-est mol-e-cule mono-chrome mono-pole
mo-nop-oly mono-spline mo-not-o-nous mul-ti-fac-eted mul-ti-plic-able
non-euclid-ean non-iso-mor-phic non-smooth par-a-digm par-a-bol-ic
pa-rab-o-loid pa-ram-e-trize para-mount pen-ta-gon phe-nom-e-non
post-script pre-am-ble pro-ce-dur-al pro-hib-i-tive pro-hib-i-tive-ly
pseu-do-dif-fer-en-tial pseu-do-fi-nite pseu-do-nym qua-drat-ic
quad-ra-ture qua-si-smooth qua-si-sta-tion-ary qua-si-tri-an-gu-lar
quin-tes-sence quin-tes-sen-tial re-arrange-ment rec-tan-gle
ret-ri-bu-tion retro-fit retro-fit-ted right-eous right-eous-ness
ro-bot ro-bot-ics sched-ul-ing se-mes-ter semi-def-i-nite
semi-ho-mo-thet-ic set-up se-vere-ly side-step sov-er-eign spe-cious
spher-oid spher-oid-al star-tling star-tling-ly sta-tis-tics
sto-chas-tic straight-est strange-ness strat-a-gem strong-hold
sum-ma-ble symp-to-matic syn-chro-nous topo-graph-i-cal tra-vers-a-ble
tra-ver-sal tra-ver-sals treach-ery turn-around un-at-tached
un-err-ing-ly white-space wide-spread wing-spread wretch-ed
wretch-ed-ly Eng-lish Euler-ian Feb-ru-ary Gauss-ian
Hamil-ton-ian Her-mit-ian Jan-u-ary Japan-ese Kor-te-weg
Le-gendre Mar-kov-ian Noe-ther-ian No-vem-ber Rie-mann-ian Sep-tem-ber}

%%% Local Variables:
%%% mode: latex
%%% TeX-master: t
%%% End:
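% Usage sketch: a minimal, hypothetical example of how the macros above
% compose in a document (the preamble and names below are illustrative
% assumptions, not part of this file):
%
%   \documentclass{article}
%   \usepackage{amsmath,amssymb}
%   \input{MACPdef}
%   \begin{document}
%   Let $\x, \y \in \bbR^d$. The squared Euclidean distance is
%   $d(\x, \y) \bydef \norm{\x - \y}^2$.
%   \end{document}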
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
thesis.pdf: $(wildcard *.tex) $(wildcard chapters/*.tex) Makefile macros.tex std-macros.tex ref.bib
	@pdflatex thesis
	@bibtex thesis
	@pdflatex thesis
	@pdflatex thesis

clean:
	rm -f *.aux *.log *.bbl *.blg present.pdf *.bak *.ps *.dvi *.lot *.bcf thesis.pdf

dist: thesis.pdf
	@pdflatex -file-line-error thesis

default: thesis.pdf
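# Typical usage (illustrative note based on the targets above):
#   make         builds thesis.pdf (pdflatex, bibtex, then two more pdflatex
#                passes so citations and cross-references resolve)
#   make clean   removes intermediate LaTeX build artifacts
#   make dist    rebuilds with per-line error reporting (-file-line-error)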
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Jiatao Gu's thesis

# Title: Efficient Neural Machine Translation
# Abstract:
The dream of automatic translation that builds a communication bridge between people from different civilizations dates back thousands of years. Over the past decades, researchers have devoted themselves to proposing practical approaches, from rule-based machine translation to statistical machine translation. In recent years, with the general success of artificial intelligence (AI) and the emergence of neural network models, a.k.a. deep learning, neural machine translation (NMT), the new generation of machine translation framework based on sequence-to-sequence learning, has achieved state-of-the-art and even human-level translation performance on a variety of languages.

The impressive achievements brought by NMT are mainly due to its deep neural network structures with massive numbers of parameters, which can be efficiently tuned from vast volumes of parallel data on the order of tens or hundreds of millions of sentences. Unfortunately, in spite of their success, neural systems also bring about new challenges to machine translation, among which one of the central problems is efficiency. The efficiency issue involves two aspects:
(1) NMT is data-hungry because of its vast number of parameters, which makes training a reasonable model difficult in practice for low-resource cases. For instance, most human languages do not have enough parallel data with other languages to learn an NMT model. Moreover, documents in specialized domains such as law or medicine usually contain large numbers of professional translations, which NMT learns from less efficiently;
(2) NMT is slow in computation compared to conventional methods due to its deep structure and the limitations of its decoding algorithms. The low efficiency at inference time, in particular, profoundly affects real-life applications and the smoothness of communication. In some cases, such as video conferencing, we also want the neural system to translate in real time, which is difficult for existing NMT models.

This dissertation attempts to tackle these two challenges.
Contributions are twofold:
(1) We address the data-efficiency challenges presented by existing NMT models and introduce insights based on the characteristics of the data, which include
(a) developing the copy mechanism to target rote memorization in translation and general sequence-to-sequence learning;
(b) using a non-parametric search engine to guide the NMT system to perform well in specialized domains;
(c) inventing a universal NMT system for extremely low-resource languages;
(d) extending the universal NMT system to adapt efficiently to new languages by combining it with meta-learning.

(2) For the decoding-efficiency challenges, we develop novel structures and learning algorithms, including
(a) recasting the decoding of NMT in a trainable manner to achieve state-of-the-art performance in less time;
(b) inventing the non-autoregressive NMT system, which enables translation in parallel;
(c) developing an NMT model that learns to translate in real time using reinforcement learning.
--------------------------------------------------------------------------------
/ack.tex:
--------------------------------------------------------------------------------
\prefacesection{Acknowledgments}

Getting to know Prof. Victor O.K. Li has been one of my greatest fortunes and the most precious part of my Ph.D. study.
I cannot express my gratitude enough for his tremendous support and extraordinary care over the years, not only for his excellent guidance in my research, but also for his exceptional encouragement and help, which have strongly influenced my life.
I still remember our first in-person conversation in Beijing before I officially started, where he told me how to solve problems and how to be an influential researcher; I have followed that advice ever since.
I still remember, when I wrote my first paper in 2015, that he spent a week editing it over seven rounds, correcting the typos and grammatical mistakes due to my non-native English.
I still remember receiving his endless support and encouragement every time I met with rejection and failure, which kept me moving towards my goal.
I still remember his care and useful suggestions whenever I faced difficulties in life that I did not know how to deal with.
I also remain indebted for his understanding and endless support, which let me freely choose my research topics based on my interests and gave me the chance to cooperate with talented researchers around the world.



I would like to thank Dr. Kyunghyun Cho, who has also become so important to me that I cannot measure how much his guidance and support have meant during my Ph.D. research. He was not only a good mentor, but also a close friend. I visited him at NYU for 8 months, and even after that our research cooperation has continued. I am grateful to receive his invaluable insights and suggestions every time we chat about research ideas.
I remain amazed that despite his busy schedule, he was always able to spare time to meet me.
Even though we sometimes disagreed with each other on certain problems, such discussions provided precious inspiration to me.
I really appreciate his generous suggestions and help during the difficult time when I was deciding on my career path.


I want to particularly thank Dr. Zhengdong Lu for introducing me to the world of deep learning research.
When I had just finished my first year of Ph.D. study,
struggling hard by myself without anyone in my department familiar with my interests,
he gave me the opportunity to intern at his group in Huawei Noah's Ark Lab twice a week.
This opportunity completely changed the trajectory of my research path:
I started to learn how to work on deep learning,
and what the important problems in the research community were.
I was also amazed by his endless stream of interesting ideas and his knowledge of good research topics,
which have continued to support me over the years.

I want to thank Dr. Hany Hassan for his generous care and help during my internship with the Machine Translation team at Microsoft Research.
It was a great pleasure to work with him, and I learned a lot from his expertise in machine translation.
I would also like to thank James Bradbury, Dr. Richard Socher, Dr. Caiming Xiong and the other members of the Salesforce Research team from when I was interning there. I cannot forget all the invaluable time we spent discussing research and life. Very special thanks to James for giving me the best lessons in PyTorch and coding.

I also want to thank all my paper collaborators,
Dr. Hang Li,
Dr. Graham Neubig,
Daniel Im,
Baotian Hu,
Yong Wang.
This thesis would not have been possible without their help, support and guidance.

Ji Wu and Ping Lv from my time at Tsinghua, and to Dr. Danushka Bollegala from my time at UTokyo, for introducing me to this amazing world of natural language processing, and for their support over the past many years. 42 | %I owe a big thank-you to Prof. Peter Doerschuk from the Biomedical Engineering department at Cornell, for all the encouragement and support when I was trying to change my research field, and for telling me nothing is more important than doing things that I am happy about. 43 | 44 | During my years of Ph.D. study, I have been blessed with the friendship of many amazing people. Without them, I could not have enjoyed these years so much. 45 | I want to thank all the friends I met in Victor Li's group: 46 | Ruiqiao Bai, 47 | Miaomiao Cao, 48 | Guanhua Chen, 49 | Xiangyu Chen, 50 | Xuejing Chen, 51 | Yun Chen, 52 | Kai Fung Chu, 53 | Jie Chuai, 54 | Reza Hassani Etemad, 55 | Yang Han, 56 | Albert Y.S. Lam, 57 | Jacqueline Lam, 58 | K.C. Leung, 59 | Yan Li, 60 | Junhao Lin, 61 | Yi Long, 62 | Zhiyi Lv, 63 | Yijie Mao, 64 | Qiqi Shuai, 65 | Shiguang Song, 66 | Chenxi Sun, 67 | Yi Sun, 68 | Jiayao Wen, 69 | Miles H.F. Wen, 70 | Jiao Wang, 71 | Yong Wang, 72 | Jiancheng Ye, 73 | James J.Q. Yu, 74 | Yangwen Yu, 75 | Qi Zhang, 76 | Shiyao Zhang, 77 | Xingzheng Zhu, 78 | who provided me with a sense of a second family. 79 | My thanks also go to the many wonderful students, researchers and professors at HKU and around the world whom I have had the opportunity of befriending or learning from, including 80 | Eneko Agirre, 81 | Martin Arjovsky, 82 | Danlu Chen, 83 | Meihao Chen, 84 | Keunwoo Choi, 85 | Jonathan Clark, 86 | Trevor Cohn, 87 | Chris Dyer, 88 | Akiko Eriguchi, 89 | Orhan Firat, 90 | Krzysztof Geras, 91 | Caglar Gulcehre, 92 | Xuejun Guo, 93 | Yicheng Gong, 94 | Karl Moritz Hermann, 95 | Sebastien Jean, 96 | Kevin Knight, 97 | Dong Li, 98 | Qun Liu, 99 | Wei Liu, 100 | Stanislas Lauly, 101 | Xuezhe Ma, 102 | Lili Mou, 103 | Rodrigo Nogueira, 104 | Alexander Rosenberg, 105 | Xing Shi, 106 | Qiqi Shuai, 107 | Yu Su, 108 | Ilya Sutskever, 109 | Yaohua Tang, 110 | Zhaopeng Tu, 111 | Jakob Uszkoreit, 112 | Longyue Wang, 113 | Tian Wang, 114 | Yingjie Xue, 115 | Pengcheng Yin, 116 | Jiakai Zhang, 117 | Xingxing Zhang, 118 | Jake Zhao, 119 | Chunting Zhou, 120 | and many, many others whom I am grateful to but cannot all list by name. 121 | I want to specially thank my friends: Jason Lee and Zhen Li; 122 | my best undergraduate friends: Jin Hu, Yilin Hung and Ruochen Xu; 123 | my old high-school friends: Xingchen Bu, Licheng Tang, Bi Xu, and Zhonghao Zhang; 124 | and my kind landlords when I was in the U.S.: Jia Li and Sida Liu, 125 | for always encouraging me and never leaving me alone when Ph.D. life was full of difficulty and sadness. 126 | 127 | 128 | I want to thank Dr. Maggie Li, Dr. Kaibin Huang, and Prof. Lawrence Yeung 129 | for agreeing to be the examiners of my oral defense, and for helping me all the way through the thesis. %I also want to thank Prof. Olivier Gevaert for helping chair my thesis defense. 130 | 131 | Finally, I would like to dedicate this thesis to my beloved parents and my girlfriend Lingjie for their love and never-ending support. 132 | They have been a constant source of strength that ultimately made it possible for me to come so far. 133 | My debt to them is unbounded.
134 | \cleardoublepage % Force a break to an odd page 135 | -------------------------------------------------------------------------------- /chapters/background.tex: -------------------------------------------------------------------------------- 1 | In this chapter, we will introduce some background knowledge about neural machine translation in three aspects, 2 | namely, \textit{modeling}, \textit{training}, and \textit{decoding}. 3 | 4 | \section{Modeling} 5 | We start by discussing the building blocks that form the most recent Neural Machine Translation (NMT) models. 6 | Across all of the different existing architectures, NMT can always be seen as a conditional language model. 7 | The introduction of the attention mechanism is the key that enables NMT to achieve state-of-the-art performance. 8 | 9 | \subsection{Neural Language Modeling} 10 | Before discussing the details of neural machine translation, we first come to the general topic of generating sentences using neural networks, a.k.a. neural language modeling. The research on language modeling with neural networks dates back to \citep[NNLM, ][]{bengio2003neural}, where the authors used a feed-forward network to predict the next word from a fixed-size window of preceding words. In general, when we consider the problem of natural language modeling, a simplified view is to model languages (e.g. sentences) in a generative framework: let $V$ be the vocabulary of all possible tokens; for a sentence $Y=\{y_1, y_2, ..., y_T\}$, $y_t \in V$, the language model finds a set of parameters $\theta$ to represent the probability 11 | $p(Y;\theta)=p(y_1, y_2, ..., y_T; \theta)$. 12 | \paragraph{Autoregressive Language Model} Directly modeling the probability of the entire sentence $Y$ is hard and generally intractable. Therefore, aside from a few exceptions, most previous efforts reformulate the probability in an autoregressive way, that is, 13 | \begin{equation} 14 | \label{cp2.eq.autolm} 15 | p(Y;\theta) = \prod_{t=1}^{T+1}p(y_t|y_{0:t-1}; \theta), 16 | \end{equation} 17 | where special tokens $y_0$ (e.g. $\langle \mathrm{bos}\rangle$) and $y_{T+1}$ (e.g. $\langle \mathrm{eos}\rangle$) are used to represent the beginning and end of all target sentences. Intuitively, the language model assumes that each generated token $y_t$ is conditioned on all previously generated tokens. 18 | % Note that, Eq.~\ref{cp2.eq.autolm} does not require any approximation. 19 | 20 | \paragraph{Parameterization} 21 | We are interested in parameterizing (approximating) these conditional probabilities using neural networks, which can be feed-forward networks, convolutional networks, or recurrent neural networks~\citep[RNNLM, ][]{mikolov2010recurrent}. 22 | Take the RNNLM -- a generic formulation for language modeling -- as an example. For each time-step $t$, the whole history $y_{0: t-1}$ is summarized as: 23 | \begin{equation} 24 | \begin{split} 25 | z_t = f(\epsilon_I[y_{t-1}], z_{t-1}; {\theta}_f) , t \in [1, T+1] 26 | \end{split} 27 | \end{equation} 28 | where $\epsilon_I$ is an embedding matrix whose rows map each input word to a fixed-length vector, and $f$ is the recurrent network used to summarize the history. In practice, instead of the vanilla RNN, RNNs with gated units are often used to capture long-term dependencies in language modeling, for instance, Long Short-Term Memory~\citep[LSTM,][]{hochreiter1997long} or Gated Recurrent Units~\citep[GRU,][]{cho2014learning}.
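As a concrete illustration, the following is a minimal PyTorch sketch of one step of this recurrent summarization (the single-layer GRU cell, all sizes, and all variable names are illustrative assumptions rather than the exact models used in this thesis):
\begin{verbatim}
import torch
import torch.nn as nn

V, d = 10000, 512               # vocabulary size |V| and hidden size
emb_I = nn.Embedding(V, d)      # the input embedding matrix eps_I
f = nn.GRUCell(d, d)            # the recurrent summarizer f (a GRU)

y_prev = torch.tensor([3])      # index of y_{t-1} (a batch of one)
z_prev = torch.zeros(1, d)      # the previous state z_{t-1}
z_t = f(emb_I(y_prev), z_prev)  # z_t = f(eps_I[y_{t-1}], z_{t-1})
\end{verbatim}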
29 | Given the state $z_t$, we can then compute the output probability of $y_t$ by 30 | \begin{equation} 31 | \label{cp2.eq.output} 32 | p(y_t|y_{0: t-1};\theta) = \mathop{\softmax}\limits_{y_t\in V}\left(\epsilon_O[y_t]\cdot z_t^T\right) 33 | \end{equation} 34 | where $\softmax(x) = \left. {e^x} \large / {\sum_{x'} e^{x'}} \right.$ ensures that the probabilities sum to $1$, and $\epsilon_O$ is the output weight matrix, which is sometimes tied with $\epsilon_I$. More complicated structures with multiple layers of networks can be used to enhance the modeling capacity. 35 | 36 | 37 | %These conditional probabilities are parameterized using a neural network. Typically, an encoder-decoder architecture~\citep{sutskever2014sequence} with a unidirectional RNN-based decoder is used to capture the causal structure of the output distribution. 38 | 39 | \subsection{Sequence-to-Sequence Learning} 40 | Neural language modeling tells us how likely a natural language sentence is to be generated unconditionally, which, however, is of limited use in real applications. In most cases, what we are interested in is generating meaningful sentences under certain conditions, which includes not only machine translation -- the main focus of this thesis -- but many other tasks such as summarization, captioning, question answering, and response generation in dialogue systems. Fortunately, the experience of language modeling carries over easily if we assume a conditional language model. 41 | This setting is also known as sequence-to-sequence ($\sts$) learning when the conditioning inputs are themselves sequences, as in neural machine translation. 42 | 43 | \paragraph{Neural Machine Translation as \sts Learning} 44 | Let us denote by $X=\left\{ x_1, \ldots, x_{T'} \right\}$ and $Y=\left\{ y_1, \ldots, y_T \right\}$ the source and target sentences in machine translation, respectively. Similar to language modeling, neural machine translation models the target sentence given the source sentence as a conditional autoregressive language model: 45 | \begin{equation} 46 | \label{cp2.eq.auto_sts} 47 | p(Y|X) = \prod_{t=1}^{T+1}p(y_t|y_{0:t-1}, x_{1:T'}; \theta), 48 | \end{equation} 49 | where the same output function as in Eq.~\ref{cp2.eq.output} can be used for each conditional probability. In \sts learning, the hidden state $z_t$ summarizes both the previously decoded words and the source words: 50 | \begin{equation} 51 | \label{cp2.eq.hidden_state} 52 | \begin{split} 53 | z_t = f^{\textsc{dec}}(\epsilon_I[y_{t-1}], z_{t-1}, c_t(x_{1:T'}); {\theta}_f), t \in [1, T+1] 54 | \end{split} 55 | \end{equation} 56 | where $c_t(x_{1:T'})$ is the context representation of the source sentence $X$. For instance, a simple implementation proposed in \newcite{cho2014learning,sutskever2014sequence} is to use another RNN to encode the source words into a series of vector representations $h_1, ..., h_{T'}$ as 57 | \begin{equation} 58 | \label{cp2.eq.rnn_encoder} 59 | h_\tau = f^{\textsc{enc}}\left(\epsilon_{E}[x_\tau], h_{\tau-1}; \theta_e\right), \tau \in [1, T'] 60 | \end{equation} 61 | where each source word $x_\tau$ is projected into a continuous embedding space with $\epsilon_{E}$. Then, it is possible to use the last state $h_{T'}$ as the context $c$, assuming it has compressed all the information we need to produce the correct translation. 62 | 63 | Borrowing concepts from information theory, we usually call $f^{\textsc{enc}}$ and $f^{\textsc{dec}}$ in \sts learning the encoder and the decoder, respectively.
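To make this encoder-decoder computation concrete, the following minimal PyTorch sketch (a hypothetical GRU encoder and GRU decoder cell with illustrative sizes, not the exact models used later) computes $h_{1:T'}$ as in Eq.~\ref{cp2.eq.rnn_encoder}, takes $c = h_{T'}$, and performs one decoding step in the spirit of Eq.~\ref{cp2.eq.hidden_state}:
\begin{verbatim}
import torch
import torch.nn as nn

V_src, V_trg, d = 8000, 8000, 512
emb_E = nn.Embedding(V_src, d)          # source embedding eps_E
emb_I = nn.Embedding(V_trg, d)          # target input embedding eps_I
f_enc = nn.GRU(d, d, batch_first=True)  # encoder f^enc
f_dec = nn.GRUCell(2 * d, d)            # decoder f^dec, fed [emb; c]

x = torch.randint(V_src, (1, 7))        # a toy source sentence x_{1:T'}
h, h_last = f_enc(emb_E(x))             # source states h_1, ..., h_{T'}
c = h_last.squeeze(0)                   # fixed-length context c = h_{T'}

y_prev = torch.tensor([1])              # the <bos> token
z = torch.zeros(1, d)                   # initial decoder state
z = f_dec(torch.cat([emb_I(y_prev), c], dim=-1), z)  # one decoding step
\end{verbatim}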
Intuitively, in NMT we first encode the whole source sentence $X$ into a fixed-length vector $c = h_{T'}$, and then we decode the translation from it. 64 | 65 | \begin{figure} 66 | \centering 67 | \includegraphics[width=\textwidth]{figs/background/s2s_att.pdf} 68 | \caption{An illustration of the comparison between conventional \sts learning and \sts learning with the attention mechanism for translating ``A B C D $\rightarrow$ X Y Z''.} 69 | \label{cp2.fig.comparison} 70 | \end{figure} 71 | 72 | \subsection{Attention Mechanism} 73 | Obviously, it is difficult to encode all the information necessary for translation into a fixed-size vector, especially when the source sentence gets long. 74 | To ease this burden, the attention mechanism was introduced by \newcite{bahdanau2014neural}, inspired by the concept of ``alignment'' between source and target words in statistical machine translation. Instead of using a single vector for the whole decoding path, the attention mechanism uses a dynamically changing context $c_t$ in the decoding process. A natural option is to represent $c_t$ as the weighted sum of the source hidden states, i.e. 75 | \begin{equation} 76 | \label{cp2.eq.att} 77 | c_t = \sum_{\tau=1}^{T'}{\alpha_{t\tau} h_{\tau}}, \quad \alpha_{t\tau} = \mathop{\softmax}\limits_\tau\left(f^{\textsc{att}}(z_{t-1}, h_{\tau}; \theta_a)\right), 78 | \end{equation} 79 | where $\alpha$ are the attention weights, treated as a soft alignment, and $f^{\textsc{att}}$ is the function that scores the correspondence strength for attention, which can be computed either with a multi-layer neural network or as the inner product between $z_{t-1}$ and $h_\tau$. 80 | Note that in~\newcite{bahdanau2014neural}, the source sentence is encoded with a bidirectional RNN instead of the unidirectional RNN discussed above in Eq.~\ref{cp2.eq.rnn_encoder}, making each hidden state $h_{\tau}$ aware of the contextual information from both directions. 81 | 82 | The attention mechanism nicely solves the information compression problem in machine translation, and is the key that enables NMT to achieve state-of-the-art performance. See a comparison with a conventional \sts model in Fig.~\ref{cp2.fig.comparison}. For the following sections and chapters, unless stated otherwise, we consider the RNN-based \sts model with the attention mechanism as the basic NMT model used for exploration. 83 | 84 | 85 | 86 | \subsection{Neural Machine Translation without RNNs} 87 | Since the entire target translation is known at training time (see more details in \S\ref{cp2.sec.mle}), the calculation of later conditional probabilities (and their corresponding losses) does not depend on the output words chosen during earlier decoding steps. 88 | Even though decoding must remain entirely sequential during inference, models can take advantage of this parallelism during training. Therefore, a series of new approaches without recurrent connections is quickly replacing the original RNN-based models. 89 | For instance, such approaches replace RNNs in the decoder with parallelizable modules like masked convolution~\cite{kalchbrenner2016neural, gehring2017convolutional} or self-attention (the Transformer)~\cite{vaswani2017attention}. A detailed illustration of the latter model is shown in Fig.~\ref{cp2.fig.transformer}. %that provide the causal structure required by the autoregressive factorization.
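To illustrate how a causal mask provides this training-time parallelism, the following minimal sketch (toy dimensions, a single head, and no learned projections; an illustrative fragment rather than the full Transformer of \newcite{vaswani2017attention}) computes masked self-attention for all $T$ positions at once:
\begin{verbatim}
import torch

T, d = 5, 64
# Causal (lower-triangular) mask: position t may attend to 0..t only.
mask = torch.ones(T, T).tril().bool()

q = k = v = torch.randn(1, T, d)                   # toy queries/keys/values
scores = q @ k.transpose(-2, -1) / d ** 0.5        # scaled dot-products
scores = scores.masked_fill(~mask, float('-inf'))  # hide future positions
out = torch.softmax(scores, dim=-1) @ v            # all T outputs at once
\end{verbatim}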
90 | \begin{figure}[hptb] 91 | \includegraphics[width=\linewidth]{figs/background/transformer.pdf} 92 | \caption{\label{cp2.fig.transformer} A flow-chart of the Transformer model from \newcite{vaswani2017attention}. } 93 | \end{figure} 94 | 95 | %\paragraph{The Transformer Model} 96 | %We specially introduce the Transformer model proposed in \newcite{vaswani2017attention} as an important example. 97 | %The Transformer model 98 | %A recently introduced option which reduces sequential computation still further is to construct the decoder layers out of self-attention computations that have been causally masked in an analogous way. 99 | %The state-of-the-art Transformer network takes this approach, which allows information to flow in the decoder across arbitrarily long distances in a constant number of operations, asymptotically fewer than required by convolutional architectures \citep{vaswani2017attention}. 100 | 101 | \section{Training} 102 | \subsection{Data} 103 | \paragraph{Parallel corpora} The training of a typical NMT system requires a large parallel corpus, which is usually collected from bilingual documents produced by human translators. As we have already noted, a typical NMT model usually requires millions of translated sentence pairs. 104 | %\paragraph{Monolingual corpora} 105 | \paragraph{Vocabulary and Sub-word level translation} 106 | Training a standard NMT system usually requires a dictionary storing all the words, which we normally call the vocabulary. Typically, the size of the vocabulary has to be large (around $40,000$ entries) so that embeddings can be assigned to the majority of the words. Words that appear too rarely to be modeled are replaced by a special out-of-vocabulary (OOV) symbol. 107 | 108 | To overcome the large vocabulary issue and, to some extent, the OOV problem, \newcite{sennrich2015neural} proposed preprocessing the training data by splitting rare words into pieces -- byte-pair encoding (BPE) -- and then running the NMT model at this sub-word level on both the source and target sides. In general, the use of BPE typically improves translation performance by around $2$ BLEU points compared to word-level models. Moreover, works such as \newcite{kim2016character,chung2016character,lee2016fully} go one step further and model the translation directly at the character level. 109 | We will address the data problem in later chapters. 110 | 111 | \subsection{Maximum Likelihood Learning} 112 | \label{cp2.sec.mle} 113 | We train an NMT model, or equivalently estimate $\theta =\{\theta_f, \theta_e, \theta_a, \epsilon_E, \epsilon_I, \epsilon_O \}$, by maximizing the log-probability of the target translation $Y$ given a source sentence $X$. That is, we maximize the log-likelihood function: 114 | \begin{equation} 115 | \label{cp2.eq.learning} 116 | \mathcal{L}^{\text{ML}}(\theta) = \frac{1}{N} \sum_{n=1}^N \sum_{t=1}^{T_n+1} \log p(y_t^n| y_{0:t-1}^n, x_{1:T'}^n; \theta), 117 | \end{equation} 118 | given a training set consisting of $N$ parallel sentence pairs. One of the fascinating things about NMT and deep learning is that, once we have formulated a differentiable objective, all the model parameters -- no matter how complex the model -- can be trained end-to-end using stochastic gradient descent (SGD) via back-propagation~\cite{rumelhart1986learning}; a minimal sketch of one such update follows.
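The sketch below is purely illustrative (a stand-in linear output layer over hypothetical decoder states rather than a full NMT model); it shows that maximizing Eq.~\ref{cp2.eq.learning} amounts to minimizing a cross-entropy loss, i.e. the negative log-probability of the observed target words, followed by a gradient update:
\begin{verbatim}
import torch
import torch.nn as nn

out_layer = nn.Linear(512, 10000)   # stand-in output layer over V
opt = torch.optim.Adam(out_layer.parameters(), lr=1e-4)

z = torch.randn(32, 512)            # decoder states of 32 target positions
y = torch.randint(10000, (32,))     # the observed (gold) target words
loss = nn.functional.cross_entropy(out_layer(z), y)  # mean -log p(y_t|...)
opt.zero_grad(); loss.backward(); opt.step()         # one update step
\end{verbatim}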
In practice, we can use the modified gradients produced by an optimizer such as Adadelta~\cite{zeiler2012adadelta} or Adam~\cite{kingma2014adam} to achieve faster convergence and better performance. 119 | 120 | It is important to note that this maximum likelihood learning does not take into account how a trained model would be used. Rather, it is only concerned with learning a distribution over all possible translations. 121 | 122 | %\subsection{Advanced Learning Algorithms} 123 | %. More advanced training algorithms for NMT have been proposed in recent years~\citep{wiseman2016sequence,shen2015minimum,bahdanau2016actor,ranzato2015sequence}. 124 | 125 | %\subsection{Multilingual Training of Neural Machine Translation} 126 | 127 | \section{Decoding} 128 | Once the model is trained, either by maximum likelihood learning or by any of the other recently proposed algorithms, we can let the model translate a given sentence by finding the translation that maximizes 129 | \begin{equation} 130 | \hat{Y} = \argmax_{Y} \log p(Y|X; \theta) =\argmax_{y_1, ..., y_T}\sum_{t=1}^{T+1} \log p(y_t| y_{0:t-1}, x_{1:T'}; \theta), 131 | \end{equation} 132 | where $\theta=\{\theta_f, \theta_e, \theta_a, \epsilon_E, \epsilon_I, \epsilon_O \}$, $y_0 = \langle \mathrm{bos}\rangle$ and $y_{T+1} = \langle \mathrm{eos}\rangle$. This process is called \textit{decoding}; it is, however, computationally intractable, as the number of possible solutions grows exponentially as the decoding length $T$ increases. Although the training algorithm ensures a good approximation of the translation distribution, it does not tell us how to find the best translation under that distribution. It is therefore usual practice to resort to approximate decoding algorithms. 133 | 134 | \subsection{Greedy Decoding} 135 | 136 | One such approximate decoding algorithm is greedy decoding. In greedy decoding, we follow the conditional dependency path and pick the symbol with the highest conditional probability at each step. This is equivalent to picking the best symbol one at a time, from left to right, in conditional language modeling. A decoded translation of greedy decoding is $\hat{Y} = (\hat{y}_1, \ldots, \hat{y}_T)$, where 137 | \begin{equation} 138 | \hat{y}_t = \argmax_{y \in V} \log p(y|\hat{y}_{0:t-1}, x_{1:T'}; \theta). 139 | \end{equation} 140 | Despite its favorable computational complexity $O(|V| \times T)$, greedy decoding has repeatedly been found to be undesirably sub-optimal.% (see, e.g., \citep{cho2016noisy}.) 141 | 142 | \subsection{Beam Search} 143 | \label{cp2.sec.bs} 144 | A better option to improve the translation quality is to perform beam search. 145 | Unlike greedy decoding, which keeps only a single hypothesis, beam search keeps $K (>1)$ hypotheses simultaneously during decoding; a minimal sketch of the procedure is given below, followed by the formal description.
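The sketch below is a minimal, illustrative Python re-implementation of the procedure (the \texttt{step\_logp} callback and all names are hypothetical, and the length-based re-ranking discussed afterwards is omitted):
\begin{verbatim}
import torch

def beam_search(step_logp, K, bos, eos, max_len=50):
    # step_logp(prefix) -> 1-D tensor of log p(y | prefix) over V.
    beam = [([bos], 0.0)]               # (prefix, cumulative log-prob)
    done = []                           # finished hypotheses
    for _ in range(max_len):
        cand = []
        for prefix, score in beam:      # extend every live hypothesis
            top = torch.topk(step_logp(prefix), K)
            for lp, y in zip(top.values, top.indices):
                cand.append((prefix + [int(y)], score + float(lp)))
        cand.sort(key=lambda c: -c[1])  # argsort^K over all candidates
        beam = []
        for prefix, score in cand[:K]:  # a finished one shrinks the beam
            (done if prefix[-1] == eos else beam).append((prefix, score))
        if not beam:
            break
    return max(done + beam, key=lambda c: c[1])
\end{verbatim}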
146 | Formally, at each time step $t$, beam search picks the $K$ hypotheses with the highest scores from the cached $K$ prefix sequences (the beam)\footnote{We assume that at the initial step $t=0$ the beam contains only one sequence, $\mathcal{C}_0 = \{$ \bos $\}$.}: 147 | \begin{equation} 148 | \mathcal{C}_{t-1} = \{\tilde{y}_{0:t-1}^{(1)}, \tilde{y}_{0:t-1}^{(2)}, ..., \tilde{y}_{0:t-1}^{(K)} \} 149 | \end{equation} 150 | by evaluating the cumulative conditional probabilities of all candidate hypotheses to obtain the next round of sequences: 151 | \begin{equation} 152 | \mathcal{C}_{t} = \{\tilde{y}_{0:t}^{(1)}, \tilde{y}_{0:t}^{(2)}, ..., \tilde{y}_{0:t}^{(K)} \} = 153 | \mathop{\argsort^K}\limits_{y_t\in V, \\ y_{0:t-1}\in \mathcal{C}_{t-1}} \sum_{t'=0}^t\log p(y_{t'}|y_{0:t'-1}, x_{1:T'}; \theta). 154 | \end{equation} 155 | This process repeats until one or more of the hypotheses hit the end-of-sentence symbol (\eos). Each finished sequence $\tilde{Y}$ is removed from the beam, and in the meantime the beam size decreases from $K$ to $K - 1$. 156 | When all hypotheses have terminated, the best hypothesis among the finished sequences $\tilde{Y}_1, ..., \tilde{Y}_K$ is returned by a re-ranking function. For instance, a simple but effective choice is the log-probability normalized by the decoding length: 157 | \begin{equation} 158 | \hat{Y} = \argmax_{\tilde{Y} \in \{\tilde{Y}_1, ..., \tilde{Y}_K\}} \left(\frac{1}{|\tilde{Y}|}\right)^\alpha\cdot\log p(\tilde{Y}|X; \theta), 159 | \end{equation} 160 | where $|\tilde{Y}|$ denotes the number of tokens in the decoded sentence, and $\alpha \in [0, 1]$. When we select $\alpha=1$, the re-ranking score is the length-normalized log-likelihood, which is closely related to the perplexity. In practice, we can choose $\alpha \approx 0.6$, which greatly helps to avoid generating overly short sentences. 161 | 162 | Despite its superior performance compared to greedy decoding, the computational complexity of beam search grows linearly with the beam size $K$, which makes it less preferable, especially in production environments. 163 | 164 | \subsection{Noisy Parallel Decoding} 165 | \label{cp2.sec.noisy} 166 | Another alternative we can consider is the noisy parallel decoding (NPD) algorithm proposed in \newcite{cho2016noisy}. The main idea behind the NPD algorithm is that a better translation with a higher log-probability may be found by injecting unstructured noise into the transition function of the decoder. That is, 167 | \begin{equation} 168 | z_t = f^{\textsc{dec}}(\epsilon_I[y_{t-1}], z_{t-1} + n_t, c_t(x_{1:T'}); \theta_f), t\in [1, T+1] 169 | \end{equation} 170 | where $n_t \sim \mathcal{N}(0, {\sigma_0^2} \large / {t^2})$ is time-dependent noise. In practice, we can also inject randomness into the inner representations of the decoding process in other ways, and run greedy decoding on the ``noisy'' inputs. 171 | NPD avoids potential degradation of translation quality by running such a noisy greedy decoding process multiple times in parallel. An important lesson of the NPD algorithm is that there exists a decoding strategy with asymptotically the same computational complexity that results in better translation quality, and that such a better translation can be found by manipulating the hidden states of the RNNs. 172 | 173 | -------------------------------------------------------------------------------- /chapters/conclusion.tex: -------------------------------------------------------------------------------- 1 | In this dissertation, we have presented methods for developing an efficient neural machine translation system.
2 | We discussed our methods for tackling {\it data-efficiency} and {\it decoding-efficiency}. 3 | In this chapter, we conclude this thesis by summarizing the contributions of each chapter, discussing the challenges that remain in building a more efficient and also more effective neural machine translation system, and suggesting future work to tackle them. 4 | 5 | In \textbf{Chapter~\ref{copy}}, we developed the copying mechanism as a new component that targets rote memories in general sequence-to-sequence (\sts) learning. The proposed \copynet also provides a general way in NMT to deal with out-of-vocabulary (OOV) words; 6 | 7 | In \textbf{Chapter~\ref{seg-nmt}}, we used a non-parametric search engine to retrieve similar translation pairs for guiding the target translation of a standard NMT system. The introduced method was a direct derivation of \copynet for NMT, but forms a novel non-parametric NMT system able to work on translation for professional domains; 8 | 9 | In \textbf{Chapter~\ref{ulr}}, we targeted the other direction of data-inefficiency, and invented a universal NMT system for extremely low-resource languages. Different from prior works on multilingual translation, we specially designed two additional modules that enable a jointly-trained model to work well on extremely low-resource languages with only around $6,000$ sentences; 10 | 11 | In \textbf{Chapter~\ref{MetaNMT}}, we extended the previously proposed universal NMT system to enable a pre-trained multilingual NMT model to adapt to any new language efficiently. Moreover, the use of meta-learning provides a novel view of finding a better initialization of parameters that is easy to fine-tune on new languages. 12 | 13 | In \textbf{Chapter~\ref{trainable}}, we recast NMT decoding as a trainable process. The proposed {\it trainable greedy decoding} not only helped a trained model to decode more efficiently with greedy decoding while achieving better translation quality, but also introduced an interesting direction where we can optimize the decoding towards any favorable objective. 14 | 15 | In \textbf{Chapter~\ref{nat}}, we invented the non-autoregressive NMT system that enabled translation in parallel. The proposed method, with {\it fertility} as latent variables and sequence-level knowledge distillation, was one of the first NMT systems able to fundamentally remove the restriction of word-by-word decoding in conventional NMT models while keeping a similar translation quality. 16 | 17 | In \textbf{Chapter~\ref{simul}}, we developed the NMT model that learns to translate in real time using reinforcement learning. More precisely, we formulated simultaneous translation as a reinforcement learning setting where two actions -- read and write -- were taken on top of a pre-trained standard NMT model. The resulting model successfully achieved a balance between delay and quality. 18 | 19 | 20 | \section{Future Work} 21 | We will conclude this dissertation with a discussion of the challenges that remain in building efficient neural machine translation systems. 22 | \paragraph{Same quality, better efficiency} 23 | In most of the cases discussed in this dissertation, especially for decoding-efficient NMT, we find algorithms with much better decoding efficiency (for example, non-autoregressive translation (Chapter~\ref{nat}) or simultaneous translation (Chapter~\ref{simul})) that nevertheless lose considerably in terms of translation quality.
24 | 25 | Although real applications sometimes care more about efficiency than about degradation of the translation quality, it is still meaningful to build on the current methods by exploring strategies that can keep the translation quality with almost no loss compared to the normal NMT system. For example, it is reasonable to develop a semi-autoregressive NMT that learns to run parallel decoding only where it does not hurt the translation accuracy. For simultaneous translation, we can also achieve better quality by learning to predict future words before outputting the target translation (Chapter~\ref{simul}). 26 | 27 | 28 | \paragraph{Light-weight NMT} 29 | In this thesis, we only discussed situations where the {\it efficient} NMT keeps the same or a similar number of parameters as the baseline NMT model. We did not cover another trend of building efficient models by compressing the neural network parameters directly into a light-weight model so that it can fit the computational resources of a portable device (e.g. a mobile phone), which has also been investigated for years. A widely used method is to reduce the model size with knowledge distillation, compressing knowledge from a big pre-trained model~\cite{kim2016sequence}. However, more investigation is still needed into combining this with the methods proposed in previous chapters, say, light-weight simultaneous translation. 30 | 31 | \paragraph{Incorporating Linguistic Information} 32 | In human translation, linguistic structure and similarity usually help to derive the correct translation when the paired translation data is limited. Thus, it is also important to build a data-efficient system by considering additional information from a linguistic perspective. For example, it is possible to train multiple models via meta-learning (Chapter~\ref{MetaNMT}) on a variety of high-resource languages, and to match the associated linguistic features to guide any new language to the best model for adaptation. 33 | 34 | %\paragraph{Beyond Machine Translation} 35 | 36 | -------------------------------------------------------------------------------- /chapters/intro.tex: -------------------------------------------------------------------------------- 1 | \epigraph{Translation is that which transforms everything so that nothing changes.}{Günter Grass.} 2 | 3 | The dream of automatic translation that builds a communication bridge between people from different civilizations dates back thousands of years (see Fig.~\ref{cp1.fig.babel}). The ability of an algorithm to translate between any human languages smoothly and accurately has always been a test of whether an artificial intelligence (AI) is capable of understanding human language. We will start with a brief review of the research on modern automatic machine translation and then move to our main focus: building an {\it efficient} neural machine translation system. 4 | 5 | \section{A Brief Review of Machine Translation (MT)} 6 | Researchers have been devoted to proposing practical approaches since the 1950s. 7 | \begin{figure}[t] 8 | \centering 9 | \includegraphics[width=1\linewidth]{figs/intro/tower_of_babel.jpg} 10 | \caption{\label{cp1.fig.babel}The aim in building Babel and a Tower to “reach into heaven” was to prevent the people from being “scattered abroad over the face of the whole earth” (Genesis 11:4).
} 11 | \end{figure} 12 | 13 | \paragraph{Rule-based MT} The earliest machine translation systems were rule-based, requiring not only a word-by-word dictionary, but also linguists to design specific translation rules language by language. Due to its fragility, a rule-based system easily fails to translate the countless special cases in human languages correctly, but can usually produce translations with good grammatical structure. 14 | 15 | \paragraph{Example-based MT} 16 | Example-based machine translation (EBMT)~\citep{Zhang2005AnEP,callison2005scaling,phillips2012modeling} translates new sentences not based on rules, but by reference to stored similar translation examples. In a practical implementation, it indexes parallel corpora with suffix arrays and retrieves exactly matched translation segments on the fly at test time. %However, to the best of our knowledge, SEG-NMT is the first work incorporating any attention-based neural machine translation architectures and can be trained end-to-end efficiently, showing superior performance and scalability compared to the conventional statistical EBMT. 17 | 18 | 19 | \paragraph{Statistical MT} Another trend in machine translation is statistical machine translation (SMT). Similar to EBMT, SMT does not rely on manually crafted translation rules. For example, one of the most successful statistical methods, phrase-based statistical machine translation (PBMT), first builds a phrase table from a large corpus, which is later used to produce translations by searching for the most appropriate sequence of paired phrases that fits an external language model. 20 | 21 | \paragraph{Neural MT} 22 | In recent years, with the general success of artificial intelligence (AI) and the emergence of neural network models, a.k.a. deep learning, neural machine translation (NMT)~\cite{sutskever2014sequence,bahdanau2014neural,vaswani2017attention}, as the new generation of machine translation framework, has achieved state-of-the-art~\cite{wu2016google} and even human-level translation performance in some languages~\cite{hassan-hp}. 23 | Different from prior work in SMT, NMT does not use an explicit language model, but directly models machine translation as a conditional language model via function approximation. 24 | The impressive achievements brought by NMT are mainly due to this approximated function, parameterized by a deep neural network, 25 | which can be efficiently tuned from vast volumes of parallel data on the order of tens or hundreds of millions of sentences. 26 | 27 | %\begin{figure}[t] 28 | %\centering 29 | %\includegraphics[width=\linewidth]{figs/intro/nmt-history.png} 30 | %\caption{History of machine translation systems. Slide by Christopher D. Manning.} 31 | %\end{figure} 32 | 33 | 34 | \section{Towards Efficient Neural Machine Translation} 35 | 36 | In spite of their success, neural systems also bring new challenges to machine translation, one of the central problems being efficiency. Fundamentally, this ``inefficiency'' of NMT comes from the same deep neural networks that allow NMT to significantly outperform other solutions. For instance, a standard configuration\footnote{Here we refer to the hyper-parameter settings of \texttt{t2t-base} reported in the original paper.} of the recently proposed state-of-the-art NMT model -- the Transformer model~\cite{vaswani2017attention} -- with a total vocabulary of about $50,000$ subwords requires learning around $50$ million free parameters from scratch.
37 | Such a huge number of parameters brings about the efficiency issue in two aspects: 38 | \begin{figure}[hptb] 39 | \centering 40 | \includegraphics[width=0.6\linewidth]{figs/intro/corpus_size.png} 41 | \caption{\label{cp1.fig.corpus_size} A comparison between NMT and SMT given varying sizes of training examples. Curves extracted from \newcite{koehn2017six}.} 42 | \end{figure} 43 | \begin{itemize} 44 | \item It is significantly data-hungry to train these parameters properly using the current (and almost the only) optimization technique, back-propagation, which makes training a reasonable model difficult in practice for many low-resource cases. 45 | For instance, documents in specialized domains such as law or medicine usually contain large numbers of professional translations without sufficient examples to memorize everything in the network's parameters. Moreover, except for some of the mainstream languages, e.g. English, French or Chinese, most human languages lack enough parallel data with other languages to learn a proper NMT model. As shown in Fig.~\ref{cp1.fig.corpus_size}, the poor data-efficiency of the existing NMT methods compared to traditional methods is also reported as one of the most important challenges; 46 | \item The complicated structure of NMT also makes computation slow (for both training and decoding) compared to the other conventional methods mentioned above. For example, even with the vast speed-ups from parallel hardware (e.g. GPUs, TPUs, etc.), because of the restrictions of the model assumptions (see details in Chapter~\ref{background} and Chapter~\ref{nat}), decoding sentences with NMT is still 100 or 1000 times slower than SMT or other methods. The low efficiency at inference time profoundly affects real-life applications and the smoothness of communication. Furthermore, in cases such as video conferencing or real-time conversation (e.g. see an illustration of simultaneous translation in Fig.~\ref{cp1.fig.simultaneous}), we also hope the neural system to be efficient enough to translate in real time, which, however, is difficult for the existing NMT models. 47 | \end{itemize} 48 | \begin{figure}[hptb] 49 | \centering 50 | \includegraphics[width=0.6\linewidth]{figs/intro/real_time.jpg} 51 | \caption{\label{cp1.fig.simultaneous}The goal of efficient NMT decoding -- simultaneous translation.} 52 | \end{figure} 53 | For years, much research has been conducted on both fronts in order to build data- and decoding-efficient NMT systems. As one of the building blocks of this important trend, this dissertation focuses specially on these two directions, and for each of them we claim three to four novel contributions to the field of machine translation research. The detailed outline of this thesis is listed below. 54 | 55 | \section{Contributions} 56 | The overall contributions of this thesis lie in the following three directions: 57 | \begin{itemize} 58 | \item Improved the translation quality on low-resource expressions: 59 | \begin{enumerate} 60 | \item We proposed the first-of-its-kind copying mechanism in sequence-to-sequence learning, targeting low-resource expressions in special domains; 61 | \item We extended the copying mechanism for machine translation with the guidance of a search engine, and significantly improved the translation performance for low-resource expressions.
62 | \end{enumerate} 63 | \item Improved the translation quality over low-resource languages: 64 | \begin{enumerate} 65 | \item We proposed a novel universal neural machine translation system that helps translate extremely low-resource languages; 66 | \item We enabled the universal translation system to adapt to new languages with a small number of examples through meta-learning. 67 | \end{enumerate} 68 | \item Improved the trade-off between translation quality and delay (speed): 69 | \begin{enumerate} 70 | \item We proposed the trainable greedy decoding algorithm based on reinforcement learning, which brought the performance of greedy decoding close to that of complex search algorithms; 71 | \item We proposed the first-of-its-kind non-autoregressive neural machine translation that enables translation in parallel; 72 | \item We proposed simultaneous neural machine translation, which can start translating before reading the whole source sentence and achieves a balance between translation quality and delay. 73 | \end{enumerate} 74 | \end{itemize} 75 | 76 | \section{Thesis Outline} 77 | This dissertation attempts to tackle the two challenges discussed above with contributions in {\it data-efficient} (Part \rom{1}) and {\it decoding-efficient} (Part \rom{2}) neural machine translation. 78 | \begin{itemize} 79 | \item First of all, \textbf{ Chapter~\ref{background}} gives an overview of the background for this dissertation, including the modeling, training and decoding involved in building a general NMT model. 80 | \end{itemize} 81 | In Chapters \ref{copy}, \ref{seg-nmt}, \ref{ulr} and \ref{MetaNMT}, we address the data-efficiency challenges presented by existing NMT models and introduce insights based on the characteristics of the data. 82 | \begin{itemize} 83 | \item \textbf{Chapter~\ref{copy}} develops the copying mechanism as a new component that targets rote memories in general sequence-to-sequence (\sts) learning. The proposed \copynet also provides a general way in NMT to deal with out-of-vocabulary (OOV) words; 84 | \item \textbf{Chapter~\ref{seg-nmt}} uses a non-parametric search engine to retrieve similar translation pairs to guide the target translation of a standard NMT system. The introduced method can be seen as a direct derivation of \copynet for NMT, but forms a novel non-parametric NMT system that is able to work on translation for professional domains; 85 | \item \textbf{Chapter~\ref{ulr}} targets the other direction of data-inefficiency, and invents a universal NMT system for extremely low-resource languages. Different from prior works on multilingual translation, the proposed method specially designs two additional modules that enable a jointly-trained model to work well on extremely low-resource languages with only around $6,000$ sentences; 86 | \item \textbf{Chapter~\ref{MetaNMT}} extends the previously proposed universal NMT system to enable a pre-trained multilingual NMT model to adapt to any new language efficiently. Moreover, the use of meta-learning provides a principled way of finding a better initialization of parameters that is easy to fine-tune on new languages. 87 | \end{itemize} 88 | The second part, Chapters \ref{trainable}, \ref{nat} and \ref{simul}, deals with the decoding-efficiency challenges, for which we develop novel structures and learning algorithms. 89 | \begin{itemize} 90 | \item \textbf{Chapter~\ref{trainable}} recasts NMT decoding as a trainable process.
The proposed reinforcement-learning-based algorithm -- {\it trainable greedy decoding} -- not only helps a trained model to decode more efficiently with greedy decoding while achieving better translation quality, but also introduces an interesting direction where we can optimize the decoding towards any favorable objective. 91 | \item \textbf{Chapter~\ref{nat}} invents the non-autoregressive NMT system that enables translation in parallel. The proposed method, with {\it fertility} as latent variables and sequence-level knowledge distillation, is one of the first NMT systems able to fundamentally remove the restriction of word-by-word decoding in conventional NMT models while keeping a similar translation quality. 92 | \item \textbf{Chapter~\ref{simul}} develops the NMT model that learns to translate in real time using reinforcement learning. More precisely, we formulate simultaneous translation as a reinforcement learning setting where two actions -- read and write -- are taken on top of a pre-trained standard NMT model. The resulting model successfully achieves a balance between delay and quality. 93 | \end{itemize} 94 | Finally, we conclude and discuss future avenues for efficient NMT in \textbf{Chapter~10}. In the next chapter, we will introduce in detail the standard NMT models as the necessary background. 95 | 96 | 97 | \section{Open-source Implementation} 98 | Most pieces of the work included in this thesis have their own open-source implementations\footnote{All older code can be found at \url{https://github.com/MultiPath?tab=repositories}}. More precisely, 99 | for the state-of-the-art, combined implementation of all the techniques proposed in this thesis, 100 | please refer to \url{https://github.com/MultiPath/Squirrel}, which is still under frequent maintenance and contains implementations of both the basic neural machine translation model and our proposed efficient neural machine translation with both data- and decoding-efficiency. 101 | -------------------------------------------------------------------------------- /declare.tex: -------------------------------------------------------------------------------- 1 | \prefacesection{Declaration} 2 | I hereby declare that this thesis represents my own work, except where due acknowledgment is made, and that it has not been previously included in a thesis, dissertation or report submitted to this University or to any other institution for a degree, diploma or other qualifications. 3 | \\[2cm] 4 | \begin{flushright} 5 | \textit{Signed:} \underline{\hspace{6cm}}\\[.5cm] 6 | Jiatao Gu 7 | \end{flushright} 8 | \cleardoublepage % Force a break to an odd page 9 | -------------------------------------------------------------------------------- /dedication.tex: -------------------------------------------------------------------------------- 1 | \chapter*{} 2 | %\addcontentsline{toc}{chapter}{Abstract} 3 | \begin{dedication} 4 | \usefont{T1}{LobsterTwo-LF}{bx}{it} To my beloved parents.
5 | \end{dedication} 6 | \cleardoublepage -------------------------------------------------------------------------------- /defense_v0.12.ppt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MultiPath/Efficient-Neural-Machine-Translation/c301abbe9d70973950acde23c1bbd82d959d43e9/defense_v0.12.ppt -------------------------------------------------------------------------------- /figs/background/s2s_att.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MultiPath/Efficient-Neural-Machine-Translation/c301abbe9d70973950acde23c1bbd82d959d43e9/figs/background/s2s_att.pdf -------------------------------------------------------------------------------- /figs/background/transformer.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MultiPath/Efficient-Neural-Machine-Translation/c301abbe9d70973950acde23c1bbd82d959d43e9/figs/background/transformer.pdf -------------------------------------------------------------------------------- /figs/background/transformer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MultiPath/Efficient-Neural-Machine-Translation/c301abbe9d70973950acde23c1bbd82d959d43e9/figs/background/transformer.png -------------------------------------------------------------------------------- /figs/copynet/Picture1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MultiPath/Efficient-Neural-Machine-Translation/c301abbe9d70973950acde23c1bbd82d959d43e9/figs/copynet/Picture1.pdf -------------------------------------------------------------------------------- /figs/copynet/all_ds1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MultiPath/Efficient-Neural-Machine-Translation/c301abbe9d70973950acde23c1bbd82d959d43e9/figs/copynet/all_ds1.pdf -------------------------------------------------------------------------------- /figs/copynet/all_sum1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MultiPath/Efficient-Neural-Machine-Translation/c301abbe9d70973950acde23c1bbd82d959d43e9/figs/copynet/all_sum1.pdf -------------------------------------------------------------------------------- /figs/copynet/all_sum1a.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MultiPath/Efficient-Neural-Machine-Translation/c301abbe9d70973950acde23c1bbd82d959d43e9/figs/copynet/all_sum1a.pdf -------------------------------------------------------------------------------- /figs/copynet/all_sum2a.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MultiPath/Efficient-Neural-Machine-Translation/c301abbe9d70973950acde23c1bbd82d959d43e9/figs/copynet/all_sum2a.pdf -------------------------------------------------------------------------------- /figs/copynet/all_sum2b.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MultiPath/Efficient-Neural-Machine-Translation/c301abbe9d70973950acde23c1bbd82d959d43e9/figs/copynet/all_sum2b.pdf -------------------------------------------------------------------------------- /figs/copynet/all_sum3a.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/MultiPath/Efficient-Neural-Machine-Translation/c301abbe9d70973950acde23c1bbd82d959d43e9/figs/copynet/all_sum3a.pdf -------------------------------------------------------------------------------- /figs/copynet/all_sum3b.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MultiPath/Efficient-Neural-Machine-Translation/c301abbe9d70973950acde23c1bbd82d959d43e9/figs/copynet/all_sum3b.pdf -------------------------------------------------------------------------------- /figs/copynet/ds-exp-r2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MultiPath/Efficient-Neural-Machine-Translation/c301abbe9d70973950acde23c1bbd82d959d43e9/figs/copynet/ds-exp-r2.pdf -------------------------------------------------------------------------------- /figs/copynet/ds-exp-rX.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MultiPath/Efficient-Neural-Machine-Translation/c301abbe9d70973950acde23c1bbd82d959d43e9/figs/copynet/ds-exp-rX.pdf -------------------------------------------------------------------------------- /figs/copynet/ds-exp.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MultiPath/Efficient-Neural-Machine-Translation/c301abbe9d70973950acde23c1bbd82d959d43e9/figs/copynet/ds-exp.pdf -------------------------------------------------------------------------------- /figs/copynet/ds.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MultiPath/Efficient-Neural-Machine-Translation/c301abbe9d70973950acde23c1bbd82d959d43e9/figs/copynet/ds.pdf -------------------------------------------------------------------------------- /figs/copynet/example.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MultiPath/Efficient-Neural-Machine-Translation/c301abbe9d70973950acde23c1bbd82d959d43e9/figs/copynet/example.pdf -------------------------------------------------------------------------------- /figs/copynet/intro.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MultiPath/Efficient-Neural-Machine-Translation/c301abbe9d70973950acde23c1bbd82d959d43e9/figs/copynet/intro.pdf -------------------------------------------------------------------------------- /figs/copynet/model-b.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MultiPath/Efficient-Neural-Machine-Translation/c301abbe9d70973950acde23c1bbd82d959d43e9/figs/copynet/model-b.pdf -------------------------------------------------------------------------------- /figs/copynet/model-c.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MultiPath/Efficient-Neural-Machine-Translation/c301abbe9d70973950acde23c1bbd82d959d43e9/figs/copynet/model-c.pdf -------------------------------------------------------------------------------- /figs/copynet/model-x.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/MultiPath/Efficient-Neural-Machine-Translation/c301abbe9d70973950acde23c1bbd82d959d43e9/figs/copynet/model-x.pdf -------------------------------------------------------------------------------- [The remaining entries under /figs/ (copynet, hku, intro, meta, nat, seg, simultrans, trainable, ulr) are binary figure assets (PDF/PNG/JPG/EPS); each resolves to the corresponding raw.githubusercontent.com link at commit c301abbe and the individual links are omitted here.] --------------------------------------------------------------------------------
/fitch.sty: -------------------------------------------------------------------------------- 1 | % Macros for Fitch-style natural deduction. 2 | % Author: Peter Selinger, University of Ottawa 3 | % Created: Jan 14, 2002 4 | % Modified: Feb 8, 2005 5 | % Version: 0.5 6 | % Copyright: (C) 2002-2005 Peter Selinger 7 | % Filename: fitch.sty 8 | % Documentation: fitchdoc.tex 9 | % URL: http://quasar.mathstat.uottawa.ca/~selinger/fitch/ 10 | 11 | % License: 12 | % 13 | % This program is free software; you can redistribute it and/or modify 14 | % it under the terms of the GNU General Public License as published by 15 | % the Free Software Foundation; either version 2, or (at your option) 16 | % any later version. 17 | % 18 | % This program is distributed in the hope that it will be useful, but 19 | % WITHOUT ANY WARRANTY; without even the implied warranty of 20 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 21 | % General Public License for more details. 22 | % 23 | % You should have received a copy of the GNU General Public License 24 | % along with this program; if not, write to the Free Software Foundation, 25 | % Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
26 | 27 | % USAGE EXAMPLE: 28 | % 29 | % The following is a simple example illustrating the usage of this 30 | % package. For detailed instructions and additional functionality, see 31 | % the user guide, which can be found in the file fitchdoc.tex. 32 | % 33 | % \[ 34 | % \begin{nd} 35 | % \hypo{1} {P\vee Q} 36 | % \hypo{2} {\neg Q} 37 | % \open 38 | % \hypo{3a} {P} 39 | % \have{3b} {P} \r{3a} 40 | % \close 41 | % \open 42 | % \hypo{4a} {Q} 43 | % \have{4b} {\neg Q} \r{2} 44 | % \have{4c} {\bot} \ne{4a,4b} 45 | % \have{4d} {P} \be{4c} 46 | % \close 47 | % \have{5} {P} \oe{1,3a-3b,4a-4d} 48 | % \end{nd} 49 | % \] 50 | 51 | {\chardef\x=\catcode`\* 52 | \catcode`\*=11 53 | \global\let\nd*astcode\x} 54 | \catcode`\*=11 55 | 56 | % References 57 | 58 | \newcount\nd*ctr 59 | \def\nd*render{\expandafter\ifx\expandafter\nd*x\nd*base\nd*x\the\nd*ctr\else\nd*base\ifnum\nd*ctr<0\the\nd*ctr\else\ifnum\nd*ctr>0+\the\nd*ctr\fi\fi\fi} 60 | \expandafter\def\csname nd*-\endcsname{} 61 | 62 | \def\nd*num#1{\nd*numo{\nd*render}{#1}\global\advance\nd*ctr1} 63 | \def\nd*numopt#1#2{\nd*numo{$#1$}{#2}} 64 | \def\nd*numo#1#2{\edef\x{#1}\mbox{$\x$}\expandafter\global\expandafter\let\csname nd*-#2\endcsname\x} 65 | \def\nd*ref#1{\expandafter\let\expandafter\x\csname nd*-#1\endcsname\ifx\x\relax% 66 | \errmessage{Undefined natdeduction reference: #1}\else\mbox{$\x$}\fi} 67 | \def\nd*noop{} 68 | \def\nd*set#1#2{\ifx\relax#1\nd*noop\else\global\def\nd*base{#1}\fi\ifx\relax#2\relax\else\global\nd*ctr=#2\fi} 69 | \def\nd*reset{\nd*set{}{1}} 70 | \def\nd*refa#1{\nd*ref{#1}} 71 | \def\nd*aux#1#2{\ifx#2-\nd*refa{#1}--\def\nd*c{\nd*aux{}}% 72 | \else\ifx#2,\nd*refa{#1}, \def\nd*c{\nd*aux{}}% 73 | \else\ifx#2;\nd*refa{#1}; \def\nd*c{\nd*aux{}}% 74 | \else\ifx#2.\nd*refa{#1}. \def\nd*c{\nd*aux{}}% 75 | \else\ifx#2)\nd*refa{#1})\def\nd*c{\nd*aux{}}% 76 | \else\ifx#2(\nd*refa{#1}(\def\nd*c{\nd*aux{}}% 77 | \else\ifx#2\nd*end\nd*refa{#1}\def\nd*c{}% 78 | \else\def\nd*c{\nd*aux{#1#2}}% 79 | \fi\fi\fi\fi\fi\fi\fi\nd*c} 80 | \def\ndref#1{\nd*aux{}#1\nd*end} 81 | 82 | % Layer A 83 | 84 | % define various dimensions (explained in fitchdoc.tex): 85 | \newlength{\nd*dim} 86 | \newdimen\nd*depthdim 87 | \newdimen\nd*hsep 88 | \newdimen\ndindent 89 | \ndindent=1em 90 | % user command to redefine dimensions 91 | \def\nddim#1#2#3#4#5#6#7#8{\nd*depthdim=#3\relax\nd*hsep=#6\relax% 92 | \def\nd*height{#1}\def\nd*thickness{#8}\def\nd*initheight{#2}% 93 | \def\nd*indent{#5}\def\nd*labelsep{#4}\def\nd*justsep{#7}} 94 | % set initial dimensions 95 | \nddim{4.5ex}{3.5ex}{1.5ex}{1em}{1.6em}{.5em}{2.5em}{.2mm} 96 | 97 | \def\nd*v{\rule[-\nd*depthdim]{\nd*thickness}{\nd*height}} 98 | \def\nd*t{\rule[-\nd*depthdim]{0mm}{\nd*height}\rule[-\nd*depthdim]{\nd*thickness}{\nd*initheight}} 99 | \def\nd*i{\hspace{\nd*indent}} 100 | \def\nd*s{\hspace{\nd*hsep}} 101 | \def\nd*g#1{\nd*f{\makebox[\nd*indent][c]{$#1$}}} 102 | \def\nd*f#1{\raisebox{0pt}[0pt][0pt]{$#1$}} 103 | \def\nd*u#1{\makebox[0pt][l]{\settowidth{\nd*dim}{\nd*f{#1}}% 104 | \addtolength{\nd*dim}{2\nd*hsep}\hspace{-\nd*hsep}\rule[-\nd*depthdim]{\nd*dim}{\nd*thickness}}\nd*f{#1}} 105 | 106 | % Lists 107 | 108 | \def\nd*push#1#2{\expandafter\gdef\expandafter#1\expandafter% 109 | {\expandafter\nd*cons\expandafter{#1}{#2}}} 110 | \def\nd*pop#1{{\def\nd*nil{\gdef#1{\nd*nil}}\def\nd*cons##1##2% 111 | {\gdef#1{##1}}#1}} 112 | \def\nd*iter#1#2{{\def\nd*nil{}\def\nd*cons##1##2{##1#2{##2}}#1}} 113 | \def\nd*modify#1#2#3{{\def\nd*nil{\gdef#1{\nd*nil}}\def\nd*cons##1##2% 114 | {\advance#2-1 ##1\advance#2 1 
\ifnum#2=1\nd*push#1{#3}\else% 115 | \nd*push#1{##2}\fi}#1}} 116 | 117 | \def\nd*cont#1{{\def\nd*t{\nd*v}\def\nd*v{\nd*v}\def\nd*g##1{\nd*i}% 118 | \def\nd*i{\nd*i}\def\nd*nil{\gdef#1{\nd*nil}}\def\nd*cons##1##2% 119 | {##1\expandafter\nd*push\expandafter#1\expandafter{##2}}#1}} 120 | 121 | % Layer B 122 | 123 | \newcount\nd*n 124 | \def\nd*beginb{\begingroup\nd*reset\gdef\nd*stack{\nd*nil}\nd*push\nd*stack{\nd*t}% 125 | \begin{array}{l@{\hspace{\nd*labelsep}}l@{\hspace{\nd*justsep}}l}} 126 | \def\nd*resumeb{\begingroup\begin{array}{l@{\hspace{\nd*labelsep}}l@{\hspace{\nd*justsep}}l}} 127 | \def\nd*endb{\end{array}\endgroup} 128 | \def\nd*hypob#1#2{\nd*f{\nd*num{#1}}&\nd*iter\nd*stack\relax\nd*cont\nd*stack\nd*s\nd*u{#2}&} 129 | \def\nd*haveb#1#2{\nd*f{\nd*num{#1}}&\nd*iter\nd*stack\relax\nd*cont\nd*stack\nd*s\nd*f{#2}&} 130 | \def\nd*havecontb#1#2{&\nd*iter\nd*stack\relax\nd*cont\nd*stack\nd*s\nd*f{\hspace{\ndindent}#2}&} 131 | \def\nd*hypocontb#1#2{&\nd*iter\nd*stack\relax\nd*cont\nd*stack\nd*s\nd*u{\hspace{\ndindent}#2}&} 132 | 133 | \def\nd*openb{\nd*push\nd*stack{\nd*i}\nd*push\nd*stack{\nd*t}} 134 | \def\nd*closeb{\nd*pop\nd*stack\nd*pop\nd*stack} 135 | \def\nd*guardb#1#2{\nd*n=#1\multiply\nd*n by 2 \nd*modify\nd*stack\nd*n{\nd*g{#2}}} 136 | 137 | % Layer C 138 | 139 | \def\nd*clr{\gdef\nd*cmd{}\gdef\nd*typ{\relax}} 140 | \def\nd*sto#1#2#3{\gdef\nd*typ{#1}\gdef\nd*byt{}% 141 | \gdef\nd*cmd{\nd*typ{#2}{#3}\nd*byt\\}} 142 | \def\nd*chtyp{\expandafter\ifx\nd*typ\nd*hypocontb\def\nd*typ{\nd*havecontb}\else\def\nd*typ{\nd*haveb}\fi} 143 | \def\nd*hypoc#1#2{\nd*chtyp\nd*cmd\nd*sto{\nd*hypob}{#1}{#2}} 144 | \def\nd*havec#1#2{\nd*cmd\nd*sto{\nd*haveb}{#1}{#2}} 145 | \def\nd*hypocontc#1{\nd*chtyp\nd*cmd\nd*sto{\nd*hypocontb}{}{#1}} 146 | \def\nd*havecontc#1{\nd*cmd\nd*sto{\nd*havecontb}{}{#1}} 147 | \def\nd*by#1#2{\ifx\nd*x#2\nd*x\gdef\nd*byt{\mbox{#1}}\else\gdef\nd*byt{\mbox{#1, \ndref{#2}}}\fi} 148 | 149 | % multi-line macros 150 | \def\nd*mhypoc#1#2{\nd*mhypocA{#1}#2\\\nd*stop\\} 151 | \def\nd*mhypocA#1#2\\{\nd*hypoc{#1}{#2}\nd*mhypocB} 152 | \def\nd*mhypocB#1\\{\ifx\nd*stop#1\else\nd*hypocontc{#1}\expandafter\nd*mhypocB\fi} 153 | \def\nd*mhavec#1#2{\nd*mhavecA{#1}#2\\\nd*stop\\} 154 | \def\nd*mhavecA#1#2\\{\nd*havec{#1}{#2}\nd*mhavecB} 155 | \def\nd*mhavecB#1\\{\ifx\nd*stop#1\else\nd*havecontc{#1}\expandafter\nd*mhavecB\fi} 156 | \def\nd*mhypocontc#1{\nd*mhypocB#1\\\nd*stop\\} 157 | \def\nd*mhavecontc#1{\nd*mhavecB#1\\\nd*stop\\} 158 | 159 | \def\nd*beginc{\nd*beginb\nd*clr} 160 | \def\nd*resumec{\nd*resumeb\nd*clr} 161 | \def\nd*endc{\nd*cmd\nd*endb} 162 | \def\nd*openc{\nd*cmd\nd*clr\nd*openb} 163 | \def\nd*closec{\nd*cmd\nd*clr\nd*closeb} 164 | \let\nd*guardc\nd*guardb 165 | 166 | % Layer D 167 | 168 | % macros with optional arguments spelled-out 169 | \def\nd*hypod[#1][#2]#3[#4]#5{\ifx\relax#4\relax\else\nd*guardb{1}{#4}\fi\nd*mhypoc{#3}{#5}\nd*set{#1}{#2}} 170 | \def\nd*haved[#1][#2]#3[#4]#5{\ifx\relax#4\relax\else\nd*guardb{1}{#4}\fi\nd*mhavec{#3}{#5}\nd*set{#1}{#2}} 171 | \def\nd*havecont#1{\nd*mhavecontc{#1}} 172 | \def\nd*hypocont#1{\nd*mhypocontc{#1}} 173 | \def\nd*base{undefined} 174 | \def\nd*opend[#1]#2{\nd*cmd\nd*clr\nd*openb\nd*guard{#1}#2} 175 | \def\nd*close{\nd*cmd\nd*clr\nd*closeb} 176 | \def\nd*guardd[#1]#2{\nd*guardb{#1}{#2}} 177 | 178 | % Handling of optional arguments. 
179 | 180 | \def\nd*optarg#1#2#3{\ifx[#3\def\nd*c{#2#3}\else\def\nd*c{#2[#1]{#3}}\fi\nd*c} 181 | \def\nd*optargg#1#2#3{\ifx[#3\def\nd*c{#1#3}\else\def\nd*c{#2{#3}}\fi\nd*c} 182 | 183 | \def\nd*five#1{\nd*optargg{\nd*four{#1}}{\nd*two{#1}}} 184 | \def\nd*four#1[#2]{\nd*optarg{0}{\nd*three{#1}[#2]}} 185 | \def\nd*three#1[#2][#3]#4{\nd*optarg{}{#1[#2][#3]{#4}}} 186 | \def\nd*two#1{\nd*three{#1}[\relax][]} 187 | 188 | \def\nd*have{\nd*five{\nd*haved}} 189 | \def\nd*hypo{\nd*five{\nd*hypod}} 190 | \def\nd*open{\nd*optarg{}{\nd*opend}} 191 | \def\nd*guard{\nd*optarg{1}{\nd*guardd}} 192 | 193 | \def\nd*init{% 194 | \let\open\nd*open% 195 | \let\close\nd*close% 196 | \let\hypo\nd*hypo% 197 | \let\have\nd*have% 198 | \let\hypocont\nd*hypocont% 199 | \let\havecont\nd*havecont% 200 | \let\by\nd*by% 201 | \let\guard\nd*guard% 202 | \def\ii{\by{$\Rightarrow$I}}% 203 | \def\ie{\by{$\Rightarrow$E}}% 204 | \def\Ai{\by{$\forall$I}}% 205 | \def\Ae{\by{$\forall$E}}% 206 | \def\Ei{\by{$\exists$I}}% 207 | \def\Ee{\by{$\exists$E}}% 208 | \def\ai{\by{$\wedge$I}}% 209 | \def\ae{\by{$\wedge$E}}% 210 | \def\ai{\by{$\wedge$I}}% 211 | \def\ae{\by{$\wedge$E}}% 212 | \def\oi{\by{$\vee$I}}% 213 | \def\oe{\by{$\vee$E}}% 214 | \def\ni{\by{$\neg$I}}% 215 | \def\ne{\by{$\neg$E}}% 216 | \def\be{\by{$\bot$E}}% 217 | \def\nne{\by{$\neg\neg$E}}% 218 | \def\r{\by{R}}% 219 | } 220 | 221 | \newenvironment{nd}{\begingroup\nd*init\nd*beginc}{\nd*endc\endgroup} 222 | \newenvironment{ndresume}{\begingroup\nd*init\nd*resumec}{\nd*endc\endgroup} 223 | 224 | \catcode`\*=\nd*astcode 225 | 226 | % End of file fitch.sty 227 | 228 | -------------------------------------------------------------------------------- /macros.tex: -------------------------------------------------------------------------------- 1 | % (tweaks) 2 | \definecolor{darkred}{rgb}{0.5451, 0.0, 0.0} 3 | \definecolor{darkgreen}{rgb}{0.0, 0.3922, 0.0} 4 | 5 | \def\blue#1{\textcolor{blue}{#1}} 6 | \def\darkblue#1{\textcolor{blue}{#1}} 7 | \def\red#1{\textcolor{red}{#1}} 8 | \def\darkred#1{\textcolor{darkred}{#1}} 9 | \def\green#1{\textcolor{green}{#1}} 10 | \def\darkgreen#1{\textcolor{darkgreen}{#1}} 11 | \def\yellow#1{\textcolor{yellow}{#1}} 12 | \definecolor{burntorange}{HTML}{BF5700} 13 | \def\orange#1{\textcolor{burntorange}{#1}} 14 | \def\gray#1{\textcolor{gray}{#1}} 15 | \def\darkgray#1{\textcolor{darkgray}{#1}} 16 | 17 | \newcommand\sys[1]{\textsc{#1}} 18 | %\newcommand\w[1]{\textit{\darkgreen{#1}}} 19 | \newcommand\w[1]{\ww{#1}} % note: wrong for presentations! 
20 | \newcommand\ww[1]{\textit{#1}} 21 | \newcommand{\indentitem}{\setlength\itemindent{25pt}} 22 | \newcommand{\nth}{$^{\textrm{th}}$} 23 | 24 | \newcommand\denote[1]{\ensuremath{\llbracket\ww{#1}\rrbracket}} 25 | 26 | \newcommand\forward{\ensuremath{\sqsubseteq}} 27 | \newcommand\nforward{\ensuremath{\not\sqsubseteq}} 28 | \newcommand\reverse{\ensuremath{\sqsupseteq}} 29 | \newcommand\alternate{\ensuremath{\downharpoonleft\hspace{-1.25mm}\upharpoonright}} 30 | \newcommand\cover{\ensuremath{\smallsmile}} 31 | \newcommand\equivalent{\ensuremath{\equiv}} 32 | \newcommand\negate{\ensuremath{\curlywedge}} 33 | \newcommand\independent{\ensuremath{\#}} 34 | \newcommand\tagUp[1]{#1\ensuremath{^\uparrow}} 35 | \newcommand\tagDown[1]{#1\ensuremath{^\downarrow}} 36 | \newcommand\join{\ensuremath{\bowtie}} 37 | 38 | %\newcommand\h[1]{\textbf{#1}} 39 | \newcommand\hh[1]{\textbf{\textcolor[rgb]{0.5,0,0}{#1}}} 40 | \def\ent#1{\text{\small{\textsc{#1}}}} 41 | \def\typ#1{\textit{#1}} 42 | %\makeatletter 43 | %\newcommand{\xRightarrow}[2][]{\ext@arrow 0359\Rightarrowfill@{#1}{#2}} 44 | %\makeatother 45 | 46 | \def\checkmark{\tikz\fill[scale=0.4](0,.35) -- (.25,0) -- (1,.7) -- (.25,.15) -- cycle;} 47 | \newcommand{\xmark}{\textrm{\ding{55}}} 48 | %\newcommand{\valid}{\ensuremath{\Rightarrow}} 49 | %\newcommand{\invalid}{\ensuremath{\Rightarrow\lnot}} 50 | \newcommand{\valid}{\ensuremath{\Leftarrow}} 51 | \newcommand{\invalid}{\reflectbox{\ensuremath{\Rightarrow\lnot}}} 52 | \newcommand{\noop}{\textcolor{white}{NOOP}} 53 | \newcommand{\noopTab}{\begin{tabular}{c} \textcolor{white}{NOOP} \\ \textcolor{white}{NOOP} \end{tabular}} 54 | 55 | \newcommand\true[1]{\darkgreen{\checkmark\textit{#1}}} 56 | \newcommand\false[1]{\darkred{\xmark$~\,$\textit{#1}}} 57 | \newcommand\unknown[1]{?\orange{\textit{#1}}} 58 | 59 | \newcommand{\verticalcenter}[1]{\begingroup 60 | \setbox0=\hbox{#1}% 61 | \parbox{\wd0}{\box0}\endgroup} 62 | 63 | 64 | % 65 | % KBP MACROS 66 | % 67 | % A variable, abstracted (e.g., x) 68 | \def\var#1{\ensuremath{\mathbf{{#1}}}} 69 | % A variable instance (e.g., Obama) 70 | \def\vari#1{\ensuremath{#1}} 71 | 72 | 73 | % Aliases 74 | \def\reverb{ReVerb} 75 | \def\hydra{Stanford KBP} 76 | \def\sys#1{#1} 77 | \def\knowbot{\sys{Knowbot}} 78 | 79 | 80 | % KBP Specific 81 | % An entity 82 | \def\ent#1{\text{\small{\textsc{#1}}}} 83 | 84 | % An extraction, e.g., "Obama born_in Hawaii" 85 | \newcommand\extr[3]{\mbox{\ent{#1}\ $~$\rel{#2}\ $~$\ent{#3}}} 86 | \newcommand\triple[3]{(\mbox{\ent{#1}; $~$\rel{#2}; $~$\ent{#3}})} 87 | % A clause in a logical form, e.g., "born_in(Obama, Hawaii)" 88 | \newcommand\clause[3]{\mbox{\rel{#2}\ensuremath{(#1, #3)}}} 89 | 90 | \newcommand\subj[1]{\textcolor{darkblue}{#1}} 91 | \newcommand\obj[1]{\textcolor{burntorange}{#1}} 92 | \newcommand\rel[1]{\textrm{#1}} 93 | 94 | \def\blue#1{\textcolor{blue}{#1}} 95 | \def\red#1{\textcolor{red}{#1}} 96 | \def\green#1{\textcolor{green}{#1}} 97 | \def\yellow#1{\textcolor{yellow}{#1}} 98 | 99 | \newcommand\posterline{ 100 | \begin{center} 101 | \noindent\rule{10cm}{0.4pt} 102 | \end{center} 103 | } 104 | 105 | \newcommand\entailmentExample[2]{ 106 | \vspace{0.5cm} 107 | \noindent \hspace{0.5cm}\begin{tabular}{lp{0.80\textwidth}} 108 | \textbf{P}: & \hspace*{-1mm}\textit{#1} \\ 109 | \textbf{H}: & \hspace*{-1mm}\textit{#2} 110 | \end{tabular} 111 | \vspace{0.5cm} 112 | } 113 | 114 | \tikzset{ 115 | invisible/.style={opacity=0}, 116 | visible on/.style={alt=#1{}{invisible}}, 117 | alt/.code args={<#1>#2#3}{% 118 | 
\alt<#1>{\pgfkeysalso{#2}}{\pgfkeysalso{#3}} % \pgfkeysalso doesn't change the path 119 | }, 120 | } 121 | 122 | \newenvironment{lquote}{% 123 | \list{}{% 124 | \rightmargin0pt}% 125 | \item\relax 126 | } 127 | {\endlist} 128 | 129 | \newcommand\circled[1]{\tikz[baseline=(char.base)]{ 130 | \node[shape=circle,draw=darkred,inner sep=2pt] (char) {#1};} 131 | } 132 | \newcommand\noncircled[1]{\tikz[baseline=(char.base)]{ 133 | \node[shape=circle,draw=white,inner sep=2pt] (char) {#1};} 134 | } 135 | 136 | 137 | \newcommand{\hnode}[1]{|(#1)| \w{#1}} 138 | \newcommand{\rnode}[2]{|(#1#2)| \w{\textcolor{darkblue}{\textbf{#1}}}\textcolor{white}{#2}} 139 | \newcommand{\bnode}[2]{|(#1#2)| \w{\textcolor{darkblue}{\textcolor{darkred}{\textbf{#1}}}}\textcolor{white}{#2}} 140 | 141 | 142 | \newcommand\longcaption[2]{\caption[#1]{#2}} 143 | -------------------------------------------------------------------------------- /preface.tex: -------------------------------------------------------------------------------- 1 | \chapter*{} 2 | \addcontentsline{toc}{chapter}{Abstract} 3 | \vspace{-140pt} 4 | \begin{center} 5 | \large 6 | Abstract of thesis entitled \par 7 | \Large 8 | ``{\bf Efficient Neural Machine Translation}'' \par 9 | \large 10 | \vspace*{0.2in} 11 | Submitted by \par 12 | Jiatao Gu \par 13 | \vspace*{0.2in} 14 | for the degree of Doctor of Philosophy \\ 15 | at The University of Hong Kong \\ 16 | in August 2018 17 | \end{center} 18 | 19 | 20 | 21 | The dream of automatic translation, building a bridge of communication between people of different civilizations, dates back thousands of years. 22 | Over the past decades, researchers have devoted themselves to developing practical approaches, from rule-based machine translation to statistical machine translation. 23 | In recent years, with the general success of artificial intelligence (AI) and the emergence of neural network models, a.k.a. deep learning, neural machine translation (NMT), 24 | the new generation of machine translation framework based on sequence-to-sequence learning, has achieved state-of-the-art and even human-level translation performance on a variety of languages. 25 | 26 | The impressive achievements brought by NMT are mainly due to its deep neural network structure with a massive number of parameters, 27 | which can be efficiently tuned on vast volumes of parallel data, on the order of tens or hundreds of millions of sentences. 28 | Unfortunately, in spite of their success, neural systems also bring new challenges to machine translation, among which one of the central problems is efficiency. 29 | The efficiency issue involves two aspects: 30 | (1) NMT is data-hungry because of its vast number of parameters, which makes training a reasonable model difficult in practice in low-resource cases. 31 | For instance, most human languages do not have enough parallel data with other languages to learn an NMT model. 32 | Moreover, documents in specialized domains such as law or medicine usually contain large numbers of professional translations, which NMT learns from less efficiently; 33 | (2) NMT is computationally slow compared to conventional methods due to its deep structure and the limitations of its decoding algorithms. 34 | In particular, the low efficiency at inference time profoundly affects real-life applications and the smoothness of communication. 35 | In some cases, such as video conferencing, we also hope the neural system can translate in real time, which, however, is difficult for existing NMT models.
36 | 37 | This dissertation attempts to tackle these two challenges. 38 | The contributions are twofold: 39 | (1) We address the data-efficiency challenges presented by existing NMT models and introduce insights based on the characteristics of the data, including 40 | (a) developing the copy mechanism to handle rote memorization in translation and general sequence-to-sequence learning; 41 | (b) using a non-parametric search engine to guide the NMT system to perform well in specialized domains; 42 | (c) inventing a universal NMT system for extremely low-resource languages; 43 | (d) extending the universal NMT system to adapt efficiently to new languages by combining it with meta-learning. 44 | (2) For the decoding-efficiency challenges, we develop novel structures and learning algorithms, including 45 | (a) recasting the decoding of NMT in a trainable manner to achieve state-of-the-art performance in less time; 46 | (b) inventing the non-autoregressive NMT system, which enables translation in parallel; 47 | (c) developing an NMT model that learns to translate in real time using reinforcement learning.\\ 48 | (447 words) 49 | -------------------------------------------------------------------------------- /publications.tex: -------------------------------------------------------------------------------- 1 | \addcontentsline{toc}{chapter}{List of Publications} 2 | \chapter*{List of Publications} 3 | 4 | \section*{Conference Papers} 5 | 6 | \begin{enumerate} 7 | \item \textbf{Jiatao Gu}, Hany Hassan, Jacob Devlin, and Victor O.K.
Li, ``Universal Neural Machine Translation for Extremely Low Resource Languages,'' in \emph{The Sixteenth Annual Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT 2018)}. 8 | \item \textbf{Jiatao Gu}, James Bradbury, Caiming Xiong, Victor O.K. Li, and Richard Socher, ``Non-autoregressive Neural Machine Translation,'' in \emph{Sixth International Conference on Learning Representations (ICLR 2018)}. 9 | \item \textbf{Jiatao Gu}, Daniel Jiwoong Im, and Victor O.K. Li, ``Neural Machine Translation with Gumbel-Greedy Decoding,'' in \emph{The Thirty-Second AAAI Conference on Artificial Intelligence (AAAI 2018)}. 10 | \item \textbf{Jiatao Gu}, Yong Wang, Kyunghyun Cho, and Victor O.K. Li, ``Search Engine Guided Neural Machine Translation,'' in \emph{The Thirty-Second AAAI Conference on Artificial Intelligence (AAAI 2018)}. 11 | \item \textbf{Jiatao Gu}, Kyunghyun Cho, and Victor O.K. Li, ``Trainable Greedy Decoding for Neural Machine Translation,'' in \emph{The Conference on Empirical Methods in Natural Language Processing (EMNLP 2017)}. 12 | \item Victor O.K. Li, Jacqueline C.K. Lam, Yun Chen, and \textbf{Jiatao Gu}, ``Deep Learning Model to Estimate Air Pollution Using M-BP to Fill in Missing Proxy Urban Data,'' in \emph{GLOBECOM 2017}. 13 | \item \textbf{Jiatao Gu}, Graham Neubig, Kyunghyun Cho, and Victor O.K. Li, ``Learning to Translate in Real-time with Neural Machine Translation,'' in \emph{The Fifteenth Conference of the European Chapter of the Association for Computational Linguistics (EACL 2017)}. 14 | \item \textbf{Jiatao Gu}, Zhengdong Lu, Hang Li, and Victor O.K. Li, ``Incorporating Copying Mechanism in Sequence-to-Sequence Learning,'' in \emph{The Fifty-Fourth Annual Meeting of the Association for Computational Linguistics (ACL 2016)}. 15 | \item \textbf{Jiatao Gu} and Victor O.K. Li, ``Efficient Learning for Undirected Topic Models,'' in \emph{The Fifty-Third Annual Meeting of the Association for Computational Linguistics (ACL 2015), Short paper}. 16 | \end{enumerate} 17 | 18 |
19 | 20 | 21 | \section*{Workshop Papers} 22 | 23 | \begin{enumerate} 24 | \item \textbf{Jiatao Gu}, Baotian Hu, Zhengdong Lu, Hang Li, and Victor O.K. Li, ``Guided Sequence-to-Sequence Learning with External Rule Memory,'' in \emph{Fourth International Conference on Learning Representations (ICLR 2016), Workshop}. 25 | \end{enumerate} 26 | 27 | \section*{Journal Articles} 28 | 29 | \begin{enumerate} 30 | \item James J.Q. Yu, David J. Hill, Albert Y.S. Lam, \textbf{Jiatao Gu}, and Victor O.K. Li, ``Intelligent Time-Adaptive Transient Stability Assessment System,'' in \emph{IEEE Transactions on Power Systems}. 31 | \end{enumerate} 32 | 33 | \section*{Under-review Articles} 34 | 35 | \begin{enumerate} 36 | \item \textbf{Jiatao Gu}, Yong Wang, Yun Chen, Victor O.K. Li, and Kyunghyun Cho, ``Meta-Learning for Low-Resource Language Neural Machine Translation,'' under review in \emph{EMNLP 2018}. 37 | \end{enumerate} -------------------------------------------------------------------------------- /std-macros.tex: -------------------------------------------------------------------------------- 1 | % version 1.2 05/21/08 2 | \newcommand\sa{\ensuremath{\mathcal{a}}} 3 | \newcommand\sd{\ensuremath{\mathcal{d}}} 4 | \newcommand\se{\ensuremath{\mathcal{e}}} 5 | \newcommand\sg{\ensuremath{\mathcal{g}}} 6 | \newcommand\sh{\ensuremath{\mathcal{h}}} 7 | \newcommand\seye{\ensuremath{\mathcal{i}}} 8 | \newcommand\sj{\ensuremath{\mathcal{j}}} 9 | \newcommand\sk{\ensuremath{\mathcal{k}}} 10 | \newcommand\sm{\ensuremath{\mathcal{m}}} 11 | \newcommand\sn{\ensuremath{\mathcal{n}}} 12 | \newcommand\so{\ensuremath{\mathcal{o}}} 13 | \newcommand\sq{\ensuremath{\mathcal{q}}} 14 | \newcommand\sr{\ensuremath{\mathcal{r}}} 15 | \newcommand\st{\ensuremath{\mathcal{t}}} 16 | \newcommand\su{\ensuremath{\mathcal{u}}} 17 | \newcommand\sv{\ensuremath{\mathcal{v}}} 18 | \newcommand\sw{\ensuremath{\mathcal{w}}} 19 | \newcommand\sx{\ensuremath{\mathcal{x}}} 20 | \newcommand\sy{\ensuremath{\mathcal{y}}} 21 | \newcommand\sz{\ensuremath{\mathcal{z}}} 22 | \newcommand\sA{\ensuremath{\mathcal{A}}} 23 | \newcommand\sB{\ensuremath{\mathcal{B}}} 24 | \newcommand\sC{\ensuremath{\mathcal{C}}} 25 | \newcommand\sD{\ensuremath{\mathcal{D}}} 26 | \newcommand\sE{\ensuremath{\mathcal{E}}} 27 | \newcommand\sF{\ensuremath{\mathcal{F}}} 28 | \newcommand\sG{\ensuremath{\mathcal{G}}} 29 | \newcommand\sH{\ensuremath{\mathcal{H}}} 30 | \newcommand\sI{\ensuremath{\mathcal{I}}} 31 | \newcommand\sJ{\ensuremath{\mathcal{J}}} 32 | \newcommand\sK{\ensuremath{\mathcal{K}}} 33 | \newcommand\sL{\ensuremath{\mathcal{L}}} 34 | \newcommand\sM{\ensuremath{\mathcal{M}}} 35 | \newcommand\sN{\ensuremath{\mathcal{N}}} 36 | \newcommand\sO{\ensuremath{\mathcal{O}}} 37 | \newcommand\sP{\ensuremath{\mathcal{P}}} 38 | \newcommand\sQ{\ensuremath{\mathcal{Q}}} 39 | \newcommand\sR{\ensuremath{\mathcal{R}}} 40 | \newcommand\sS{\ensuremath{\mathcal{S}}} 41 | \newcommand\sT{\ensuremath{\mathcal{T}}} 42 | \newcommand\sU{\ensuremath{\mathcal{U}}} 43 | \newcommand\sV{\ensuremath{\mathcal{V}}} 44 | \newcommand\sW{\ensuremath{\mathcal{W}}} 45 | \newcommand\sX{\ensuremath{\mathcal{X}}} 46 | \newcommand\sY{\ensuremath{\mathcal{Y}}} 47 | \newcommand\sZ{\ensuremath{\mathcal{Z}}} 48 | \newcommand\ba{\ensuremath{\mathbf{a}}} 49 | \newcommand\bb{\ensuremath{\mathbf{b}}} 50 | \newcommand\bc{\ensuremath{\mathbf{c}}} 51 | \newcommand\bd{\ensuremath{\mathbf{d}}} 52 | \newcommand\be{\ensuremath{\mathbf{e}}} 53 | \newcommand\bef{\ensuremath{\mathbf{f}}} 54 | \newcommand\bg{\ensuremath{\mathbf{g}}}
55 | \newcommand\bh{\ensuremath{\mathbf{h}}} 56 | \newcommand\bi{\ensuremath{\mathbf{i}}} 57 | \newcommand\bj{\ensuremath{\mathbf{j}}} 58 | \newcommand\bk{\ensuremath{\mathbf{k}}} 59 | \newcommand\bl{\ensuremath{\mathbf{l}}} 60 | \newcommand\bn{\ensuremath{\mathbf{n}}} 61 | \newcommand\bo{\ensuremath{\mathbf{o}}} 62 | \newcommand\bp{\ensuremath{\mathbf{p}}} 63 | \newcommand\bq{\ensuremath{\mathbf{q}}} 64 | \newcommand\br{\ensuremath{\mathbf{r}}} 65 | \newcommand\bs{\ensuremath{\mathbf{s}}} 66 | \newcommand\bt{\ensuremath{\mathbf{t}}} 67 | \newcommand\bu{\ensuremath{\mathbf{u}}} 68 | \newcommand\bv{\ensuremath{\mathbf{v}}} 69 | \newcommand\bw{\ensuremath{\mathbf{w}}} 70 | \newcommand\bx{\ensuremath{\mathbf{x}}} 71 | \newcommand\by{\ensuremath{\mathbf{y}}} 72 | \newcommand\bz{\ensuremath{\mathbf{z}}} 73 | \newcommand\bA{\ensuremath{\mathbf{A}}} 74 | \newcommand\bB{\ensuremath{\mathbf{B}}} 75 | \newcommand\bC{\ensuremath{\mathbf{C}}} 76 | \newcommand\bD{\ensuremath{\mathbf{D}}} 77 | \newcommand\bE{\ensuremath{\mathbf{E}}} 78 | \newcommand\bF{\ensuremath{\mathbf{F}}} 79 | \newcommand\bG{\ensuremath{\mathbf{G}}} 80 | \newcommand\bH{\ensuremath{\mathbf{H}}} 81 | \newcommand\bI{\ensuremath{\mathbf{I}}} 82 | \newcommand\bJ{\ensuremath{\mathbf{J}}} 83 | \newcommand\bK{\ensuremath{\mathbf{K}}} 84 | \newcommand\bL{\ensuremath{\mathbf{L}}} 85 | \newcommand\bM{\ensuremath{\mathbf{M}}} 86 | \newcommand\bN{\ensuremath{\mathbf{N}}} 87 | \newcommand\bO{\ensuremath{\mathbf{O}}} 88 | \newcommand\bP{\ensuremath{\mathbf{P}}} 89 | \newcommand\bQ{\ensuremath{\mathbf{Q}}} 90 | \newcommand\bR{\ensuremath{\mathbf{R}}} 91 | \newcommand\bS{\ensuremath{\mathbf{S}}} 92 | \newcommand\bT{\ensuremath{\mathbf{T}}} 93 | \newcommand\bU{\ensuremath{\mathbf{U}}} 94 | \newcommand\bV{\ensuremath{\mathbf{V}}} 95 | \newcommand\bW{\ensuremath{\mathbf{W}}} 96 | \newcommand\bX{\ensuremath{\mathbf{X}}} 97 | \newcommand\bY{\ensuremath{\mathbf{Y}}} 98 | \newcommand\bZ{\ensuremath{\mathbf{Z}}} 99 | \newcommand\Ba{\ensuremath{\mathbb{a}}} 100 | \newcommand\Bb{\ensuremath{\mathbb{b}}} 101 | \newcommand\Bc{\ensuremath{\mathbb{c}}} 102 | \newcommand\Bd{\ensuremath{\mathbb{d}}} 103 | \newcommand\Be{\ensuremath{\mathbb{e}}} 104 | \newcommand\Bf{\ensuremath{\mathbb{f}}} 105 | \newcommand\Bg{\ensuremath{\mathbb{g}}} 106 | \newcommand\Bh{\ensuremath{\mathbb{h}}} 107 | \newcommand\Bi{\ensuremath{\mathbb{i}}} 108 | \newcommand\Bj{\ensuremath{\mathbb{j}}} 109 | \newcommand\Bk{\ensuremath{\mathbb{k}}} 110 | \newcommand\Bl{\ensuremath{\mathbb{l}}} 111 | \newcommand\Bm{\ensuremath{\mathbb{m}}} 112 | \newcommand\Bn{\ensuremath{\mathbb{n}}} 113 | \newcommand\Bo{\ensuremath{\mathbb{o}}} 114 | \newcommand\Bp{\ensuremath{\mathbb{p}}} 115 | \newcommand\Bq{\ensuremath{\mathbb{q}}} 116 | \newcommand\Br{\ensuremath{\mathbb{r}}} 117 | \newcommand\Bs{\ensuremath{\mathbb{s}}} 118 | \newcommand\Bt{\ensuremath{\mathbb{t}}} 119 | \newcommand\Bu{\ensuremath{\mathbb{u}}} 120 | \newcommand\Bv{\ensuremath{\mathbb{v}}} 121 | \newcommand\Bw{\ensuremath{\mathbb{w}}} 122 | \newcommand\Bx{\ensuremath{\mathbb{x}}} 123 | \newcommand\By{\ensuremath{\mathbb{y}}} 124 | \newcommand\Bz{\ensuremath{\mathbb{z}}} 125 | \newcommand\BA{\ensuremath{\mathbb{A}}} 126 | \newcommand\BB{\ensuremath{\mathbb{B}}} 127 | \newcommand\BC{\ensuremath{\mathbb{C}}} 128 | \newcommand\BD{\ensuremath{\mathbb{D}}} 129 | \newcommand\BE{\ensuremath{\mathbb{E}}} 130 | \newcommand\BF{\ensuremath{\mathbb{F}}} 131 | \newcommand\BG{\ensuremath{\mathbb{G}}} 132 | \newcommand\BH{\ensuremath{\mathbb{H}}} 133 | 
\newcommand\BI{\ensuremath{\mathbb{I}}} 134 | \newcommand\BJ{\ensuremath{\mathbb{J}}} 135 | \newcommand\BK{\ensuremath{\mathbb{K}}} 136 | \newcommand\BL{\ensuremath{\mathbb{L}}} 137 | \newcommand\BM{\ensuremath{\mathbb{M}}} 138 | \newcommand\BN{\ensuremath{\mathbb{N}}} 139 | \newcommand\BO{\ensuremath{\mathbb{O}}} 140 | \newcommand\BP{\ensuremath{\mathbb{P}}} 141 | \newcommand\BQ{\ensuremath{\mathbb{Q}}} 142 | \newcommand\BR{\ensuremath{\mathbb{R}}} 143 | \newcommand\BS{\ensuremath{\mathbb{S}}} 144 | \newcommand\BT{\ensuremath{\mathbb{T}}} 145 | \newcommand\BU{\ensuremath{\mathbb{U}}} 146 | \newcommand\BV{\ensuremath{\mathbb{V}}} 147 | \newcommand\BW{\ensuremath{\mathbb{W}}} 148 | \newcommand\BX{\ensuremath{\mathbb{X}}} 149 | \newcommand\BY{\ensuremath{\mathbb{Y}}} 150 | \newcommand\BZ{\ensuremath{\mathbb{Z}}} 151 | \newcommand\balpha{\ensuremath{\mbox{\boldmath$\alpha$}}} 152 | \newcommand\bbeta{\ensuremath{\mbox{\boldmath$\beta$}}} 153 | \newcommand\btheta{\ensuremath{\mbox{\boldmath$\theta$}}} 154 | \newcommand\bphi{\ensuremath{\mbox{\boldmath$\phi$}}} 155 | \newcommand\bpi{\ensuremath{\mbox{\boldmath$\pi$}}} 156 | \newcommand\bpsi{\ensuremath{\mbox{\boldmath$\psi$}}} 157 | \newcommand\bmu{\ensuremath{\mbox{\boldmath$\mu$}}} 158 | % Basic 159 | \newcommand\T{\text} 160 | \newcommand\sign{\text{sign}} 161 | \newcommand\tr{\text{tr}} 162 | \newcommand\fig[1]{\begin{center} \includegraphics{#1} \end{center}} 163 | \newcommand\Fig[5]{\begin{figure}[tb] \begin{center} \includegraphics[scale=#2]{#1} \end{center} \longcaption{#4}{\label{fig:#3} #5} \end{figure}} 164 | \newcommand\FigTop[4]{\begin{figure}[t] \begin{center} \includegraphics[scale=#2]{#1} \end{center} \caption{\label{fig:#3} #4} \end{figure}} 165 | \newcommand\FigStar[4]{\begin{figure*}[tb] \begin{center} \includegraphics[scale=#2]{#1} \end{center} \caption{\label{fig:#3} #4} \end{figure*}} 166 | \newcommand\aside[1]{\quad\text{[#1]}} 167 | \newcommand\homework[3]{\title{#1} \author{#2} \date{#3} \maketitle} 168 | % Math 169 | \newcommand\argmin{\mathop{\text{argmin}}} 170 | \newcommand\argmax{\mathop{\text{argmax}}} 171 | \newcommand\p[1]{\ensuremath{\left( #1 \right)}} % Parenthesis () 172 | \newcommand\pb[1]{\ensuremath{\left[ #1 \right]}} % [] 173 | \newcommand\pc[1]{\ensuremath{\left\{ #1 \right\}}} % {} 174 | \newcommand\eval[2]{\ensuremath{\left. 
#1 \right|_{#2}}} % Evaluation 175 | \newcommand\inv[1]{\ensuremath{\frac{1}{#1}}} 176 | \newcommand\half{\ensuremath{\frac{1}{2}}} 177 | \newcommand\R{\ensuremath{\mathbb{R}}} % Real numbers 178 | \newcommand\Z{\ensuremath{\mathbb{Z}}} % Integers 179 | \newcommand\inner[2]{\ensuremath{\left< #1, #2 \right>}} % Inner product 180 | \newcommand\mat[2]{\ensuremath{\left(\begin{array}{#1}#2\end{array}\right)}} % Matrix 181 | \newcommand\eqn[1]{\begin{eqnarray} #1 \end{eqnarray}} % Equation (array) 182 | \newcommand\eqnl[2]{\begin{eqnarray} \label{eqn:#1} #2 \end{eqnarray}} % Equation (array) with label 183 | \newcommand\eqdef{\ensuremath{\stackrel{\rm def}{=}}} % Equal by definition 184 | %\newcommand{\1}{\mathbb{I}} % Indicator (don't use \mathbbm{1} because bbm is not TrueType though) 185 | \newcommand{\1}{\ensuremath{\mathbbm{1}}} 186 | \newcommand{\bone}{\mathbf{1}} % for vector one 187 | \newcommand{\bzero}{\mathbf{0}} % for vector zero 188 | \newcommand\refeqn[1]{(\ref{eqn:#1})} 189 | \newcommand\refeqns[2]{(\ref{eqn:#1}) and (\ref{eqn:#2})} 190 | \newcommand\refchp[1]{Chapter~\ref{chp:#1}} 191 | \newcommand\refsec[1]{Section~\ref{sec:#1}} 192 | \newcommand\refsecs[2]{Sections~\ref{sec:#1} and~\ref{sec:#2}} 193 | \newcommand\reffig[1]{Figure~\ref{fig:#1}} 194 | \newcommand\reffigs[2]{Figures~\ref{fig:#1} and~\ref{fig:#2}} 195 | \newcommand\reffigss[3]{Figures~\ref{fig:#1},~\ref{fig:#2}, and~\ref{fig:#3}} 196 | \newcommand\reffigsss[4]{Figures~\ref{fig:#1},~\ref{fig:#2},~\ref{fig:#3}, and~\ref{fig:#4}} 197 | \newcommand\reftab[1]{Table~\ref{tab:#1}} 198 | \newcommand\refapp[1]{Appendix~\ref{sec:#1}} 199 | \newcommand\refthm[1]{Theorem~\ref{thm:#1}} 200 | \newcommand\refthms[2]{Theorems~\ref{thm:#1} and~\ref{thm:#2}} 201 | \newcommand\reflem[1]{Lemma~\ref{lem:#1}} 202 | \newcommand\reflems[2]{Lemmas~\ref{lem:#1} and~\ref{lem:#2}} 203 | \newcommand\refprop[1]{Proposition~\ref{prop:#1}} 204 | \newcommand\refdef[1]{Definition~\ref{def:#1}} 205 | \newcommand\refcor[1]{Corollary~\ref{cor:#1}} 206 | \newcommand\refalg[1]{Algorithm~\ref{alg:#1}} 207 | \newcommand\Chapter[2]{\chapter{#2}\label{chp:#1}} 208 | \newcommand\Section[2]{\section{#2}\label{sec:#1}} 209 | \newcommand\Subsection[2]{\subsection{#2}\label{sec:#1}} 210 | \newcommand\Subsubsection[2]{\subsubsection{#2}\label{sec:#1}} 211 | %\newtheorem{definition}{Definition} 212 | %\newtheorem{assumption}{Assumption} 213 | %\newtheorem{proposition}{Proposition} 214 | %\newtheorem{theorem}{Theorem} 215 | %\newtheorem{lemma}{Lemma} 216 | %\newtheorem{corollary}{Corollary} 217 | % Probability 218 | \newcommand\cv{\ensuremath{\to}} % Convergence 219 | \newcommand\cvL{\ensuremath{\xrightarrow{\mathcal{L}}}} % Convergence in law 220 | \newcommand\cvd{\ensuremath{\xrightarrow{d}}} % Convergence in distribution 221 | \newcommand\cvP{\ensuremath{\xrightarrow{P}}} % Convergence in probability 222 | \newcommand\cvas{\ensuremath{\xrightarrow{a.s.}}} % Convergence almost surely 223 | \newcommand\eqdistrib{\ensuremath{\stackrel{d}{=}}} % Equal in distribution 224 | \newcommand\E[1]{\ensuremath{\mathbb{E}{\left[#1\right]}}} % Expectation 225 | \newcommand\Ex[2]{\ensuremath{\mathbb{E}_{#1}\left[#2\right]}} % Expectation 226 | %\newcommand\var{\ensuremath{\text{var}}} % Variance 227 | \newcommand\cov{\ensuremath{\text{cov}}} % Covariance 228 | \newcommand\diag{\ensuremath{\text{diag}}} % Diagonal matrix 229 | \newcommand\cE[2]{\ensuremath{\E \left( #1 \mid #2 \right)}} % Conditional expectation 230 | \newcommand\KL[2]{\ensuremath{\T{KL}\left( #1 \,||\, #2
\right)}} % KL-divergence 231 | \newcommand\D[2]{\ensuremath{\bD\left( #1 \,||\, #2 \right)}} % KL-divergence 232 | 233 | % Utilities 234 | \newcommand\lte{\leq} 235 | \newcommand\gte{\geq} 236 | \newcommand\lone[1]{\ensuremath{\|#1\|_1}} 237 | \newcommand\ltwo[1]{\ensuremath{\|#1\|_2^2}} 238 | \newcommand\naive{na\"{\i}ve} 239 | \newcommand\Naive{Na\"{\i}ve} 240 | 241 | % Debug 242 | \usepackage{color} 243 | \newcommand{\tred}[1]{\textcolor{red}{#1}} 244 | \newcommand{\hl}[2]{\colorbox{#2}{#1}} 245 | \newcommand{\hly}[1]{\hl{yellow}{#1}} 246 | \def\todo#1{\hl{{\bf TODO:} #1}{yellow}} 247 | \def\needcite{\hl{{$^{\tt\small[citation\ needed]}$}}{blue}} 248 | \def\needfig{\hl{Figure X}{green}} 249 | \def\needtab{\hl{Table Y}{green}} 250 | \def\note#1{\hl{{\bf NOTE:} #1}{yellow}} 251 | \def\dome{\hl{{\bf TODO:} write me!}{yellow}} 252 | -------------------------------------------------------------------------------- /std.tex: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MultiPath/Efficient-Neural-Machine-Translation/c301abbe9d70973950acde23c1bbd82d959d43e9/std.tex -------------------------------------------------------------------------------- /submit_form1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MultiPath/Efficient-Neural-Machine-Translation/c301abbe9d70973950acde23c1bbd82d959d43e9/submit_form1.pdf -------------------------------------------------------------------------------- /thesis.2.0.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MultiPath/Efficient-Neural-Machine-Translation/c301abbe9d70973950acde23c1bbd82d959d43e9/thesis.2.0.pdf -------------------------------------------------------------------------------- /thesis.blg: -------------------------------------------------------------------------------- 1 | This is BibTeX, Version 0.99d (TeX Live 2014) 2 | Capacity: max_strings=35307, hash_size=35307, hash_prime=30011 3 | The top-level auxiliary file: thesis.aux 4 | The style file: acl.bst 5 | Database file #1: ref.bib 6 | Warning--I didn't find a database entry for "alexis2018word" 7 | Warning--empty booktitle in dong2015multi 8 | Warning--empty journal in viegas2016google 9 | Warning--empty booktitle in Zhang2005AnEP 10 | You've used 143 entries, 11 | 2781 wiz_defined-function locations, 12 | 1215 strings with 31769 characters, 13 | and the built_in function-call counts, 95976 in all, are: 14 | = -- 7216 15 | > -- 6172 16 | < -- 60 17 | + -- 2058 18 | - -- 1873 19 | * -- 7360 20 | := -- 15210 21 | add.period$ -- 601 22 | call.type$ -- 143 23 | change.case$ -- 1428 24 | chr.to.int$ -- 129 25 | cite$ -- 146 26 | duplicate$ -- 7149 27 | empty$ -- 5920 28 | format.name$ -- 2326 29 | if$ -- 18764 30 | int.to.chr$ -- 15 31 | int.to.str$ -- 0 32 | missing$ -- 1414 33 | newline$ -- 717 34 | num.names$ -- 574 35 | pop$ -- 3232 36 | preamble$ -- 1 37 | purify$ -- 1192 38 | quote$ -- 0 39 | skip$ -- 2428 40 | stack$ -- 0 41 | substring$ -- 2916 42 | swap$ -- 3290 43 | text.length$ -- 32 44 | text.prefix$ -- 0 45 | top$ -- 0 46 | type$ -- 1287 47 | warning$ -- 3 48 | while$ -- 540 49 | width$ -- 0 50 | write$ -- 1780 51 | (There were 4 warnings) 52 | -------------------------------------------------------------------------------- /thesis.dvi: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/MultiPath/Efficient-Neural-Machine-Translation/c301abbe9d70973950acde23c1bbd82d959d43e9/thesis.dvi -------------------------------------------------------------------------------- /thesis.lof: -------------------------------------------------------------------------------- 1 | \addvspace {10\p@ } 2 | \addvspace {10\p@ } 3 | \contentsline {figure}{\numberline {2.1}{\ignorespaces An illustration of the comparison between the conventional {{\textsc {Seq2Seq}}}\xspace learning and {{\textsc {Seq2Seq}}}\xspace with attention mechanism for translating ``A B C D $\rightarrow $ X Y Z ''.\relax }}{5}{figure.caption.9} 4 | \addvspace {10\p@ } 5 | \contentsline {figure}{\numberline {3.1}{\ignorespaces The overall diagram of \textsc {CopyNet}. For simplicity, we omit some links for prediction. \relax }}{14}{figure.caption.10} 6 | \contentsline {figure}{\numberline {3.2}{\ignorespaces The illustration of the decoding probability $p(y_t|\cdot )$ as a 4-class classifier. \relax }}{17}{figure.caption.15} 7 | \contentsline {figure}{\numberline {3.3}{\ignorespaces Example output of \textsc {CopyNet} on the synthetic dataset. The heatmap represents the activations of the copy-mode over the input sequence (left) during the decoding process (bottom).\relax }}{21}{figure.caption.20} 8 | \contentsline {figure}{\numberline {3.4}{\ignorespaces Examples of \textsc {CopyNet} on LCSTS compared with RNN context. Word segmentation is applied on the input, where underlined words are OOV. The highlighted words (with different colors) are those words with copy-mode probability higher than the generate-mode. We also provide literal English translation for the document, the golden, and \textsc {CopyNet}, while omitting that for RNN context since the language is broken.\relax }}{24}{figure.caption.24} 9 | \contentsline {figure}{\numberline {3.5}{\ignorespaces Examples on the testing set of DS-II shown as the input text and golden, with the outputs of RNNSearch and CopyNet. Words in red rectangles are unseen in the training set. The highlighted words (with different colors) are those words with copy-mode probability higher than the generate-mode. Green circles (meaning correct) and red crosses (meaning incorrect) are given based on human judgment on whether the response is appropriate. \relax }}{25}{figure.caption.26} 10 | \addvspace {10\p@ } 11 | \contentsline {figure}{\numberline {4.1}{\ignorespaces The overall architecture of the proposed SEG-NMT. The shaded box includes the module which handles a set of translation pairs retrieved in the first stage. The heat maps represent the attention scores between the source sentences (left-to-right) and the corresponding translations (top-to-down).\relax }}{33}{figure.caption.29} 12 | \contentsline {figure}{\numberline {4.2}{\ignorespaces The improvement over the baseline by SEG-NMT on Fr$\to $En w.r.t. the fuzzy matching scores of one retrieved translation pair. \relax }}{43}{figure.caption.43} 13 | \contentsline {figure}{\numberline {4.3}{\ignorespaces The BLEU scores on Fr$\to $En using varying numbers of retrieved translation pairs during testing. The model was trained once. ``Adaptive'' refers to the proposed greedy selection in Alg.\nobreakspace {}\ref {algo1}. \relax }}{44}{figure.caption.44} 14 | \contentsline {figure}{\numberline {4.4}{\ignorespaces Three examples from the Fr$\to $En test set. For the proposed SEG-NMT model, one translation pair is retrieved from the training set. Each token in the translation by the proposed approach and its corresponding token (if it exists) in the retrieved pair are shaded in blue according to the gating variable $\zeta _t$ from Eq.\nobreakspace {}\textup {\hbox {\mathsurround \z@ \normalfont (\ignorespaces \ref {eq.shallow}\unskip \@@italiccorr )}}. In all, we show: (S) the source sentence. (RS) the source side of a retrieved pair. (RT) the target side of the retrieved pair. (A) the translation by the proposed approach. (B) the translation by the baseline. (T) the reference translation. \relax }}{45}{figure.caption.45}
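% (Editorial sketch by the editor, not LaTeX output: the caption of Figure 4.4
% shades tokens by the gating variable $\zeta _t$ from Eq. (eq.shallow). A
% plausible reading of that shallow-fusion gate, stated here as an assumption
% since the equation itself lives in chapters/seg.tex, is a per-step mixture
%   $p(y_t \mid \cdot ) = \zeta _t \, p_{\mathrm{copy}}(y_t)
%                       + (1 - \zeta _t) \, p_{\mathrm{gen}}(y_t)$,
% under which the darkest-shaded tokens are those taken nearly verbatim from
% the target side of the retrieved pair.)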
15 | \addvspace {10\p@ } 16 | \contentsline {figure}{\numberline {5.1}{\ignorespaces BLEU scores reported on the test set for Ro-En. The amount of training data affects the translation performance dramatically when using a single NMT model.\relax }}{49}{figure.caption.50} 17 | \contentsline {figure}{\numberline {5.2}{\ignorespaces An illustration of the proposed architecture of the ULR and MoLE. Shaded parts are trained within the NMT model, while unshaded parts are not changed during training.\relax }}{51}{figure.caption.54} 18 | \contentsline {figure}{\numberline {5.3}{\ignorespaces BLEU score vs corpus size\relax }}{59}{figure.caption.68} 19 | \contentsline {figure}{\numberline {5.4}{\ignorespaces BLEU score vs unknown tokens\relax }}{59}{figure.caption.68} 20 | \contentsline {figure}{\numberline {5.5}{\ignorespaces Three sets of examples on Ro-En translation with various settings. \relax }}{61}{figure.caption.71} 21 | \contentsline {figure}{\numberline {5.6}{\ignorespaces The activation visualization of the mixture of language experts module on one randomly selected Ro source sentence trained together with different auxiliary languages. Darker color means higher activation score. \relax }}{62}{figure.caption.72} 22 | \contentsline {figure}{\numberline {5.7}{\ignorespaces Performance comparison of fine-tuning on 6K RO sentences.\relax }}{64}{figure.caption.77} 23 | \addvspace {10\p@ } 24 | \contentsline {figure}{\numberline {6.1}{\ignorespaces The graphical illustration of the training process of the proposed MetaNMT. For each episode, one task (language pair) is sampled for meta-learning. The boxes and arrows in blue are mainly involved in language-specific learning (\textsection \ref {sec:lsl}), and those in purple in meta-learning (\textsection \ref {sec:ml}).\relax }}{69}{figure.caption.81} 25 | \contentsline {figure}{\numberline {6.2}{\ignorespaces An intuitive illustration in which we use solid lines to represent the learning of initialization, and dashed lines to show the path of fine-tuning.\relax }}{71}{figure.caption.82} 26 | \contentsline {figure}{\numberline {6.3}{\ignorespaces BLEU scores reported on test sets for \{Ro, Lv, Fi, Tr\} to En, where each model is first learned from 6 source tasks (Es, Fr, It, Pt, De, Ru) and then fine-tuned on randomly sampled training sets with around 16,000 English tokens per run. The error bars show the standard deviation calculated from 5 runs.\relax }}{81}{figure.caption.90} 27 | \contentsline {subfigure}{\numberline {(a)}{\ignorespaces {Ro-En}}}{81}{figure.caption.90} 28 | \contentsline {subfigure}{\numberline {(b)}{\ignorespaces {Lv-En}}}{81}{figure.caption.90} 29 | \contentsline {subfigure}{\numberline {(c)}{\ignorespaces {Fi-En}}}{81}{figure.caption.90} 30 | \contentsline {subfigure}{\numberline {(d)}{\ignorespaces {Tr-En}}}{81}{figure.caption.90} 31 | \contentsline {figure}{\numberline {6.4}{\ignorespaces BLEU Scores w.r.t. 
the size of the target task's training set.\relax }}{82}{figure.caption.98} 32 | \contentsline {figure}{\numberline {6.5}{\ignorespaces The learning curves of BLEU scores on the validation task (Ro-En).\relax }}{83}{figure.caption.104} 33 | \addvspace {10\p@ } 34 | \contentsline {figure}{\numberline {7.1}{\ignorespaces Graphical illustrations of the trainable greedy decoding. The left panel shows a single step of the actor interacting with the underlying neural translation model, and the right panel shows the interaction among the underlying neural translation system (dashed-border boxes), actor (red-border boxes), and critic (blue-border boxes). The solid arrows indicate the forward pass, and the dashed yellow arrows the actor's backward pass. The dotted-border box shows the use of a reference translation.\relax }}{90}{figure.caption.111} 35 | \contentsline {figure}{\numberline {7.2}{\ignorespaces The plots show the improvements by the trainable greedy decoding on the test set. The x-axes correspond to the objectives used to train the trainable greedy decoding, and the y-axes to the changes in the achieved objectives (BLEU for the figures on the left, and negative perplexity on the right.) The top row (a) shows the cases when the trainable greedy decoder is used on its own, and the bottom row (b) when it is used together with beam search. When training and evaluation are both done with BLEU, we test the statistical significance \citep {koehn2004statistical}, and we mark significant cases with red stars ($p < 0.05$.) The underlying neural machine translation models achieved the BLEU scores of 14.49/16.20 for En-Cs, 18.90/21.20 for Cs-En, 18.97/21.33 for En-De, 21.63/24.46 for De-En, 16.97/19.68 for En-Ru, 21.06/23.34 for Ru-En, 7.53/8.82 for En-Fi and 9.79/11.03 for Fi-En (greedy/beam). \relax }}{101}{figure.caption.113} 36 | \contentsline {figure}{\numberline {7.3}{\ignorespaces Comparison of greedy BLEU scores with and without the critic-aware exploration on the Ru-En dataset. The green line marks the BLEU score achieved by greedy decoding from the underlying NMT model.\relax }}{102}{figure.caption.117} 37 | \contentsline {figure}{\numberline {7.4}{\ignorespaces Three Ru-En examples in which the difference between the trainable greedy decoding (A) and the conventional greedy decoding (G) is large. Each step is marked with magenta when the actor significantly influenced the output distribution.\relax }}{102}{figure.caption.118} 38 | \addvspace {10\p@ } 39 | \contentsline {figure}{\numberline {8.1}{\ignorespaces Translating ``A B C'' to ``X Y'' using autoregressive and non-autoregressive neural MT architectures. The latter generates all output tokens in parallel.\relax }}{105}{figure.caption.130} 40 | \contentsline {figure}{\numberline {8.2}{\ignorespaces The architecture of the NAT{}, where the black solid arrows represent differentiable connections and the purple dashed arrows are non-differentiable operations. Each sublayer inside the encoder and decoder stacks also includes layer normalization and a residual connection.\relax }}{107}{figure.caption.132} 41 | \contentsline {figure}{\numberline {8.3}{\ignorespaces BLEU scores on the IWSLT development set as a function of sample size for noisy parallel decoding. NPD matches the performance of the other two decoding strategies after two samples, and exceeds the performance of the autoregressive teacher with around 1000.\relax }}{116}{figure.caption.143}
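% (Editorial sketch, not LaTeX output: the noisy parallel decoding referenced
% in the captions of Figures 8.3, 8.5, and 8.7, reconstructed from those
% captions alone, so treat the notation as the editor's assumption. Sample $K$
% fertility sequences $f^{(1)}, \dots , f^{(K)}$ from the encoder's fertility
% model, decode all $K$ candidates in parallel, and let the autoregressive
% teacher pick the winner:
%   $\hat{y} = \argmax _{k} \, \log p_{\mathrm{AR}}\p{G(x, f^{(k)}) \mid x}$.
% This is consistent with Figure 8.7's latency story: near-constant decoding
% time for moderate $K$, returning to linear once $K = 100$ saturates the GPU.)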
42 | \contentsline {figure}{\numberline {8.4}{\ignorespaces Two examples comparing translations produced by an autoregressive (AR) and non-autoregressive Transformer as well as the result of noisy parallel decoding with sample size 100. Repeated words are highlighted in gray.\relax }}{119}{figure.caption.152} 43 | \contentsline {figure}{\numberline {8.5}{\ignorespaces A Romanian--English example translated with noisy parallel decoding. At left are eight sampled fertility sequences from the encoder, represented with their corresponding decoder input sequences. Each of these values for the latent variable leads to a different possible output translation, shown at right. The autoregressive Transformer then picks the best translation, shown in red, a process which is much faster than directly using it to generate output.\relax }}{119}{figure.caption.153} 44 | \contentsline {figure}{\numberline {8.6}{\ignorespaces The schematic structure of training and inference for the NAT. The ``distilled data'' contains target sentences decoded by the autoregressive model and ground-truth source sentences.\relax }}{120}{figure.caption.154} 45 | \contentsline {figure}{\numberline {8.7}{\ignorespaces The translation latency, computed as the time to decode a single sentence without minibatching, for each sentence in the IWSLT development set as a function of its length. The autoregressive model has latency linear in the decoding length, while the latency of the NAT is nearly constant for typical lengths, even with NPD with sample size 10. When using NPD with sample size 100, the level of parallelism is enough to more than saturate the GPU, leading again to linear latencies.\relax }}{121}{figure.caption.155} 46 | \contentsline {figure}{\numberline {8.8}{\ignorespaces Learning curves for training and fine-tuning of the NAT{} on IWSLT. BLEU scores are on the development set.\relax }}{122}{figure.caption.156} 47 | \addvspace {10\p@ } 48 | \contentsline {figure}{\numberline {9.1}{\ignorespaces {Example output from the proposed framework in DE $\rightarrow $ EN simultaneous translation. The heat-map represents the soft alignment between the incoming source sentence (left, up-to-down) and the emitted translation (top, left-to-right). The length of each column represents the number of source words being waited for before emitting the translation. Best viewed when zoomed digitally.}\relax }}{124}{figure.caption.157} 49 | \contentsline {figure}{\numberline {9.2}{\ignorespaces {Illustration of the proposed framework: at each step, the NMT environment (left) computes a candidate translation. The recurrent agent (right) will take the observation, including the candidates, and send back decisions--\textsc {read} or \textsc {write}.}\relax }}{126}{figure.caption.158} 50 | \contentsline {figure}{\numberline {9.3}{\ignorespaces {Illustrations of (A)\nobreakspace {}beam-search, (B)\nobreakspace {}simultaneous greedy decoding and (C)\nobreakspace {}simultaneous beam-search.}\relax }}{134}{figure.caption.169} 51 | \contentsline {figure}{\numberline {9.4}{\ignorespaces {Learning progress curves for various delay targets on the validation dataset for EN $\rightarrow $ RU. Each time, we keep only one target for one delay measure. For instance, when using the AP target, the coefficient of $\alpha $ in Eq.\nobreakspace {}\ref {eq_rd} will be set to $0$.}\relax }}{135}{figure.caption.170} 52 | \contentsline {subfigure}{\numberline {(a)}{\ignorespaces {BLEU (EN $\rightarrow $ RU)}}}{135}{figure.caption.170} 53 | \contentsline {subfigure}{\numberline {(b)}{\ignorespaces {AP (EN $\rightarrow $ RU)}}}{135}{figure.caption.170} 54 | \contentsline {subfigure}{\numberline {(c)}{\ignorespaces {CW (EN $\rightarrow $ RU)}}}{135}{figure.caption.170}
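% (Editorial gloss on the two delay measures targeted in Figure 9.4 and plotted
% in Figures 9.5 and 9.6, stated from the standard definitions rather than from
% chapters/simultrans.tex, so the formulas are assumptions: average proportion,
%   $\mathrm{AP} = \frac{1}{|x|\,|y|} \sum _{t=1}^{|y|} s(t)$,
% where $s(t)$ is the number of source words read before target word $y_t$ is
% emitted (AP near 1 means waiting for the whole source, smaller means less
% delay), and consecutive wait (CW), the average number of consecutive READ
% actions taken between two WRITEs.)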
55 | \contentsline {figure}{\numberline {9.5}{\ignorespaces {Delay\nobreakspace {}(AP) vs. BLEU for both language pair--directions. The shown point-pairs are the results of simultaneous greedy decoding and beam-search (beam-size = 5) respectively with models trained for various delay targets: ($\color {green!70!blue}\blacktriangleleft \triangleleft $: CW=$8$, $\color {green!70!blue}\blacktriangle \triangle $: CW=$5$, $\color {green!70!blue}\blacklozenge \lozenge $: CW=$2$, $\color {red} \blacktriangleright \triangleright $: AP=$0.3$, $\color {red} \blacktriangledown \triangledown $: AP=$0.5$, $\color {red} \blacksquare \square $: AP=$0.7$)}. For each target, we select the model that maximizes the quality-to-delay ratio ($\frac {\text {BLEU}}{\text {AP}}$) on the validation set. The baselines are also plotted ($\color {blue}\bigstar $: WOS, $\color {black}\bigstar ${\fontfamily {pzd}\fontencoding {U}\fontseries {m}\fontshape {n}\selectfont \char 73}: WUE, $\times $: WID, $+$: WIW).\relax }}{136}{figure.caption.171} 56 | \contentsline {subfigure}{\numberline {(a)}{\ignorespaces {EN$\rightarrow $RU}}}{136}{figure.caption.171} 57 | \contentsline {subfigure}{\numberline {(b)}{\ignorespaces {RU$\rightarrow $EN}}}{136}{figure.caption.171} 58 | \contentsline {subfigure}{\numberline {(c)}{\ignorespaces {EN$\rightarrow $DE}}}{136}{figure.caption.171} 59 | \contentsline {subfigure}{\numberline {(d)}{\ignorespaces {DE$\rightarrow $EN}}}{136}{figure.caption.171} 60 | \contentsline {figure}{\numberline {9.6}{\ignorespaces {Delay\nobreakspace {}(CW) vs. BLEU score for EN $\rightarrow $ RU, ($\color {green!70!blue}\blacktriangleleft \triangleleft $: CW=$8$, $\color {green!70!blue}\blacktriangle \triangle $: CW=$5$, $\color {green!70!blue}\blacklozenge \lozenge $: CW=$2$, $\color {red} \blacktriangleright \triangleright $: AP=$0.3$, $\color {red} \blacktriangledown \triangledown $: AP=$0.5$, $\color {red} \blacksquare \square $: AP=$0.7$), against the baselines\nobreakspace {}($\color {blue}\bigstar $: WOS, $\color {black}\bigstar $: WUE, $+$: SEG1, $\times $: SEG2).}\relax }}{137}{figure.caption.172} 61 | \contentsline {figure}{\numberline {9.7}{\ignorespaces {Comparison of DE$\rightarrow $EN examples using the proposed framework and the usual NMT system, respectively. Both the heatmaps share the same setting as Fig.\nobreakspace {}\ref {crop}}. The verb ``gedeckt'' is incorrectly translated in simultaneous translation.\relax }}{141}{figure.caption.180} 62 | \contentsline {subfigure}{\numberline {(a)}{\ignorespaces {Simultaneous Neural Machine Translation}}}{141}{figure.caption.180} 63 | \contentsline {subfigure}{\numberline {(b)}{\ignorespaces {Neural Machine Translation}}}{141}{figure.caption.180} 64 | \contentsline {figure}{\numberline {9.8}{\ignorespaces {Given the example input sentence (leftmost column), we show outputs by models trained for various delay targets. For these outputs, each row corresponds to one source word and represents the emitted words (possibly empty) after reading this word. 
The corresponding source and target words are in the same color for all model outputs.}\relax }}{142}{figure.caption.181} 65 | \addvspace {10\p@ } 66 | -------------------------------------------------------------------------------- /thesis.lot: -------------------------------------------------------------------------------- 1 | \addvspace {10\p@ } 2 | \addvspace {10\p@ } 3 | \addvspace {10\p@ } 4 | \contentsline {table}{\numberline {3.1}{\ignorespaces The test accuracy (\%) on synthetic data.\relax }}{21}{table.caption.19} 5 | \contentsline {table}{\numberline {3.2}{\ignorespaces Some statistics of the LCSTS dataset.\relax }}{22}{table.caption.22} 6 | \contentsline {table}{\numberline {3.3}{\ignorespaces Testing performance of LCSTS, where ``RNN'' is the canonical Enc-Dec, and ``RNN context'' its attentive variant.\relax }}{23}{table.caption.23} 7 | \contentsline {table}{\numberline {3.4}{\ignorespaces The decoding accuracy on the two testing sets. Decoding is counted as successful only when the answer is found exactly in the Top-K outputs. \relax }}{26}{table.caption.27} 8 | \addvspace {10\p@ } 9 | \contentsline {table}{\numberline {4.1}{\ignorespaces Statistics from the JRC-Acquis corpus. We use BPE subword symbols.\relax }}{41}{table.caption.38} 10 | \contentsline {table}{\numberline {4.2}{\ignorespaces The BLEU scores on the JRC-Acquis corpus.\relax }}{42}{table.caption.41} 11 | \addvspace {10\p@ } 12 | \contentsline {table}{\numberline {5.1}{\ignorespaces Statistics of the available parallel resources in our experiments. All the languages are translated to English.\relax }}{57}{table.caption.61} 13 | \contentsline {table}{\numberline {5.2}{\ignorespaces Scores over various source languages (6k sentences for Ro \& Lv, and 10k for Ko). ``Multi'' means the Multi-lingual NMT baseline.\relax }}{59}{table.caption.67} 14 | \contentsline {table}{\numberline {5.3}{\ignorespaces BLEU scores evaluated on the test set (6k), compared with ULR and MoLE. ``vanilla'' is the standard NMT system trained only on the Ro-En training set.\relax }}{60}{table.caption.69} 15 | \addvspace {10\p@ } 16 | \contentsline {table}{\numberline {6.1}{\ignorespaces Statistics of the full datasets of the target language pairs. BLEU scores on the dev and test sets are reported from a supervised Transformer model with the same architecture.\relax }}{75}{table.caption.89} 17 | \contentsline {table}{\numberline {6.2}{\ignorespaces BLEU Scores w.r.t. the source task set for all five target tasks.\relax }}{76}{table.caption.95} 18 | \contentsline {table}{\numberline {6.3}{\ignorespaces Sample translations for Tr-En and Ko-En highlight the impact of fine-tuning, which results in syntactically better-formed translations. We highlight tokens of interest in terms of reordering. \relax }}{78}{table.caption.102} 19 | \addvspace {10\p@ } 20 | \addvspace {10\p@ } 21 | \contentsline {table}{\numberline {8.1}{\ignorespaces BLEU scores on official test sets (\texttt {newstest2014} for WMT En-De and \texttt {newstest2016} for WMT En-Ro) or the development set for IWSLT. NAT models without NPD use argmax decoding. Latency is computed as the time to decode a single sentence without minibatching, averaged over the whole test set; decoding is implemented in PyTorch on a single NVIDIA Tesla P100.\relax }}{117}{table.caption.150} 22 | \contentsline {table}{\numberline {8.2}{\ignorespaces Ablation performance on the IWSLT development set. BLEU (T) refers to the BLEU score on a version of the development set that has been translated by the teacher model. 
An $\times $ indicates that fine-tuning caused that model to get worse. When uniform copying is used as the decoder inputs, the ground-truth target lengths are provided. All models use argmax decoding.\relax }}{118}{table.caption.151} 23 | \addvspace {10\p@ } 24 | \addvspace {10\p@ } 25 | -------------------------------------------------------------------------------- /thesis.out: -------------------------------------------------------------------------------- 1 | \BOOKMARK [0][-]{chapter*.1}{Abstract}{}% 1 2 | \BOOKMARK [0][-]{chapter*.2}{Acknowledgments}{}% 2 3 | \BOOKMARK [0][-]{chapter.1}{Introduction}{}% 3 4 | \BOOKMARK [1][-]{section.1.1}{A Brief Review of Neural Machine Translation}{chapter.1}% 4 5 | \BOOKMARK [1][-]{section.1.2}{Efficient Neural Machine Translation}{chapter.1}% 5 6 | \BOOKMARK [1][-]{section.1.3}{Thesis Outline}{chapter.1}% 6 7 | \BOOKMARK [0][-]{chapter.2}{Background}{}% 7 8 | \BOOKMARK [1][-]{section.2.1}{Modeling}{chapter.2}% 8 9 | \BOOKMARK [2][-]{subsection.2.1.1}{Neural Language Modeling}{section.2.1}% 9 10 | \BOOKMARK [2][-]{subsection.2.1.2}{Sequence-to-Sequence Learning}{section.2.1}% 10 11 | \BOOKMARK [2][-]{subsection.2.1.3}{Attention Mechanism}{section.2.1}% 11 12 | \BOOKMARK [1][-]{section.2.2}{Training}{chapter.2}% 12 13 | \BOOKMARK [2][-]{subsection.2.2.1}{Parallel Corpora}{section.2.2}% 13 14 | \BOOKMARK [2][-]{subsection.2.2.2}{Maximum Likelihood Learning}{section.2.2}% 14 15 | \BOOKMARK [2][-]{subsection.2.2.3}{Advanced Learning Algorithms}{section.2.2}% 15 16 | \BOOKMARK [2][-]{subsection.2.2.4}{Multilingual Training of Neural Machine Translation}{section.2.2}% 16 17 | \BOOKMARK [1][-]{section.2.3}{Decoding}{chapter.2}% 17 18 | \BOOKMARK [2][-]{subsection.2.3.1}{Greedy Decoding}{section.2.3}% 18 19 | \BOOKMARK [2][-]{subsection.2.3.2}{Beam Search}{section.2.3}% 19 20 | \BOOKMARK [2][-]{subsection.2.3.3}{Noisy Parallel Decoding}{section.2.3}% 20 21 | \BOOKMARK [1][-]{section.2.4}{Neural Machine Translation without RNNs}{chapter.2}% 21 22 | \BOOKMARK [2][-]{subsection.2.4.1}{The Transformer Model}{section.2.4}% 22 23 | \BOOKMARK [-1][-]{part.1}{I Data-Efficient Neural Machine Translation}{}% 23 24 | \BOOKMARK [0][-]{chapter.3}{Copying Mechanism}{part.1}% 24 25 | \BOOKMARK [1][-]{section.3.1}{CopyNet}{chapter.3}% 25 26 | \BOOKMARK [2][-]{subsection.3.1.1}{Model Overview}{section.3.1}% 26 27 | \BOOKMARK [2][-]{subsection.3.1.2}{Prediction with Copying and Generation}{section.3.1}% 27 28 | \BOOKMARK [2][-]{subsection.3.1.3}{State Update}{section.3.1}% 28 29 | \BOOKMARK [2][-]{subsection.3.1.4}{Hybrid Addressing of Short-Term Memory}{section.3.1}% 29 30 | \BOOKMARK [1][-]{section.3.2}{Learning}{chapter.3}% 30 31 | \BOOKMARK [1][-]{section.3.3}{Experiments}{chapter.3}% 31 32 | \BOOKMARK [2][-]{subsection.3.3.1}{Synthetic Dataset}{section.3.3}% 32 33 | \BOOKMARK [2][-]{subsection.3.3.2}{Text Summarization}{section.3.3}% 33 34 | \BOOKMARK [2][-]{subsection.3.3.3}{Single-turn Dialogue}{section.3.3}% 34 35 | \BOOKMARK [1][-]{section.3.4}{Related Work}{chapter.3}% 35 36 | \BOOKMARK [1][-]{section.3.5}{Conclusion and Future Work}{chapter.3}% 36 37 | \BOOKMARK [0][-]{chapter.4}{Non-Parametric Neural Machine Translation}{part.1}% 37 38 | \BOOKMARK [1][-]{section.4.1}{Introduction}{chapter.4}% 38 39 | \BOOKMARK [1][-]{section.4.2}{Background}{chapter.4}% 39 40 | \BOOKMARK [2][-]{subsection.4.2.1}{Neural Machine Translation}{section.4.2}% 40 41 | \BOOKMARK [2][-]{subsection.4.2.2}{Translation Memory}{section.4.2}% 41 42 | \BOOKMARK [1][-]{section.4.3}{Search 
Engine Guided Non-Parametric Neural Machine Translation}{chapter.4}% 42 43 | \BOOKMARK [2][-]{subsection.4.3.1}{Retrieval Stage}{section.4.3}% 43 44 | \BOOKMARK [2][-]{subsection.4.3.2}{Translation Stage}{section.4.3}% 44 45 | \BOOKMARK [2][-]{subsection.4.3.3}{Learning and Inference}{section.4.3}% 45 46 | \BOOKMARK [1][-]{section.4.4}{Related Work}{chapter.4}% 46 47 | \BOOKMARK [1][-]{section.4.5}{Experimental Settings}{chapter.4}% 47 48 | \BOOKMARK [1][-]{section.4.6}{Result and Analysis}{chapter.4}% 48 49 | \BOOKMARK [1][-]{section.4.7}{Conclusion}{chapter.4}% 49 50 | \BOOKMARK [0][-]{chapter.5}{Universal Neural Machine Translation}{part.1}% 50 51 | \BOOKMARK [1][-]{section.5.1}{Introduction}{chapter.5}% 51 52 | \BOOKMARK [1][-]{section.5.2}{Motivation}{chapter.5}% 52 53 | \BOOKMARK [2][-]{subsection.5.2.1}{Challenges}{section.5.2}% 53 54 | \BOOKMARK [1][-]{section.5.3}{Universal Neural Machine Translation}{chapter.5}% 54 55 | \BOOKMARK [2][-]{subsection.5.3.1}{Universal Lexical Representation \(ULR\)}{section.5.3}% 55 56 | \BOOKMARK [2][-]{subsection.5.3.2}{Mixture of Language Experts \(MoLE\)}{section.5.3}% 56 57 | \BOOKMARK [1][-]{section.5.4}{Experiments}{chapter.5}% 57 58 | \BOOKMARK [2][-]{subsection.5.4.1}{Settings}{section.5.4}% 58 59 | \BOOKMARK [2][-]{subsection.5.4.2}{Back-Translation}{section.5.4}% 59 60 | \BOOKMARK [2][-]{subsection.5.4.3}{Preliminary Experiments}{section.5.4}% 60 61 | \BOOKMARK [1][-]{section.5.5}{Results}{chapter.5}% 61 62 | \BOOKMARK [2][-]{subsection.5.5.1}{Ablation Study}{section.5.5}% 62 63 | \BOOKMARK [2][-]{subsection.5.5.2}{Qualitative Analysis}{section.5.5}% 63 64 | \BOOKMARK [2][-]{subsection.5.5.3}{Fine-tuning a Pre-trained Model}{section.5.5}% 64 65 | \BOOKMARK [1][-]{section.5.6}{Related Work}{chapter.5}% 65 66 | \BOOKMARK [1][-]{section.5.7}{Conclusion}{chapter.5}% 66 67 | \BOOKMARK [0][-]{chapter.6}{Meta Learning for Neural Machine Translation}{part.1}% 67 68 | \BOOKMARK [1][-]{section.6.1}{Introduction}{chapter.6}% 68 69 | \BOOKMARK [1][-]{section.6.2}{Background}{chapter.6}% 69 70 | \BOOKMARK [1][-]{section.6.3}{Meta Learning for Low-Resource Neural Machine Translation}{chapter.6}% 70 71 | \BOOKMARK [2][-]{subsection.6.3.1}{Learn: language-specific learning}{section.6.3}% 71 72 | \BOOKMARK [2][-]{subsection.6.3.2}{MetaLearn}{section.6.3}% 72 73 | \BOOKMARK [2][-]{subsection.6.3.3}{Unified Lexical Representation}{section.6.3}% 73 74 | \BOOKMARK [1][-]{section.6.4}{Experimental Settings}{chapter.6}% 74 75 | \BOOKMARK [2][-]{subsection.6.4.1}{Dataset}{section.6.4}% 75 76 | \BOOKMARK [2][-]{subsection.6.4.2}{Model and Learning}{section.6.4}% 76 77 | \BOOKMARK [1][-]{section.6.5}{Results}{chapter.6}% 77 78 | \BOOKMARK [1][-]{section.6.6}{Conclusion}{chapter.6}% 78 79 | \BOOKMARK [-1][-]{part.2}{II Decoding-Efficient Neural Machine Translation}{}% 79 80 | \BOOKMARK [0][-]{chapter.7}{Trainable Greedy Decoding}{part.2}% 80 81 | \BOOKMARK [1][-]{section.7.1}{Introduction}{chapter.7}% 81 82 | \BOOKMARK [1][-]{section.7.2}{Background}{chapter.7}% 82 83 | \BOOKMARK [2][-]{subsection.7.2.1}{Neural Machine Translation}{section.7.2}% 83 84 | \BOOKMARK [2][-]{subsection.7.2.2}{Decoding}{section.7.2}% 84 85 | \BOOKMARK [1][-]{section.7.3}{Trainable Greedy Decoding}{chapter.7}% 85 86 | \BOOKMARK [2][-]{subsection.7.3.1}{Many Decoding Objectives}{section.7.3}% 86 87 | \BOOKMARK [2][-]{subsection.7.3.2}{Trainable Greedy Decoding}{section.7.3}% 87 88 | \BOOKMARK [2][-]{subsection.7.3.3}{Learning and Challenges}{section.7.3}% 88 89 | \BOOKMARK 
[1][-]{section.7.4}{Deterministic Policy Gradient \040with Critic-Aware Actor Learning}{chapter.7}% 89 90 | \BOOKMARK [2][-]{subsection.7.4.1}{Deterministic Policy Gradient \040for Trainable Greedy Decoding}{section.7.4}% 90 91 | \BOOKMARK [2][-]{subsection.7.4.2}{Critic-Aware Actor Learning}{section.7.4}% 91 92 | \BOOKMARK [1][-]{section.7.5}{Experimental Settings}{chapter.7}% 92 93 | \BOOKMARK [2][-]{subsection.7.5.1}{Model Architectures and Learning}{section.7.5}% 93 94 | \BOOKMARK [2][-]{subsection.7.5.2}{Results and Analysis}{section.7.5}% 94 95 | \BOOKMARK [1][-]{section.7.6}{Conclusion}{chapter.7}% 95 96 | \BOOKMARK [0][-]{chapter.8}{Non-Autoregressive Neural Machine Translation}{part.2}% 96 97 | \BOOKMARK [1][-]{section.8.1}{Introduction}{chapter.8}% 97 98 | \BOOKMARK [1][-]{section.8.2}{Background}{chapter.8}% 98 99 | \BOOKMARK [2][-]{subsection.8.2.1}{Autoregressive Neural Machine Translation}{section.8.2}% 99 100 | \BOOKMARK [2][-]{subsection.8.2.2}{Non-Autoregressive Decoding}{section.8.2}% 100 101 | \BOOKMARK [2][-]{subsection.8.2.3}{The Multimodality Problem}{section.8.2}% 101 102 | \BOOKMARK [1][-]{section.8.3}{The Non-Autoregressive Transformer \(NAT\)}{chapter.8}% 102 103 | \BOOKMARK [2][-]{subsection.8.3.1}{Encoder Stack}{section.8.3}% 103 104 | \BOOKMARK [2][-]{subsection.8.3.2}{Decoder Stack}{section.8.3}% 104 105 | \BOOKMARK [2][-]{subsection.8.3.3}{Modeling Fertility to Tackle the Multimodality Problem}{section.8.3}% 105 106 | \BOOKMARK [2][-]{subsection.8.3.4}{Translation Predictor and the Decoding Process}{section.8.3}% 106 107 | \BOOKMARK [1][-]{section.8.4}{Training}{chapter.8}% 107 108 | \BOOKMARK [2][-]{subsection.8.4.1}{Sequence-Level Knowledge Distillation}{section.8.4}% 108 109 | \BOOKMARK [2][-]{subsection.8.4.2}{Fine-Tuning}{section.8.4}% 109 110 | \BOOKMARK [1][-]{section.8.5}{Experiments}{chapter.8}% 110 111 | \BOOKMARK [2][-]{subsection.8.5.1}{Experimental Settings}{section.8.5}% 111 112 | \BOOKMARK [2][-]{subsection.8.5.2}{Results}{section.8.5}% 112 113 | \BOOKMARK [2][-]{subsection.8.5.3}{Ablation Study}{section.8.5}% 113 114 | \BOOKMARK [1][-]{section.8.6}{Schematic and Analysis}{chapter.8}% 114 115 | \BOOKMARK [1][-]{section.8.7}{Conclusion}{chapter.8}% 115 116 | \BOOKMARK [0][-]{chapter.9}{Real-Time Neural Machine Translation}{part.2}% 116 117 | \BOOKMARK [1][-]{section.9.1}{Problem Definition}{chapter.9}% 117 118 | \BOOKMARK [1][-]{section.9.2}{Simultaneous Translation \040with Neural Machine Translation}{chapter.9}% 118 119 | \BOOKMARK [2][-]{subsection.9.2.1}{Environment}{section.9.2}% 119 120 | \BOOKMARK [2][-]{subsection.9.2.2}{Agent}{section.9.2}% 120 121 | \BOOKMARK [1][-]{section.9.3}{Learning}{chapter.9}% 121 122 | \BOOKMARK [2][-]{subsection.9.3.1}{Pre-training}{section.9.3}% 122 123 | \BOOKMARK [2][-]{subsection.9.3.2}{Reward Function}{section.9.3}% 123 124 | \BOOKMARK [2][-]{subsection.9.3.3}{Reinforcement Learning}{section.9.3}% 124 125 | \BOOKMARK [1][-]{section.9.4}{Simultaneous Beam Search}{chapter.9}% 125 126 | \BOOKMARK [1][-]{section.9.5}{Experiments}{chapter.9}% 126 127 | \BOOKMARK [2][-]{subsection.9.5.1}{Settings}{section.9.5}% 127 128 | \BOOKMARK [2][-]{subsection.9.5.2}{Quantitative Analysis}{section.9.5}% 128 129 | \BOOKMARK [2][-]{subsection.9.5.3}{Qualitative Analysis}{section.9.5}% 129 130 | \BOOKMARK [1][-]{section.9.6}{Related Work}{chapter.9}% 130 131 | \BOOKMARK [1][-]{section.9.7}{Conclusion}{chapter.9}% 131 132 | \BOOKMARK [0][-]{chapter.10}{Conclusion and Future Work}{part.2}% 132 133 | 
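(Editorial aside on the file above: thesis.out is machine-written, so its \BOOKMARK lines are regenerated rather than edited by hand. As a hedged sketch of where they come from, assuming only the hyperref setup visible in thesis.tex below, each sectioning command is mirrored into a PDF bookmark on the next compile:

\documentclass{book}
\usepackage{hyperref}     % rewrites the .out bookmark file on every run
\begin{document}
\chapter{Introduction}    % -> \BOOKMARK [0][-]{chapter.1}{Introduction}{}
\section{Thesis Outline}  % -> \BOOKMARK [1][-]{section.1.1}{Thesis Outline}{chapter.1}
\end{document}

The bracketed level (-1 for parts, 0 for chapters, 1 for sections, 2 for subsections) and the parent anchor in the final argument match the entries listed above.)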
-------------------------------------------------------------------------------- /thesis.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MultiPath/Efficient-Neural-Machine-Translation/c301abbe9d70973950acde23c1bbd82d959d43e9/thesis.pdf -------------------------------------------------------------------------------- /thesis.tex: -------------------------------------------------------------------------------- 1 | \documentclass[a4paper,12pt,twoside]{book} 2 | \usepackage{suthesis} 3 | \usepackage[T1]{fontenc} 4 | \usepackage{calligra} 5 | %\documentstyle[12pt,suthesis]{report} 6 | 7 | 8 | 9 | % -- Imports -- 10 | % (general libraries) 11 | \usepackage{times,latexsym,amsfonts,amssymb,amsmath,graphicx,url,bbm,rotating} 12 | \usepackage{enumitem,multirow,hhline,stmaryrd,bussproofs,mathtools,siunitx,arydshln} 13 | \usepackage{booktabs,xcolor,csquotes,calligra} 14 | \usepackage{dsfont,xspace} 15 | \usepackage{rotating} 16 | \usepackage{titlesec} 17 | \usepackage{fitch} 18 | \usepackage{amsmath} 19 | 20 | % (inline references) 21 | \usepackage{natbib} 22 | % (tikz) 23 | \usepackage{tikz} 24 | \usepackage{tikz-dependency,pifont} 25 | \usetikzlibrary{shapes.arrows,chains,positioning,automata,trees,calc} 26 | \usetikzlibrary{patterns,matrix} 27 | \usetikzlibrary{decorations.pathmorphing,decorations.markings} 28 | % (print algorithms) 29 | %\usepackage[ruled,lined,linesnumbered]{algorithm2e} 30 | % (custom) 31 | \input std-macros.tex 32 | \input macros.tex 33 | % (paper compilation hacks) 34 | \def\newcite#1{\citet{#1}} 35 | \def\cite#1{\citep{#1}} 36 | %\def\newcite#1{\textcite{#1}} 37 | %\def\cite#1{\autocite{#1}} 38 | \definecolor{darkblue}{rgb}{0.0,0.0,0.4} 39 | 40 | % Common hyphenations 41 | \hyphenation{Text-Runner} 42 | \hyphenation{Verb-Ocean} 43 | 44 | \newenvironment{tightitemize}% 45 | {\begin{itemize}[topsep=0pt, partopsep=0pt] % 46 | \setlength{\parskip}{0pt}% 47 | }% 48 | {\end{itemize}} 49 | 50 | \newenvironment{dedication} 51 | {\vspace{10ex}\begin{quotation}\begin{center}\begin{em}} 52 | {\par\end{em}\end{center}\end{quotation}} 53 | %\bibliographystyle{plainnat} 54 | 55 | \usepackage{epigraph} 56 | \renewcommand{\epigraphsize}{\normalsize} 57 | \setlength{\epigraphwidth}{0.8\textwidth} 58 | 59 | \newcommand{\bleu}{{{\sc Bleu}}\xspace} 60 | \newcommand{\meteor}{{{\sc Meteor}}\xspace} 61 | \newcommand{\camready}[1]{{{#1}}} 62 | 63 | \newcommand{\eos}{{$\langle \mathrm{eos}\rangle$}\xspace} 64 | \newcommand{\bos}{{$\langle \mathrm{bos}\rangle$}\xspace} 65 | \newcommand{\unk}{{$\langle \mathrm{unk}\rangle$}\xspace} 66 | 67 | \newcommand{\copynet}{{\textsc{CopyNet}}\xspace} 68 | 69 | \newcommand{\uunk}{{\langle \mathrm{unk}\rangle}} 70 | \newcommand{\sts}{{{\textsc{Seq2Seq}}}\xspace} 71 | \newcommand{\dbleu}{{$\Delta${\sc Bleu}}\xspace} 72 | \newcommand{\fix}{\marginpar{FIX}} 73 | \newcommand{\new}{\marginpar{NEW}} 74 | \newcommand{\MINUS}{} 75 | \DeclareMathOperator{\softmax}{\text{softmax}} 76 | \DeclareMathOperator{\argsort}{\text{argsort}} 77 | \newcommand{\ssoftmax}{\mathop{\softmax}\limits} 78 | \usepackage{tabularx} 79 | \usepackage{tabulary} 80 | \usepackage{comment} 81 | 82 | \newcommand{\tabincell}[2]{\begin{tabular}{@{}#1@{}}#2\end{tabular}} 83 | 84 | 85 | 86 | % special packages 87 | \usepackage{algorithm} 88 | \usepackage{mathrsfs} 89 | \usepackage{enumitem} 90 | \usepackage{subfigure} 91 | \usepackage{amssymb} 92 | \usepackage[noend]{algpseudocode} 93 | \usepackage{wrapfig,lipsum,booktabs} 94 | 
\usepackage{booktabs, caption} 95 | \usepackage{tabularx} 96 | \usepackage{listliketab} 97 | \usepackage{lipsum} 98 | \usepackage{float} 99 | \usepackage{bm} 100 | \usepackage{relsize} 101 | \usepackage{footnote} 102 | \usepackage{bbding} 103 | 104 | \usepackage{kotex} 105 | \usepackage{textcomp} 106 | \usepackage{url} 107 | \usepackage{hyperref} 108 | \urlstyle{same} 109 | \newcommand{\alert}[1]{\textcolor{red}{#1}} 110 | \newcommand{\sidebysidecaption}[4]{% 111 | %\RaggedRight% 112 | \begin{minipage}[t]{#1} 113 | \vspace*{0pt} 114 | #3 115 | \end{minipage} 116 | \hfill% 117 | \begin{minipage}[t]{#2} 118 | \vspace*{0pt} 119 | #4 120 | \end{minipage}% 121 | } 122 | 123 | \newcommand*{\printdate}{% 124 | \ifcase \month\or January\or February\or March\or April\or May\or June\or July\or 125 | August\or September\or October\or November\or December\fi \space \number\year} 126 | 127 | \makeatletter 128 | \newcommand*{\rom}[1]{\expandafter\@slowromancap\romannumeral #1@} 129 | \makeatother 130 | 131 | % other definition used in copynet 132 | \input{MACPdef} 133 | 134 | % -- Document -- 135 | \begin{document} 136 | 137 | % Title 138 | \title{Efficient Neural Machine Translation} 139 | \author{Jiatao Gu} 140 | \submitdate{August 2018} 141 | \principaladviser{Prof. Victor O.K. Li} 142 | %\firstreader{Maggie Li} 143 | %\secondreader{Kaibin Huang} 144 | %\thirdreader{Laurence Young} 145 | 146 | % Preface 147 | \beforepreface 148 | \input dedication.tex 149 | \input declare.tex 150 | \input preface.tex 151 | \input ack.tex 152 | \afterpreface 153 | 154 | \part{Introduction} 155 | \chapter{Introduction} 156 | \label{intro} 157 | \input{chapters/intro} 158 | 159 | \chapter[Background]{Background: Neural Machine Translation} 160 | \label{background} 161 | \input{chapters/background} 162 | 163 | % -- Sections -- 164 | % Introduction 165 | % Bibliography 166 | \part{Data-Efficient Neural Machine Translation} 167 | \chapter[Copying Mechanism]{Incorporating Copying Mechanism in Sequence-to-Sequence Learning} 168 | \label{copy} 169 | \input{chapters/copynet} 170 | 171 | \chapter[Non-Parametric Neural Machine Translation]{Search-Engine Guided Non-Parametric Neural Machine Translation} 172 | \label{seg-nmt} 173 | \input{chapters/seg} 174 | 175 | \chapter[Universal Neural Machine Translation]{Universal Neural Machine Translation for Extremely Low Resource Languages} 176 | \label{ulr} 177 | \input{chapters/ulr} 178 | 179 | \chapter[Meta Learning for Neural Machine Translation]{Meta Learning for Low Resource Neural Machine Translation} 180 | \label{MetaNMT} 181 | \input{chapters/meta} 182 | 183 | \part{Decoding-Efficient Neural Machine Translation} 184 | \chapter[Trainable Greedy Decoding]{Trainable Greedy Decoding for Neural Machine Translation} 185 | \label{trainable} 186 | \input{chapters/trainable} 187 | 188 | 189 | \chapter{Non-Autoregressive Neural Machine Translation} 190 | \label{nat} 191 | \input{chapters/nat} 192 | 193 | \chapter{Simultaneous Neural Machine Translation} 194 | \label{simul} 195 | \input{chapters/simultrans} 196 | 197 | 198 | \part{Conclusion} 199 | \chapter{Conclusion} 200 | \label{conclusion} 201 | \input{chapters/conclusion} 202 | 203 | 204 | \bibliographystyle{acl} 205 | \bibliography{ref} 206 | 207 | 208 | \include{publications} 209 | \cleardoublepage % Force a break to an odd page 210 | 211 | \end{document} 212 | -------------------------------------------------------------------------------- /thesis_final_JIATAO.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/MultiPath/Efficient-Neural-Machine-Translation/c301abbe9d70973950acde23c1bbd82d959d43e9/thesis_final_JIATAO.pdf -------------------------------------------------------------------------------- /tikz-dependency.sty: -------------------------------------------------------------------------------- 1 | % Copyright 2011 by Daniele Pighin 2 | % 3 | % This file may be distributed and/or modified 4 | % 5 | % 1. under the LaTeX Project Public License and/or 6 | % 2. under the GNU Public License. 7 | % 8 | % See the file doc/generic/tikz-dependency/licenses/LICENSE for more details. 9 | % 10 | % Changelog 11 | % 12 | % 1.1 (Jan 17, 2012) 13 | % - Added the possibility to use simple arcs as edges (keys: arc edge/segmented edge) 14 | % - Added "edge start x offset" and "edge end x offset" to fine tune edge start/end point position 15 | % - Added \depstyle to define styles more easily 16 | % - Added "simple" theme, based on the parse on page 1 of [Nivre and McDonald, 2008 ACL-HLT] 17 | % - Added instructions to draw bubble parses 18 | % - Fixed bug that would cause groupedges not to be straight under some circumstances; 19 | % 20 | % 1.0 (Nov 26, 2011) 21 | % - First public release 22 | 23 | \ProvidesPackage{tikz-dependency}[2011/01/08 v1.1 Macros to draw dependency trees] 24 | \NeedsTeXFormat{LaTeX2e}[1999/12/01] 25 | 26 | \RequirePackage{tikz,environ} 27 | \usetikzlibrary{matrix,arrows,backgrounds,calc,patterns,positioning,fit,shapes} 28 | 29 | \pgfdeclarelayer{depgroups} 30 | \pgfsetlayers{depgroups,main} 31 | 32 | \newcounter{dt@labelid} 33 | \newif\ifdt@linkbelow 34 | \newif\ifdt@arcedge 35 | \tikzset{ 36 | /depgraph/.cd, 37 | /depgraph/.search also = {/tikz}, 38 | dep id/.code = {\def\dt@depid{#1}}, 39 | dep id = dependency, 40 | % the distant of the horizontal line of the edge style for two adjacent words 41 | edge unit distance/.code = {\pgfmathsetlengthmacro{\dt@linkstep}{#1}}, 42 | edge unit distance = 3ex, 43 | % the horizontal offset defining the trapezoidal look 44 | edge slant/.code = {\pgfmathsetlengthmacro{\dt@linkslant}{#1}}, 45 | edge slant = 3pt, 46 | % the horizontal offset defining the starting/ending position of the edge style 47 | edge horizontal padding/.code = {\pgfmathsetlengthmacro{\dt@linkoffset}{#1}}, 48 | edge horizontal padding = 4pt, 49 | % vertical offset of the edge style from the word 50 | edge vertical padding/.code = {\pgfmathsetlengthmacro{\dt@linkdist}{#1}}, 51 | edge vertical padding = 0ex, 52 | % should links be placed above or below the sentence 53 | edge below/.is if = dt@linkbelow, 54 | edge below/.default=true, 55 | edge above/.code = {\dt@linkbelowfalse}, 56 | % arc edge 57 | arc edge/.is if = dt@arcedge, 58 | segmented edge/.code = {\dt@arcedgefalse}, 59 | arc angle/.store in = \dt@arcangle, 60 | arc angle = 60, 61 | % an explicit offset to compensate the x position of the edge 62 | edge start x offset/.code = {\pgfmathsetlengthmacro{\dt@EdgeStartOffsetX}{#1}}, 63 | edge start x offset = 0, 64 | edge end x offset/.code = {\pgfmathsetlengthmacro{\dt@EdgeEndOffsetX}{#1}}, 65 | edge end x offset = 0, 66 | % the layer that links should connect 67 | % (should be set to "1" for edge above, and to the number of rows 68 | % in the matrix for edge below) 69 | target layer/.store in = \dt@tgtlayer, 70 | target layer/.default = 0, 71 | target layer = 0, 72 | % styling options 73 | reserved/edge style/.style = {->, >=stealth, black, solid, rounded corners 
= 2, line cap = round, segmented edge}, 74 | edge style/.style = {reserved/edge style/.append style = {#1}}, 75 | reserved/label style/.style = { 76 | anchor = mid, 77 | draw, solid, 78 | black, 79 | scale = .7, 80 | text height = 1.5ex, text depth = 0.25ex, % needed to center text vertically 81 | inner sep=.5ex, 82 | outer sep = 0pt, 83 | rounded corners = 2pt, 84 | text = black, 85 | fill = white}, 86 | label style/.style = {reserved/label style/.append style = {#1}}, 87 | hide label/.style = {reserved/label style/.append style = {opacity = 0, text opacity = 0}}, 88 | show label/.style = {reserved/label style/.append style = {opacity = 1, text opacity = 1}}, 89 | text only label/.style = {reserved/label style/.append style = {opacity=0, text opacity=1}}, 90 | reserved/text style/.style = { 91 | text height=1.5ex, text depth = 0.25ex, % needed to center text vertically 92 | inner sep = .5ex}, 93 | text style/.style = {reserved/text style/.append style = {#1}}, 94 | % group styling 95 | reserved/group style/.style = { 96 | inner sep = 0, 97 | draw, solid, 98 | outer sep = .5ex, 99 | rounded corners = 2pt}, 100 | group style/.style = {reserved/group style/.append style = {#1}}, 101 | % themes for text 102 | text theme/.is choice, 103 | text theme/default/.style = {text style={black}}, 104 | text theme/brazil/.style = {text style={blue!60!black}}, 105 | text theme/iron/.style = {text style={black!80}}, 106 | text theme/copper/.style = {text style={brown!60!black}}, 107 | text theme/night/.style = {text style={black}}, 108 | text theme/grassy/.style = {text style={green!40!black}}, 109 | text theme/simple/.style = {text style={black}}, 110 | % themes for labels 111 | label theme/.is choice, 112 | label theme/default/.style = {label style={fill=white, draw=black}}, 113 | label theme/night/.style = {label style={text=white, fill=black, font=\bfseries}}, 114 | label theme/brazil/.style = {label style={thick, black, fill=yellow, text=black, font=\bfseries}}, 115 | label theme/iron/.style = {label style={top color=black!60, bottom color=black!80, draw=black!80, text=white, font=\bfseries}}, 116 | label theme/copper/.style = {label style={top color=brown!80!pink, bottom color=brown!60!black, draw=brown!80, text=white, font=\bfseries}}, 117 | label theme/grassy/.style = {label style={bottom color=green!60!black, top color=green!20!black, draw=green!40!black, text=white, font=\bfseries}}, 118 | label theme/simple/.style = {label style={draw=none,fill=none,above,font=\scriptsize}}, 119 | % themes for edges 120 | edge theme/.is choice, 121 | edge theme/default/.style = {edge style={thin,black}}, 122 | edge theme/night/.style = {edge style={thick}}, 123 | edge theme/brazil/.style = {edge style={thick,green!60!black}}, 124 | edge theme/iron/.style = {edge style={thick, black!80}}, 125 | edge theme/copper/.style = {edge style={thick, brown!80}}, 126 | edge theme/grassy/.style = {edge style={thick, green!40!black}}, 127 | edge theme/simple/.style = {arc edge, arc angle=79}, 128 | % themes styles 129 | theme/.style = {label theme = #1, edge theme = #1, text theme = #1}, 130 | } 131 | 132 | \newcommand{\depstyle}[2]{\tikzset{#1/.style = {/depgraph/.cd, #2}}} 133 | 134 | 135 | \newenvironment{dependency}[1][]{% 136 | \begin{tikzpicture}[/depgraph/.cd, #1] 137 | \begin{scope} 138 | }{% 139 | \end{scope}% 140 | \end{tikzpicture}% 141 | } 142 | 143 | \NewEnviron{deptext}[1][]{% 144 | \begin{scope} 145 | \matrix (\dt@depid)[% 146 | nodes = {/depgraph/reserved/text style}, 147 | column sep = 0, 148 | row sep = 

\newlength{\xca}
\newlength{\yca}
\newlength{\xcb}
\newlength{\ycb}

\newcommand{\wordref}[2]{\dt@depid-#1-#2}

\newcommand{\rootref}{\dt@depid-root}

\newcommand{\matrixref}{\dt@depid}

% \storelabelnode
%
% \edef s into #1 the name of the last label node, stored in \dt@lastlabel
%
% #1 - a macro
\newcommand{\storelabelnode}[1]{\edef#1{\dt@lastlabel}}

\newcommand{\storefirstcorner}[1]{\edef#1{\dt@lastlabel-edge-first-corner}}
\newcommand{\storesecondcorner}[1]{\edef#1{\dt@lastlabel-edge-second-corner}}

\newcommand{\wordgroup}[5][]{% options, layer, col-start, col-end, identifier
    \begin{scope}[/depgraph/.cd, #1]
    \pgfonlayer{depgroups}
    \node (#5) [fit = (\wordref{#2}{#3}) (\wordref{#2}{#4}), /depgraph/.cd, reserved/group style, #1] {};
    \endpgfonlayer
    \end{scope}
}
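
% A sketch of how \wordgroup pairs with \groupedge (defined next) to draw the
% bubble parses mentioned in the changelog; the group identifiers "np" and
% "pp" and the 4ex distance are illustrative:
%
%   \wordgroup{1}{1}{2}{np}   % box around columns 1-2 of row 1, named "np"
%   \wordgroup{1}{4}{5}{pp}
%   \groupedge{np}{pp}{dep}{4ex}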

\newcommand{\groupedge}[5][]{% options, source, target, depname, distance
    \begin{scope}[/depgraph/.cd, #1]
    \def\anchorpoint{north}
    \ifdt@linkbelow
        \def\anchorpoint{south}
    \fi
    \pgfextractx{\xca}{\pgfpointanchor{#2}{\anchorpoint}}
    \pgfextractx{\xcb}{\pgfpointanchor{#3}{\anchorpoint}}
    \pgfextracty{\yca}{\pgfpointanchor{#2}{\anchorpoint}}
    \pgfextracty{\ycb}{\pgfpointanchor{#3}{\anchorpoint}}
    \pgfmathsetlengthmacro{\ydiff}{\yca-\ycb}
    \ifdim\xca>\xcb
        \pgfmathsetlengthmacro{\doff}{-\dt@linkoffset}
        \pgfmathsetlengthmacro{\dslant}{-\dt@linkslant}
    \else
        \pgfmathsetlengthmacro{\doff}{\dt@linkoffset}
        \pgfmathsetlengthmacro{\dslant}{\dt@linkslant}
    \fi
    \addtocounter{dt@labelid}{1}
    \xdef\dt@lastlabel{\dt@depid-\the\value{dt@labelid}}
    \pgfmathsetlengthmacro{\dt@startdist}{\dt@linkdist}
    \pgfmathsetlengthmacro{\dt@enddist}{#5}
    \ifdt@linkbelow
        \pgfmathsetlengthmacro{\dt@startdist}{-\dt@startdist}
        \pgfmathsetlengthmacro{\dt@enddist}{-\dt@enddist}
    \fi

    % Calculate edge anchors. -edge-first-corner and -edge-second-corner are only
    % meaningful for segmented edges, but we also draw the nodes for arc edges, as
    % someone may find these anchors useful...
    \node (\dt@lastlabel-edge-origin) [coordinate] at ($(#2.\anchorpoint) + (\doff,\dt@startdist) + (\dt@EdgeStartOffsetX,0)$) {};
    \node (\dt@lastlabel-edge-first-corner) [coordinate] at ($(\dt@lastlabel-edge-origin) + (\dslant,\dt@enddist) + (\dt@EdgeEndOffsetX,0)$) {};
    \node (\dt@lastlabel-edge-second-corner) [coordinate] at ($(#3.\anchorpoint) + (-\dslant,\dt@enddist+\dt@startdist+\ydiff)$) {};
    \node (\dt@lastlabel-edge-endpoint) [coordinate] at ($(#3.\anchorpoint) + (0,\dt@startdist)$) {};

    \ifdt@arcedge
        % link above, left to right
        \pgfmathsetmacro{\dt@arcin}{180-\dt@arcangle}
        \pgfmathsetmacro{\dt@arcout}{\dt@arcangle}
        \ifdt@linkbelow
            % link below
            \pgfmathsetmacro{\dt@arcin}{-\dt@arcin}
            \pgfmathsetmacro{\dt@arcout}{-\dt@arcout}
        \fi
        \ifdim\xca>\xcb
            % right to left
            \pgfmathsetmacro{\dt@temp}{\dt@arcin}
            \pgfmathsetmacro{\dt@arcin}{\dt@arcout}
            \pgfmathsetmacro{\dt@arcout}{\dt@temp}
        \fi
        %\draw [/depgraph/.cd, reserved/edge style, rounded corners = #5/5, #1]
        \draw [out=\dt@arcout, in=\dt@arcin, /depgraph/.cd, reserved/edge style, rounded corners = #5/5, #1]
            (\dt@lastlabel-edge-origin)
            to node (\dt@lastlabel) [/depgraph/.cd, #1, reserved/label style] {#4}
            (\dt@lastlabel-edge-endpoint);
    \else
        \draw [/depgraph/.cd, reserved/edge style, rounded corners = #5/5, #1]
            (\dt@lastlabel-edge-origin) --
            (\dt@lastlabel-edge-first-corner) --
            (\dt@lastlabel-edge-second-corner) --
            (\dt@lastlabel-edge-endpoint);
        \node (\dt@lastlabel) [/depgraph/.cd, #1, reserved/label style] at
            ($ .5*(\dt@lastlabel-edge-second-corner) + .5*(\dt@lastlabel-edge-first-corner) $)
            {#4};
    \fi
    \end{scope}
}

--------------------------------------------------------------------------------
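
A minimal end-to-end sketch of how this style file is typically exercised (the document body, words, and labels are illustrative, not taken from the thesis sources):

\documentclass{article}
\usepackage{tikz-dependency}
\begin{document}
\begin{dependency}[arc edge, arc angle = 70]
  \begin{deptext}
    my \& dog \& barks \\
  \end{deptext}
  \deproot{3}{root}
  \depedge{3}{2}{nsubj}
  \depedge{2}{1}{poss}
\end{dependency}
\end{document}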