├── LICENSE
├── README.md
├── data
│   ├── README.md
│   ├── build_dictionary.py
│   ├── extract_files.py
│   ├── length.py
│   ├── merge.sh
│   ├── multi-bleu.perl
│   ├── nonbreaking_prefixes
│   │   ├── README.txt
│   │   ├── nonbreaking_prefix.ca
│   │   ├── nonbreaking_prefix.cs
│   │   ├── nonbreaking_prefix.de
│   │   ├── nonbreaking_prefix.el
│   │   ├── nonbreaking_prefix.en
│   │   ├── nonbreaking_prefix.es
│   │   ├── nonbreaking_prefix.fi
│   │   ├── nonbreaking_prefix.fr
│   │   ├── nonbreaking_prefix.hu
│   │   ├── nonbreaking_prefix.is
│   │   ├── nonbreaking_prefix.it
│   │   ├── nonbreaking_prefix.lv
│   │   ├── nonbreaking_prefix.nl
│   │   ├── nonbreaking_prefix.pl
│   │   ├── nonbreaking_prefix.pt
│   │   ├── nonbreaking_prefix.ro
│   │   ├── nonbreaking_prefix.ru
│   │   ├── nonbreaking_prefix.sk
│   │   ├── nonbreaking_prefix.sl
│   │   ├── nonbreaking_prefix.sv
│   │   └── nonbreaking_prefix.ta
│   ├── preprocess.sh
│   ├── scan_example.py
│   ├── setup_cluster_env.sh
│   ├── setup_local_env.sh
│   ├── shuffle.py
│   ├── strip_sgml.py
│   ├── tokenize_all.sh
│   └── tokenizer.perl
├── docs
│   ├── cgru.pdf
│   └── cgru.tex
├── session0
│   ├── data_iterator.py
│   ├── lm.py
│   ├── train.sh
│   └── train_lm.py
├── session1
│   ├── README.md
│   ├── data_iterator.py
│   ├── encode.py
│   ├── nmt.py
│   ├── test.sh
│   ├── train.sh
│   ├── train_all.sh
│   ├── train_nmt.py
│   ├── train_nmt_all.py
│   └── translate.py
├── session2
│   ├── README.md
│   ├── data_iterator.py
│   ├── nmt.py
│   ├── test.sh
│   ├── train.sh
│   ├── train_all.sh
│   ├── train_nmt.py
│   ├── train_nmt_all.py
│   └── translate.py
└── session3
    ├── README.md
    ├── data_iterator.py
    ├── lm.py
    ├── nmt.py
    ├── rescore_with_lm.py
    ├── score_nbest.sh
    ├── test.sh
    ├── train.sh
    ├── train_all.sh
    ├── train_nmt.py
    ├── train_nmt_all.py
    └── translate.py
/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Orhan Firat and New York University (Kyunghyun Cho) 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of dl4mt-tutorial nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dl4mt-material 2 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | Scripts and utilities for data pre-processing. 2 | 3 | #### Setup 4 | The easiest way to set up your environment: 5 | 6 | ```bash 7 | $ cd ~; mkdir codes; cd codes 8 | $ git clone https://github.com/nyu-dl/dl4mt-tutorial 9 | $ cd dl4mt-tutorial/data 10 | $ ./setup_local_env.sh 11 | ``` 12 | 13 | This first clones the repository under `~/codes/dl4mt-tutorial` 14 | and then runs the `setup_local_env.sh` script, which retrieves the example data 15 | and preprocesses it. 16 | 17 | #### Pre-processing 18 | The following steps are executed by `setup_local_env.sh`: 19 | 1. Clone the `dl4mt-tutorial` repository (if not already cloned) 20 | 2. Download `europarl-v7.fr-en` (training) and `newstest2011` (development) 21 | 3. Preprocess the training and development sets 22 | * Tokenize using the Moses tokenizer 23 | * Shuffle the training set for SGD 24 | * Build the source and target dictionaries 25 | 26 | #### Pre-processing with subword-units 27 | If you want to use subword units (e.g. [Byte Pair Encoding](https://github.com/rsennrich/subword-nmt)) for source and target tokens, simply call: 28 | ```bash 29 | $ ./setup_local_env.sh -b 30 | ``` 31 | This replaces the third step above and executes the following steps instead: 32 | 1. Clone the `dl4mt-tutorial` repository (if not already cloned) 33 | 2. Download `europarl-v7.fr-en` (training) and `newstest2011` (development) 34 | 3. Preprocess the training and development sets (`preprocess.sh`) 35 | * Tokenize the source and target sides of all bitext 36 | * Learn BPE codes for the source and target sides from the training sets 37 | * Encode the source and target sides with the learned codes 38 | * Shuffle the training set for SGD 39 | * Build the source and target dictionaries 40 | 41 | If you want to preprocess your own data with BPE, you can use the `preprocess.sh` script directly. 42 | 43 | For usage and further details, please check the comments in the scripts; a quick way to inspect the generated dictionaries is shown below.
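The dictionaries written by `build_dictionary.py` are plain Python pickles, so the result can be sanity-checked from an interpreter. This is a minimal sketch; the `.pkl` filename is only an example, and the actual names depend on your corpus files:

```python
import pickle

# Load a vocabulary pickle written by build_dictionary.py
# (hypothetical filename; one .pkl is produced per input corpus).
with open('all_fr-en.fr.tok.pkl', 'rb') as f:
    worddict = pickle.load(f)

# Indices 0 and 1 are reserved for the end-of-sentence and
# unknown-word markers; corpus tokens start at index 2,
# ordered from most to least frequent.
assert worddict['eos'] == 0
assert worddict['UNK'] == 1
print('%d entries' % len(worddict))
```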
44 | -------------------------------------------------------------------------------- /data/build_dictionary.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy 4 | 5 | try: 6 | import cPickle as pkl 7 | except ImportError: 8 | import pickle as pkl 9 | 10 | import sys 11 | import fileinput 12 | 13 | from collections import OrderedDict 14 | 15 | def main(): 16 | for filename in sys.argv[1:]: 17 | print('Processing', filename) 18 | word_freqs = OrderedDict() 19 | with open(filename, 'r') as f: 20 | for line in f: 21 | words_in = line.strip().split(' ') 22 | for w in words_in: 23 | if w not in word_freqs: 24 | word_freqs[w] = 0 25 | word_freqs[w] += 1 26 | words = list(word_freqs.keys()) 27 | freqs = list(word_freqs.values()) 28 | 29 | sorted_idx = numpy.argsort(freqs) 30 | sorted_words = [words[ii] for ii in sorted_idx[::-1]] 31 | 32 | worddict = OrderedDict() 33 | worddict['eos'] = 0 34 | worddict['UNK'] = 1 35 | for ii, ww in enumerate(sorted_words): 36 | worddict[ww] = ii+2 37 | 38 | with open('%s.pkl'%filename, 'wb') as f: 39 | pkl.dump(worddict, f) 40 | 41 | print('Done') 42 | 43 | if __name__ == '__main__': 44 | main() 45 |
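# Usage sketch (filenames are examples only):
#   python build_dictionary.py all_fr-en.fr.tok all_fr-en.en.tok
# For each input corpus CORPUS this writes CORPUS.pkl, an OrderedDict that
# maps every token to an integer index: 'eos' -> 0, 'UNK' -> 1, and the
# corpus vocabulary from index 2 upward in decreasing order of frequency.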
-------------------------------------------------------------------------------- /data/extract_files.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import argparse 4 | import logging 5 | import os 6 | import tarfile 7 | 8 | TRAIN_DATA_URL = 'http://www.statmt.org/europarl/v7/fr-en.tgz' 9 | VALID_DATA_URL = 'http://matrix.statmt.org/test_sets/newstest2011.tgz' 10 | 11 | parser = argparse.ArgumentParser( 12 | description=""" 13 | This script extracts the downloaded parallel corpora, given source and target 14 | language indicators. Adapted from 15 | https://github.com/orhanf/blocks-examples/tree/master/machine_translation 16 | """, formatter_class=argparse.RawTextHelpFormatter) 17 | parser.add_argument("-s", "--source", type=str, help="Source language", 18 | default="fr") 19 | parser.add_argument("-t", "--target", type=str, help="Target language", 20 | default="en") 21 | parser.add_argument("--source-dev", type=str, default="newstest2011.fr", 22 | help="Source language dev filename") 23 | parser.add_argument("--target-dev", type=str, default="newstest2011.en", 24 | help="Target language dev filename") 25 | parser.add_argument("--outdir", type=str, default=".", 26 | help="Output directory") 27 | 28 | 29 | def extract_tar_file_to(file_to_extract, extract_into, names_to_look): 30 | extracted_filenames = [] 31 | try: 32 | logger.info("Extracting file [{}] into [{}]" 33 | .format(file_to_extract, extract_into)) 34 | tar = tarfile.open(file_to_extract, 'r') 35 | src_trg_files = [ff for ff in tar.getnames() 36 | if any([ff.find(nn) > -1 for nn in names_to_look])] 37 | if not len(src_trg_files): 38 | raise ValueError("[{}] pair does not exist in the archive!" 39 | .format(names_to_look)) 40 | for item in tar: 41 | # extract only the source-target pair 42 | if item.name in src_trg_files: 43 | file_path = os.path.join(extract_into, item.path) 44 | if not os.path.exists(file_path): 45 | logger.info("...extracting [{}] into [{}]" 46 | .format(item.name, file_path)) 47 | tar.extract(item, extract_into) 48 | else: 49 | logger.info("...file exists [{}]".format(file_path)) 50 | extracted_filenames.append( 51 | os.path.join(extract_into, item.path)) 52 | except Exception as e: 53 | logger.error("{}".format(str(e))) 54 | return extracted_filenames 55 | 56 | 57 | def main(): 58 | train_data_file = os.path.join(args.outdir, 'train_data.tgz') 59 | valid_data_file = os.path.join(args.outdir, 'valid_data.tgz') 60 | 61 | # Extract the europarl v7 training set 62 | extract_tar_file_to( 63 | train_data_file, os.path.dirname(train_data_file), 64 | ["{}-{}".format(args.source, args.target)]) 65 | 66 | # Extract the development set 67 | extract_tar_file_to( 68 | valid_data_file, os.path.dirname(valid_data_file), 69 | [args.source_dev, args.target_dev]) 70 | 71 | 72 | if __name__ == "__main__": 73 | 74 | logging.basicConfig(level=logging.INFO) 75 | logger = logging.getLogger('prepare_data') 76 | 77 | args = parser.parse_args() 78 | main() 79 | -------------------------------------------------------------------------------- /data/length.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy 4 | import sys 5 | 6 | for name in sys.argv[1:]: 7 | lens = [] 8 | with open(name, 'r') as f: 9 | for ll in f: 10 | lens.append(len(ll.strip().split(' '))) 11 | print(name, ' max ', numpy.max(lens), ' min ', numpy.min(lens), ' average ', numpy.mean(lens)) 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /data/merge.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script merges all the bitext files under a given data directory. 3 | # Source side files are concatenated into all_[src]-[trg].[src] 4 | # Target side files are concatenated into all_[src]-[trg].[trg] 5 | 6 | if [ "$#" -ne 3 ]; then 7 | echo "" 8 | echo "Usage: $0 src trg path_to_data" 9 | echo "" 10 | exit 1 11 | fi 12 | 13 | SRC=$1 14 | TRG=$2 15 | 16 | DATA_DIR=$3 17 | 18 | FSRC=${DATA_DIR}/all_${SRC}-${TRG}.${SRC} 19 | FTRG=${DATA_DIR}/all_${SRC}-${TRG}.${TRG} 20 | 21 | : > $FSRC 22 | for F in ${DATA_DIR}/*${SRC}-${TRG}.${SRC} 23 | do 24 | if [ "$F" = "$FSRC" ]; then 25 | echo "pass" 26 | else 27 | cat $F >> $FSRC 28 | fi 29 | done 30 | 31 | 32 | : > $FTRG 33 | for F in ${DATA_DIR}/*${SRC}-${TRG}.${TRG} 34 | do 35 | if [ "$F" = "$FTRG" ]; then 36 | echo "pass" 37 | else 38 | cat $F >> $FTRG 39 | fi 40 | done 41 | -------------------------------------------------------------------------------- /data/multi-bleu.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version.
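#
# For reference, the score printed by this script is the standard
# corpus-level BLEU-4:
#   BLEU = BP * exp((log p1 + log p2 + log p3 + log p4) / 4)
# where p_n is the modified n-gram precision accumulated over the whole
# test set, and the brevity penalty BP equals 1 when hyp_len >= ref_len
# and exp(1 - ref_len / hyp_len) otherwise (see the computation after
# the main loop below).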
5 | 6 | # $Id$ 7 | use warnings; 8 | use strict; 9 | 10 | my $lowercase = 0; 11 | if ($ARGV[0] eq "-lc") { 12 | $lowercase = 1; 13 | shift; 14 | } 15 | 16 | my $stem = $ARGV[0]; 17 | if (!defined $stem) { 18 | print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n"; 19 | print STDERR "Reads the references from reference or reference0, reference1, ...\n"; 20 | exit(1); 21 | } 22 | 23 | $stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0"; 24 | 25 | my @REF; 26 | my $ref=0; 27 | while(-e "$stem$ref") { 28 | &add_to_ref("$stem$ref",\@REF); 29 | $ref++; 30 | } 31 | &add_to_ref($stem,\@REF) if -e $stem; 32 | die("ERROR: could not find reference file $stem") unless scalar @REF; 33 | 34 | sub add_to_ref { 35 | my ($file,$REF) = @_; 36 | my $s=0; 37 | open(REF,$file) or die "Can't read $file"; 38 | while(<REF>) { 39 | chop; 40 | push @{$$REF[$s++]}, $_; 41 | } 42 | close(REF); 43 | } 44 | 45 | my(@CORRECT,@TOTAL,$length_translation,$length_reference); 46 | my $s=0; 47 | while(<STDIN>) { 48 | chop; 49 | $_ = lc if $lowercase; 50 | my @WORD = split; 51 | my %REF_NGRAM = (); 52 | my $length_translation_this_sentence = scalar(@WORD); 53 | my ($closest_diff,$closest_length) = (9999,9999); 54 | foreach my $reference (@{$REF[$s]}) { 55 | # print "$s $_ <=> $reference\n"; 56 | $reference = lc($reference) if $lowercase; 57 | my @WORD = split(' ',$reference); 58 | my $length = scalar(@WORD); 59 | my $diff = abs($length_translation_this_sentence-$length); 60 | if ($diff < $closest_diff) { 61 | $closest_diff = $diff; 62 | $closest_length = $length; 63 | # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n"; 64 | } elsif ($diff == $closest_diff) { 65 | $closest_length = $length if $length < $closest_length; 66 | # from two references with the same closeness to me 67 | # take the *shorter* into account, not the "first" one. 68 | } 69 | for(my $n=1;$n<=4;$n++) { 70 | my %REF_NGRAM_N = (); 71 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 72 | my $ngram = "$n"; 73 | for(my $w=0;$w<$n;$w++) { 74 | $ngram .= " ".$WORD[$start+$w]; 75 | } 76 | $REF_NGRAM_N{$ngram}++; 77 | } 78 | foreach my $ngram (keys %REF_NGRAM_N) { 79 | if (!defined($REF_NGRAM{$ngram}) || 80 | $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) { 81 | $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram}; 82 | # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}<BR>\n"; 83 | } 84 | } 85 | } 86 | } 87 | $length_translation += $length_translation_this_sentence; 88 | $length_reference += $closest_length; 89 | for(my $n=1;$n<=4;$n++) { 90 | my %T_NGRAM = (); 91 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 92 | my $ngram = "$n"; 93 | for(my $w=0;$w<$n;$w++) { 94 | $ngram .= " ".$WORD[$start+$w]; 95 | } 96 | $T_NGRAM{$ngram}++; 97 | } 98 | foreach my $ngram (keys %T_NGRAM) { 99 | $ngram =~ /^(\d+) /; 100 | my $n = $1; 101 | # my $corr = 0; 102 | # print "$i e $ngram $T_NGRAM{$ngram}<BR>\n"; 103 | $TOTAL[$n] += $T_NGRAM{$ngram}; 104 | if (defined($REF_NGRAM{$ngram})) { 105 | if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) { 106 | $CORRECT[$n] += $T_NGRAM{$ngram}; 107 | # $corr = $T_NGRAM{$ngram}; 108 | # print "$i e correct1 $T_NGRAM{$ngram}<BR>\n"; 109 | } 110 | else { 111 | $CORRECT[$n] += $REF_NGRAM{$ngram}; 112 | # $corr = $REF_NGRAM{$ngram}; 113 | # print "$i e correct2 $REF_NGRAM{$ngram}<BR>\n"; 114 | } 115 | } 116 | # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram}; 117 | # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n" 118 | } 119 | } 120 | $s++; 121 | } 122 | my $brevity_penalty = 1; 123 | my $bleu = 0; 124 | 125 | my @bleu=(); 126 | 127 | for(my $n=1;$n<=4;$n++) { 128 | if (defined ($TOTAL[$n])){ 129 | $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0; 130 | # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n"; 131 | }else{ 132 | $bleu[$n]=0; 133 | } 134 | } 135 | 136 | if ($length_reference==0){ 137 | printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n"; 138 | exit(1); 139 | } 140 | 141 | if ($length_translation<$length_reference) { 142 | $brevity_penalty = exp(1-$length_reference/$length_translation); 143 | } 144 | $bleu = $brevity_penalty * exp((my_log( $bleu[1] ) + 145 | my_log( $bleu[2] ) + 146 | my_log( $bleu[3] ) + 147 | my_log( $bleu[4] ) ) / 4) ; 148 | printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n", 149 | 100*$bleu, 150 | 100*$bleu[1], 151 | 100*$bleu[2], 152 | 100*$bleu[3], 153 | 100*$bleu[4], 154 | $brevity_penalty, 155 | $length_translation / $length_reference, 156 | $length_translation, 157 | $length_reference; 158 | 159 | sub my_log { 160 | return -9999999999 unless $_[0]; 161 | return log($_[0]); 162 | } 163 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/README.txt: -------------------------------------------------------------------------------- 1 | The language suffix can be found here: 2 | 3 | http://www.loc.gov/standards/iso639-2/php/code_list.php 4 | 5 | This code includes data from Daniel Naber's Language Tools (Czech abbreviations). 6 | This code includes data from the Czech Wiktionary (also Czech abbreviations). 7 | 8 | 9 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.ca: -------------------------------------------------------------------------------- 1 | Dr 2 | Dra 3 | pàg 4 | p 5 | c 6 | av 7 | Sr 8 | Sra 9 | adm 10 | esq 11 | Prof 12 | S.A 13 | S.L 14 | p.e 15 | ptes 16 | Sta 17 | St 18 | pl 19 | màx 20 | cast 21 | dir 22 | nre 23 | fra 24 | admdora 25 | Emm 26 | Excma 27 | espf 28 | dc 29 | admdor 30 | tel 31 | angl 32 | aprox 33 | ca 34 | dept 35 | dj 36 | dl 37 | dt 38 | ds 39 | dg 40 | dv 41 | ed 42 | entl 43 | al 44 | i.e 45 | maj 46 | smin 47 | n 48 | núm 49 | pta 50 | A 51 | B 52 | C 53 | D 54 | E 55 | F 56 | G 57 | H 58 | I 59 | J 60 | K 61 | L 62 | M 63 | N 64 | O 65 | P 66 | Q 67 | R 68 | S 69 | T 70 | U 71 | V 72 | W 73 | X 74 | Y 75 | Z 76 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.cs: -------------------------------------------------------------------------------- 1 | Bc 2 | BcA 3 | Ing 4 | Ing.arch 5 | MUDr 6 | MVDr 7 | MgA 8 | Mgr 9 | JUDr 10 | PhDr 11 | RNDr 12 | PharmDr 13 | ThLic 14 | ThDr 15 | Ph.D 16 | Th.D 17 | prof 18 | doc 19 | CSc 20 | DrSc 21 | dr. h. 
c 22 | PaedDr 23 | Dr 24 | PhMr 25 | DiS 26 | abt 27 | ad 28 | a.i 29 | aj 30 | angl 31 | anon 32 | apod 33 | atd 34 | atp 35 | aut 36 | bd 37 | biogr 38 | b.m 39 | b.p 40 | b.r 41 | cca 42 | cit 43 | cizojaz 44 | c.k 45 | col 46 | čes 47 | čín 48 | čj 49 | ed 50 | facs 51 | fasc 52 | fol 53 | fot 54 | franc 55 | h.c 56 | hist 57 | hl 58 | hrsg 59 | ibid 60 | il 61 | ind 62 | inv.č 63 | jap 64 | jhdt 65 | jv 66 | koed 67 | kol 68 | korej 69 | kl 70 | krit 71 | lat 72 | lit 73 | m.a 74 | maď 75 | mj 76 | mp 77 | násl 78 | např 79 | nepubl 80 | něm 81 | no 82 | nr 83 | n.s 84 | okr 85 | odd 86 | odp 87 | obr 88 | opr 89 | orig 90 | phil 91 | pl 92 | pokrač 93 | pol 94 | port 95 | pozn 96 | př.kr 97 | př.n.l 98 | přel 99 | přeprac 100 | příl 101 | pseud 102 | pt 103 | red 104 | repr 105 | resp 106 | revid 107 | rkp 108 | roč 109 | roz 110 | rozš 111 | samost 112 | sect 113 | sest 114 | seš 115 | sign 116 | sl 117 | srv 118 | stol 119 | sv 120 | šk 121 | šk.ro 122 | špan 123 | tab 124 | t.č 125 | tis 126 | tj 127 | tř 128 | tzv 129 | univ 130 | uspoř 131 | vol 132 | vl.jm 133 | vs 134 | vyd 135 | vyobr 136 | zal 137 | zejm 138 | zkr 139 | zprac 140 | zvl 141 | n.p 142 | např 143 | než 144 | MUDr 145 | abl 146 | absol 147 | adj 148 | adv 149 | ak 150 | ak. sl 151 | akt 152 | alch 153 | amer 154 | anat 155 | angl 156 | anglosas 157 | arab 158 | arch 159 | archit 160 | arg 161 | astr 162 | astrol 163 | att 164 | bás 165 | belg 166 | bibl 167 | biol 168 | boh 169 | bot 170 | bulh 171 | círk 172 | csl 173 | č 174 | čas 175 | čes 176 | dat 177 | děj 178 | dep 179 | dět 180 | dial 181 | dór 182 | dopr 183 | dosl 184 | ekon 185 | epic 186 | etnonym 187 | eufem 188 | f 189 | fam 190 | fem 191 | fil 192 | film 193 | form 194 | fot 195 | fr 196 | fut 197 | fyz 198 | gen 199 | geogr 200 | geol 201 | geom 202 | germ 203 | gram 204 | hebr 205 | herald 206 | hist 207 | hl 208 | hovor 209 | hud 210 | hut 211 | chcsl 212 | chem 213 | ie 214 | imp 215 | impf 216 | ind 217 | indoevr 218 | inf 219 | instr 220 | interj 221 | ión 222 | iron 223 | it 224 | kanad 225 | katalán 226 | klas 227 | kniž 228 | komp 229 | konj 230 | 231 | konkr 232 | kř 233 | kuch 234 | lat 235 | lék 236 | les 237 | lid 238 | lit 239 | liturg 240 | lok 241 | log 242 | m 243 | mat 244 | meteor 245 | metr 246 | mod 247 | ms 248 | mysl 249 | n 250 | náb 251 | námoř 252 | neklas 253 | něm 254 | nesklon 255 | nom 256 | ob 257 | obch 258 | obyč 259 | ojed 260 | opt 261 | part 262 | pas 263 | pejor 264 | pers 265 | pf 266 | pl 267 | plpf 268 | 269 | práv 270 | prep 271 | předl 272 | přivl 273 | r 274 | rcsl 275 | refl 276 | reg 277 | rkp 278 | ř 279 | řec 280 | s 281 | samohl 282 | sg 283 | sl 284 | souhl 285 | spec 286 | srov 287 | stfr 288 | střv 289 | stsl 290 | subj 291 | subst 292 | superl 293 | sv 294 | sz 295 | táz 296 | tech 297 | telev 298 | teol 299 | trans 300 | typogr 301 | var 302 | vedl 303 | verb 304 | vl. jm 305 | voj 306 | vok 307 | vůb 308 | vulg 309 | výtv 310 | vztaž 311 | zahr 312 | zájm 313 | zast 314 | zejm 315 | 316 | zeměd 317 | zkr 318 | zř 319 | mj 320 | dl 321 | atp 322 | sport 323 | Mgr 324 | horn 325 | MVDr 326 | JUDr 327 | RSDr 328 | Bc 329 | PhDr 330 | ThDr 331 | Ing 332 | aj 333 | apod 334 | PharmDr 335 | pomn 336 | ev 337 | slang 338 | nprap 339 | odp 340 | dop 341 | pol 342 | st 343 | stol 344 | p. n. l 345 | před n. l 346 | n. l 347 | př. Kr 348 | po Kr 349 | př. n. l 350 | odd 351 | RNDr 352 | tzv 353 | atd 354 | tzn 355 | resp 356 | tj 357 | p 358 | br 359 | č. j 360 | čj 361 | č. p 362 | čp 363 | a. 
s 364 | s. r. o 365 | spol. s r. o 366 | p. o 367 | s. p 368 | v. o. s 369 | k. s 370 | o. p. s 371 | o. s 372 | v. r 373 | v z 374 | ml 375 | vč 376 | kr 377 | mld 378 | hod 379 | popř 380 | ap 381 | event 382 | rus 383 | slov 384 | rum 385 | švýc 386 | P. T 387 | zvl 388 | hor 389 | dol 390 | S.O.S -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.de: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | #no german words end in single lower-case letters, so we throw those in too. 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | 61 | #Roman Numerals. A dot after one of these is not a sentence break in German. 62 | I 63 | II 64 | III 65 | IV 66 | V 67 | VI 68 | VII 69 | VIII 70 | IX 71 | X 72 | XI 73 | XII 74 | XIII 75 | XIV 76 | XV 77 | XVI 78 | XVII 79 | XVIII 80 | XIX 81 | XX 82 | i 83 | ii 84 | iii 85 | iv 86 | v 87 | vi 88 | vii 89 | viii 90 | ix 91 | x 92 | xi 93 | xii 94 | xiii 95 | xiv 96 | xv 97 | xvi 98 | xvii 99 | xviii 100 | xix 101 | xx 102 | 103 | #Titles and Honorifics 104 | Adj 105 | Adm 106 | Adv 107 | Asst 108 | Bart 109 | Bldg 110 | Brig 111 | Bros 112 | Capt 113 | Cmdr 114 | Col 115 | Comdr 116 | Con 117 | Corp 118 | Cpl 119 | DR 120 | Dr 121 | Ens 122 | Gen 123 | Gov 124 | Hon 125 | Hosp 126 | Insp 127 | Lt 128 | MM 129 | MR 130 | MRS 131 | MS 132 | Maj 133 | Messrs 134 | Mlle 135 | Mme 136 | Mr 137 | Mrs 138 | Ms 139 | Msgr 140 | Op 141 | Ord 142 | Pfc 143 | Ph 144 | Prof 145 | Pvt 146 | Rep 147 | Reps 148 | Res 149 | Rev 150 | Rt 151 | Sen 152 | Sens 153 | Sfc 154 | Sgt 155 | Sr 156 | St 157 | Supt 158 | Surg 159 | 160 | #Misc symbols 161 | Mio 162 | Mrd 163 | bzw 164 | v 165 | vs 166 | usw 167 | d.h 168 | z.B 169 | u.a 170 | etc 171 | Mrd 172 | MwSt 173 | ggf 174 | d.J 175 | D.h 176 | m.E 177 | vgl 178 | I.F 179 | z.T 180 | sogen 181 | ff 182 | u.E 183 | g.U 184 | g.g.A 185 | c.-à-d 186 | Buchst 187 | u.s.w 188 | sog 189 | u.ä 190 | Std 191 | evtl 192 | Zt 193 | Chr 194 | u.U 195 | o.ä 196 | Ltd 197 | b.A 198 | z.Zt 199 | spp 200 | sen 201 | SA 202 | k.o 203 | jun 204 | i.H.v 205 | dgl 206 | dergl 207 | Co 208 | zzt 209 | usf 210 | s.p.a 211 | Dkr 212 | Corp 213 | bzgl 214 | BSE 215 | 216 | #Number indicators 217 | # add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it 218 | No 219 | Nos 220 | Art 221 | Nr 222 | pp 223 | ca 224 | Ca 225 | 226 | #Ordinals are done with . in German - "1." 
= "1st" in English 227 | 1 228 | 2 229 | 3 230 | 4 231 | 5 232 | 6 233 | 7 234 | 8 235 | 9 236 | 10 237 | 11 238 | 12 239 | 13 240 | 14 241 | 15 242 | 16 243 | 17 244 | 18 245 | 19 246 | 20 247 | 21 248 | 22 249 | 23 250 | 24 251 | 25 252 | 26 253 | 27 254 | 28 255 | 29 256 | 30 257 | 31 258 | 32 259 | 33 260 | 34 261 | 35 262 | 36 263 | 37 264 | 38 265 | 39 266 | 40 267 | 41 268 | 42 269 | 43 270 | 44 271 | 45 272 | 46 273 | 47 274 | 48 275 | 49 276 | 50 277 | 51 278 | 52 279 | 53 280 | 54 281 | 55 282 | 56 283 | 57 284 | 58 285 | 59 286 | 60 287 | 61 288 | 62 289 | 63 290 | 64 291 | 65 292 | 66 293 | 67 294 | 68 295 | 69 296 | 70 297 | 71 298 | 72 299 | 73 300 | 74 301 | 75 302 | 76 303 | 77 304 | 78 305 | 79 306 | 80 307 | 81 308 | 82 309 | 83 310 | 84 311 | 85 312 | 86 313 | 87 314 | 88 315 | 89 316 | 90 317 | 91 318 | 92 319 | 93 320 | 94 321 | 95 322 | 96 323 | 97 324 | 98 325 | 99 326 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.el: -------------------------------------------------------------------------------- 1 | # Sigle letters in upper-case are usually abbreviations of names 2 | Α 3 | Β 4 | Γ 5 | Δ 6 | Ε 7 | Ζ 8 | Η 9 | Θ 10 | Ι 11 | Κ 12 | Λ 13 | Μ 14 | Ν 15 | Ξ 16 | Ο 17 | Π 18 | Ρ 19 | Σ 20 | Τ 21 | Υ 22 | Φ 23 | Χ 24 | Ψ 25 | Ω 26 | 27 | # Includes abbreviations for the Greek language compiled from various sources (Greek grammar books, Greek language related web content). 28 | Άθαν 29 | Έγχρ 30 | Έκθ 31 | Έσδ 32 | Έφ 33 | Όμ 34 | Α΄Έσδρ 35 | Α΄Έσδ 36 | Α΄Βασ 37 | Α΄Θεσ 38 | Α΄Ιω 39 | Α΄Κορινθ 40 | Α΄Κορ 41 | Α΄Μακκ 42 | Α΄Μακ 43 | Α΄Πέτρ 44 | Α΄Πέτ 45 | Α΄Παραλ 46 | Α΄Πε 47 | Α΄Σαμ 48 | Α΄Τιμ 49 | Α΄Χρον 50 | Α΄Χρ 51 | Α.Β.Α 52 | Α.Β 53 | Α.Ε 54 | Α.Κ.Τ.Ο 55 | Αέθλ 56 | Αέτ 57 | Αίλ.Δ 58 | Αίλ.Τακτ 59 | Αίσ 60 | Αββακ 61 | Αβυδ 62 | Αβ 63 | Αγάκλ 64 | Αγάπ 65 | Αγάπ.Αμαρτ.Σ 66 | Αγάπ.Γεωπ 67 | Αγαθάγγ 68 | Αγαθήμ 69 | Αγαθιν 70 | Αγαθοκλ 71 | Αγαθρχ 72 | Αγαθ 73 | Αγαθ.Ιστ 74 | Αγαλλ 75 | Αγαπητ 76 | Αγγ 77 | Αγησ 78 | Αγλ 79 | Αγορ.Κ 80 | Αγρο.Κωδ 81 | Αγρ.Εξ 82 | Αγρ.Κ 83 | Αγ.Γρ 84 | Αδριαν 85 | Αδρ 86 | Αετ 87 | Αθάν 88 | Αθήν 89 | Αθήν.Επιγρ 90 | Αθήν.Επιτ 91 | Αθήν.Ιατρ 92 | Αθήν.Μηχ 93 | Αθανάσ 94 | Αθαν 95 | Αθηνί 96 | Αθηναγ 97 | Αθηνόδ 98 | Αθ 99 | Αθ.Αρχ 100 | Αιλ 101 | Αιλ.Επιστ 102 | Αιλ.ΖΙ 103 | Αιλ.ΠΙ 104 | Αιλ.απ 105 | Αιμιλ 106 | Αιν.Γαζ 107 | Αιν.Τακτ 108 | Αισχίν 109 | Αισχίν.Επιστ 110 | Αισχ 111 | Αισχ.Αγαμ 112 | Αισχ.Αγ 113 | Αισχ.Αλ 114 | Αισχ.Ελεγ 115 | Αισχ.Επτ.Θ 116 | Αισχ.Ευμ 117 | Αισχ.Ικέτ 118 | Αισχ.Ικ 119 | Αισχ.Περσ 120 | Αισχ.Προμ.Δεσμ 121 | Αισχ.Πρ 122 | Αισχ.Χοηφ 123 | Αισχ.Χο 124 | Αισχ.απ 125 | ΑιτΕ 126 | Αιτ 127 | Αλκ 128 | Αλχιας 129 | Αμ.Π.Ο 130 | Αμβ 131 | Αμμών 132 | Αμ. 
133 | Αν.Πειθ.Συμβ.Δικ 134 | Ανακρ 135 | Ανακ 136 | Αναμν.Τόμ 137 | Αναπλ 138 | Ανδ 139 | Ανθλγος 140 | Ανθστης 141 | Αντισθ 142 | Ανχης 143 | Αν 144 | Αποκ 145 | Απρ 146 | Απόδ 147 | Απόφ 148 | Απόφ.Νομ 149 | Απ 150 | Απ.Δαπ 151 | Απ.Διατ 152 | Απ.Επιστ 153 | Αριθ 154 | Αριστοτ 155 | Αριστοφ 156 | Αριστοφ.Όρν 157 | Αριστοφ.Αχ 158 | Αριστοφ.Βάτρ 159 | Αριστοφ.Ειρ 160 | Αριστοφ.Εκκλ 161 | Αριστοφ.Θεσμ 162 | Αριστοφ.Ιππ 163 | Αριστοφ.Λυσ 164 | Αριστοφ.Νεφ 165 | Αριστοφ.Πλ 166 | Αριστοφ.Σφ 167 | Αριστ 168 | Αριστ.Αθ.Πολ 169 | Αριστ.Αισθ 170 | Αριστ.Αν.Πρ 171 | Αριστ.Ζ.Ι 172 | Αριστ.Ηθ.Ευδ 173 | Αριστ.Ηθ.Νικ 174 | Αριστ.Κατ 175 | Αριστ.Μετ 176 | Αριστ.Πολ 177 | Αριστ.Φυσιογν 178 | Αριστ.Φυσ 179 | Αριστ.Ψυχ 180 | Αριστ.Ρητ 181 | Αρμεν 182 | Αρμ 183 | Αρχ.Εκ.Καν.Δ 184 | Αρχ.Ευβ.Μελ 185 | Αρχ.Ιδ.Δ 186 | Αρχ.Νομ 187 | Αρχ.Ν 188 | Αρχ.Π.Ε 189 | Αρ 190 | Αρ.Φορ.Μητρ 191 | Ασμ 192 | Ασμ.ασμ 193 | Αστ.Δ 194 | Αστ.Χρον 195 | Ασ 196 | Ατομ.Γνωμ 197 | Αυγ 198 | Αφρ 199 | Αχ.Νομ 200 | Α 201 | Α.Εγχ.Π 202 | Α.Κ.΄Υδρας 203 | Β΄Έσδρ 204 | Β΄Έσδ 205 | Β΄Βασ 206 | Β΄Θεσ 207 | Β΄Ιω 208 | Β΄Κορινθ 209 | Β΄Κορ 210 | Β΄Μακκ 211 | Β΄Μακ 212 | Β΄Πέτρ 213 | Β΄Πέτ 214 | Β΄Πέ 215 | Β΄Παραλ 216 | Β΄Σαμ 217 | Β΄Τιμ 218 | Β΄Χρον 219 | Β΄Χρ 220 | Β.Ι.Π.Ε 221 | Β.Κ.Τ 222 | Β.Κ.Ψ.Β 223 | Β.Μ 224 | Β.Ο.Α.Κ 225 | Β.Ο.Α 226 | Β.Ο.Δ 227 | Βίβλ 228 | Βαρ 229 | ΒεΘ 230 | Βι.Περ 231 | Βιπερ 232 | Βιργ 233 | Βλγ 234 | Βούλ 235 | Βρ 236 | Γ΄Βασ 237 | Γ΄Μακκ 238 | ΓΕΝμλ 239 | Γέν 240 | Γαλ 241 | Γεν 242 | Γλ 243 | Γν.Ν.Σ.Κρ 244 | Γνωμ 245 | Γν 246 | Γράμμ 247 | Γρηγ.Ναζ 248 | Γρηγ.Νύσ 249 | Γ Νοσ 250 | Γ' Ογκολ 251 | Γ.Ν 252 | Δ΄Βασ 253 | Δ.Β 254 | Δ.Δίκη 255 | Δ.Δίκ 256 | Δ.Ε.Σ 257 | Δ.Ε.Φ.Α 258 | Δ.Ε.Φ 259 | Δ.Εργ.Ν 260 | Δαμ 261 | Δαμ.μνημ.έργ 262 | Δαν 263 | Δασ.Κ 264 | Δεκ 265 | Δελτ.Δικ.Ε.Τ.Ε 266 | Δελτ.Νομ 267 | Δελτ.Συνδ.Α.Ε 268 | Δερμ 269 | Δευτ 270 | Δεύτ 271 | Δημοσθ 272 | Δημόκρ 273 | Δι.Δικ 274 | Διάτ 275 | Διαιτ.Απ 276 | Διαιτ 277 | Διαρκ.Στρατ 278 | Δικ 279 | Διοίκ.Πρωτ 280 | ΔιοικΔνη 281 | Διοικ.Εφ 282 | Διον.Αρ 283 | Διόρθ.Λαθ 284 | Δ.κ.Π 285 | Δνη 286 | Δν 287 | Δογμ.Όρος 288 | Δρ 289 | Δ.τ.Α 290 | Δτ 291 | ΔωδΝομ 292 | Δ.Περ 293 | Δ.Στρ 294 | ΕΔΠολ 295 | ΕΕυρΚ 296 | ΕΙΣ 297 | ΕΝαυτΔ 298 | ΕΣΑμΕΑ 299 | ΕΣΘ 300 | ΕΣυγκΔ 301 | ΕΤρΑξΧρΔ 302 | Ε.Φ.Ε.Τ 303 | Ε.Φ.Ι 304 | Ε.Φ.Ο.Επ.Α 305 | Εβδ 306 | Εβρ 307 | Εγκύκλ.Επιστ 308 | Εγκ 309 | Εε.Αιγ 310 | Εθν.Κ.Τ 311 | Εθν 312 | Ειδ.Δικ.Αγ.Κακ 313 | Εικ 314 | Ειρ.Αθ 315 | Ειρην.Αθ 316 | Ειρην 317 | Έλεγχ 318 | Ειρ 319 | Εισ.Α.Π 320 | Εισ.Ε 321 | Εισ.Ν.Α.Κ 322 | Εισ.Ν.Κ.Πολ.Δ 323 | Εισ.Πρωτ 324 | Εισηγ.Έκθ 325 | Εισ 326 | Εκκλ 327 | Εκκ 328 | Εκ 329 | Ελλ.Δνη 330 | Εν.Ε 331 | Εξ 332 | Επ.Αν 333 | Επ.Εργ.Δ 334 | Επ.Εφ 335 | Επ.Κυπ.Δ 336 | Επ.Μεσ.Αρχ 337 | Επ.Νομ 338 | Επίκτ 339 | Επίκ 340 | Επι.Δ.Ε 341 | Επιθ.Ναυτ.Δικ 342 | Επικ 343 | Επισκ.Ε.Δ 344 | Επισκ.Εμπ.Δικ 345 | Επιστ.Επετ.Αρμ 346 | Επιστ.Επετ 347 | Επιστ.Ιερ 348 | Επιτρ.Προστ.Συνδ.Στελ 349 | Επιφάν 350 | Επτ.Εφ 351 | Επ.Ιρ 352 | Επ.Ι 353 | Εργ.Ασφ.Νομ 354 | Ερμ.Α.Κ 355 | Ερμη.Σ 356 | Εσθ 357 | Εσπερ 358 | Ετρ.Δ 359 | Ευκλ 360 | Ευρ.Δ.Δ.Α 361 | Ευρ.Σ.Δ.Α 362 | Ευρ.ΣτΕ 363 | Ευρατόμ 364 | Ευρ.Άλκ 365 | Ευρ.Ανδρομ 366 | Ευρ.Βάκχ 367 | Ευρ.Εκ 368 | Ευρ.Ελ 369 | Ευρ.Ηλ 370 | Ευρ.Ηρακ 371 | Ευρ.Ηρ 372 | Ευρ.Ηρ.Μαιν 373 | Ευρ.Ικέτ 374 | Ευρ.Ιππόλ 375 | Ευρ.Ιφ.Α 376 | Ευρ.Ιφ.Τ 377 | Ευρ.Ι.Τ 378 | Ευρ.Κύκλ 379 | Ευρ.Μήδ 380 | Ευρ.Ορ 381 | Ευρ.Ρήσ 382 | Ευρ.Τρωάδ 383 | Ευρ.Φοίν 384 | Εφ.Αθ 385 | Εφ.Εν 386 | Εφ.Επ 387 | Εφ.Θρ 388 | Εφ.Θ 389 | Εφ.Ι 390 | Εφ.Κερ 391 | Εφ.Κρ 392 | Εφ.Λ 393 | Εφ.Ν 394 | Εφ.Πατ 395 | Εφ.Πειρ 396 | 
Εφαρμ.Δ.Δ 397 | Εφαρμ 398 | Εφεσ 399 | Εφημ 400 | Εφ 401 | Ζαχ 402 | Ζιγ 403 | Ζυ 404 | Ζχ 405 | ΗΕ.Δ 406 | Ημερ 407 | Ηράκλ 408 | Ηροδ 409 | Ησίοδ 410 | Ησ 411 | Η.Ε.Γ 412 | ΘΗΣ 413 | ΘΡ 414 | Θαλ 415 | Θεοδ 416 | Θεοφ 417 | Θεσ 418 | Θεόδ.Μοψ 419 | Θεόκρ 420 | Θεόφιλ 421 | Θουκ 422 | Θρ 423 | Θρ.Ε 424 | Θρ.Ιερ 425 | Θρ.Ιρ 426 | Ιακ 427 | Ιαν 428 | Ιβ 429 | Ιδθ 430 | Ιδ 431 | Ιεζ 432 | Ιερ 433 | Ιζ 434 | Ιησ 435 | Ιησ.Ν 436 | Ικ 437 | Ιλ 438 | Ιν 439 | Ιουδ 440 | Ιουστ 441 | Ιούδα 442 | Ιούλ 443 | Ιούν 444 | Ιπποκρ 445 | Ιππόλ 446 | Ιρ 447 | Ισίδ.Πηλ 448 | Ισοκρ 449 | Ισ.Ν 450 | Ιωβ 451 | Ιωλ 452 | Ιων 453 | Ιω 454 | ΚΟΣ 455 | ΚΟ.ΜΕ.ΚΟΝ 456 | ΚΠοινΔ 457 | ΚΠολΔ 458 | ΚαΒ 459 | Καλ 460 | Καλ.Τέχν 461 | ΚανΒ 462 | Καν.Διαδ 463 | Κατάργ 464 | Κλ 465 | ΚοινΔ 466 | Κολσ 467 | Κολ 468 | Κον 469 | Κορ 470 | Κος 471 | ΚριτΕπιθ 472 | ΚριτΕ 473 | Κριτ 474 | Κρ 475 | ΚτΒ 476 | ΚτΕ 477 | ΚτΠ 478 | Κυβ 479 | Κυπρ 480 | Κύριλ.Αλεξ 481 | Κύριλ.Ιερ 482 | Λεβ 483 | Λεξ.Σουίδα 484 | Λευϊτ 485 | Λευ 486 | Λκ 487 | Λογ 488 | ΛουκΑμ 489 | Λουκιαν 490 | Λουκ.Έρωτ 491 | Λουκ.Ενάλ.Διάλ 492 | Λουκ.Ερμ 493 | Λουκ.Εταιρ.Διάλ 494 | Λουκ.Ε.Δ 495 | Λουκ.Θε.Δ 496 | Λουκ.Ικ. 497 | Λουκ.Ιππ 498 | Λουκ.Λεξιφ 499 | Λουκ.Μεν 500 | Λουκ.Μισθ.Συν 501 | Λουκ.Ορχ 502 | Λουκ.Περ 503 | Λουκ.Συρ 504 | Λουκ.Τοξ 505 | Λουκ.Τυρ 506 | Λουκ.Φιλοψ 507 | Λουκ.Φιλ 508 | Λουκ.Χάρ 509 | Λουκ. 510 | Λουκ.Αλ 511 | Λοχ 512 | Λυδ 513 | Λυκ 514 | Λυσ 515 | Λωζ 516 | Λ1 517 | Λ2 518 | ΜΟΕφ 519 | Μάρκ 520 | Μέν 521 | Μαλ 522 | Ματθ 523 | Μα 524 | Μιχ 525 | Μκ 526 | Μλ 527 | Μμ 528 | Μον.Δ.Π 529 | Μον.Πρωτ 530 | Μον 531 | Μρ 532 | Μτ 533 | Μχ 534 | Μ.Βασ 535 | Μ.Πλ 536 | ΝΑ 537 | Ναυτ.Χρον 538 | Να 539 | Νδικ 540 | Νεεμ 541 | Νε 542 | Νικ 543 | ΝκΦ 544 | Νμ 545 | ΝοΒ 546 | Νομ.Δελτ.Τρ.Ελ 547 | Νομ.Δελτ 548 | Νομ.Σ.Κ 549 | Νομ.Χρ 550 | Νομ 551 | Νομ.Διεύθ 552 | Νοσ 553 | Ντ 554 | Νόσων 555 | Ν1 556 | Ν2 557 | Ν3 558 | Ν4 559 | Νtot 560 | Ξενοφ 561 | Ξεν 562 | Ξεν.Ανάβ 563 | Ξεν.Απολ 564 | Ξεν.Απομν 565 | Ξεν.Απομ 566 | Ξεν.Ελλ 567 | Ξεν.Ιέρ 568 | Ξεν.Ιππαρχ 569 | Ξεν.Ιππ 570 | Ξεν.Κυρ.Αν 571 | Ξεν.Κύρ.Παιδ 572 | Ξεν.Κ.Π 573 | Ξεν.Λακ.Πολ 574 | Ξεν.Οικ 575 | Ξεν.Προσ 576 | Ξεν.Συμπόσ 577 | Ξεν.Συμπ 578 | Ο΄ 579 | Οβδ 580 | Οβ 581 | ΟικΕ 582 | Οικ 583 | Οικ.Πατρ 584 | Οικ.Σύν.Βατ 585 | Ολομ 586 | Ολ 587 | Ολ.Α.Π 588 | Ομ.Ιλ 589 | Ομ.Οδ 590 | ΟπΤοιχ 591 | Οράτ 592 | Ορθ 593 | ΠΡΟ.ΠΟ 594 | Πίνδ 595 | Πίνδ.Ι 596 | Πίνδ.Νεμ 597 | Πίνδ.Ν 598 | Πίνδ.Ολ 599 | Πίνδ.Παθ 600 | Πίνδ.Πυθ 601 | Πίνδ.Π 602 | ΠαγΝμλγ 603 | Παν 604 | Παρμ 605 | Παροιμ 606 | Παρ 607 | Παυσ 608 | Πειθ.Συμβ 609 | ΠειρΝ 610 | Πελ 611 | ΠεντΣτρ 612 | Πεντ 613 | Πεντ.Εφ 614 | ΠερΔικ 615 | Περ.Γεν.Νοσ 616 | Πετ 617 | Πλάτ 618 | Πλάτ.Αλκ 619 | Πλάτ.Αντ 620 | Πλάτ.Αξίοχ 621 | Πλάτ.Απόλ 622 | Πλάτ.Γοργ 623 | Πλάτ.Ευθ 624 | Πλάτ.Θεαίτ 625 | Πλάτ.Κρατ 626 | Πλάτ.Κριτ 627 | Πλάτ.Λύσ 628 | Πλάτ.Μεν 629 | Πλάτ.Νόμ 630 | Πλάτ.Πολιτ 631 | Πλάτ.Πολ 632 | Πλάτ.Πρωτ 633 | Πλάτ.Σοφ. 
634 | Πλάτ.Συμπ 635 | Πλάτ.Τίμ 636 | Πλάτ.Φαίδρ 637 | Πλάτ.Φιλ 638 | Πλημ 639 | Πλούτ 640 | Πλούτ.Άρατ 641 | Πλούτ.Αιμ 642 | Πλούτ.Αλέξ 643 | Πλούτ.Αλκ 644 | Πλούτ.Αντ 645 | Πλούτ.Αρτ 646 | Πλούτ.Ηθ 647 | Πλούτ.Θεμ 648 | Πλούτ.Κάμ 649 | Πλούτ.Καίσ 650 | Πλούτ.Κικ 651 | Πλούτ.Κράσ 652 | Πλούτ.Κ 653 | Πλούτ.Λυκ 654 | Πλούτ.Μάρκ 655 | Πλούτ.Μάρ 656 | Πλούτ.Περ 657 | Πλούτ.Ρωμ 658 | Πλούτ.Σύλλ 659 | Πλούτ.Φλαμ 660 | Πλ 661 | Ποιν.Δικ 662 | Ποιν.Δ 663 | Ποιν.Ν 664 | Ποιν.Χρον 665 | Ποιν.Χρ 666 | Πολ.Δ 667 | Πολ.Πρωτ 668 | Πολ 669 | Πολ.Μηχ 670 | Πολ.Μ 671 | Πρακτ.Αναθ 672 | Πρακτ.Ολ 673 | Πραξ 674 | Πρμ 675 | Πρξ 676 | Πρωτ 677 | Πρ 678 | Πρ.Αν 679 | Πρ.Λογ 680 | Πταισμ 681 | Πυρ.Καλ 682 | Πόλη 683 | Π.Δ 684 | Π.Δ.Άσμ 685 | ΡΜ.Ε 686 | Ρθ 687 | Ρμ 688 | Ρωμ 689 | ΣΠλημ 690 | Σαπφ 691 | Σειρ 692 | Σολ 693 | Σοφ 694 | Σοφ.Αντιγ 695 | Σοφ.Αντ 696 | Σοφ.Αποσ 697 | Σοφ.Απ 698 | Σοφ.Ηλέκ 699 | Σοφ.Ηλ 700 | Σοφ.Οιδ.Κολ 701 | Σοφ.Οιδ.Τύρ 702 | Σοφ.Ο.Τ 703 | Σοφ.Σειρ 704 | Σοφ.Σολ 705 | Σοφ.Τραχ 706 | Σοφ.Φιλοκτ 707 | Σρ 708 | Σ.τ.Ε 709 | Σ.τ.Π 710 | Στρ.Π.Κ 711 | Στ.Ευρ 712 | Συζήτ 713 | Συλλ.Νομολ 714 | Συλ.Νομ 715 | ΣυμβΕπιθ 716 | Συμπ.Ν 717 | Συνθ.Αμ 718 | Συνθ.Ε.Ε 719 | Συνθ.Ε.Κ 720 | Συνθ.Ν 721 | Σφν 722 | Σφ 723 | Σφ.Σλ 724 | Σχ.Πολ.Δ 725 | Σχ.Συντ.Ε 726 | Σωσ 727 | Σύντ 728 | Σ.Πληρ 729 | ΤΘ 730 | ΤΣ.Δ 731 | Τίτ 732 | Τβ 733 | Τελ.Ενημ 734 | Τελ.Κ 735 | Τερτυλ 736 | Τιμ 737 | Τοπ.Α 738 | Τρ.Ο 739 | Τριμ 740 | Τριμ.Πλ 741 | Τρ.Πλημ 742 | Τρ.Π.Δ 743 | Τ.τ.Ε 744 | Ττ 745 | Τωβ 746 | Υγ 747 | Υπερ 748 | Υπ 749 | Υ.Γ 750 | Φιλήμ 751 | Φιλιπ 752 | Φιλ 753 | Φλμ 754 | Φλ 755 | Φορ.Β 756 | Φορ.Δ.Ε 757 | Φορ.Δνη 758 | Φορ.Δ 759 | Φορ.Επ 760 | Φώτ 761 | Χρ.Ι.Δ 762 | Χρ.Ιδ.Δ 763 | Χρ.Ο 764 | Χρυσ 765 | Ψήφ 766 | Ψαλμ 767 | Ψαλ 768 | Ψλ 769 | Ωριγ 770 | Ωσ 771 | Ω.Ρ.Λ 772 | άγν 773 | άγν.ετυμολ 774 | άγ 775 | άκλ 776 | άνθρ 777 | άπ 778 | άρθρ 779 | άρν 780 | άρ 781 | άτ 782 | άψ 783 | ά 784 | έκδ 785 | έκφρ 786 | έμψ 787 | ένθ.αν 788 | έτ 789 | έ.α 790 | ίδ 791 | αβεστ 792 | αβησσ 793 | αγγλ 794 | αγγ 795 | αδημ 796 | αεροναυτ 797 | αερον 798 | αεροπ 799 | αθλητ 800 | αθλ 801 | αθροιστ 802 | αιγυπτ 803 | αιγ 804 | αιτιολ 805 | αιτ 806 | αι 807 | ακαδ 808 | ακκαδ 809 | αλβ 810 | αλλ 811 | αλφαβητ 812 | αμα 813 | αμερικ 814 | αμερ 815 | αμετάβ 816 | αμτβ 817 | αμφιβ 818 | αμφισβ 819 | αμφ 820 | αμ 821 | ανάλ 822 | ανάπτ 823 | ανάτ 824 | αναβ 825 | αναδαν 826 | αναδιπλασ 827 | αναδιπλ 828 | αναδρ 829 | αναλ 830 | αναν 831 | ανασυλλ 832 | ανατολ 833 | ανατομ 834 | ανατυπ 835 | ανατ 836 | αναφορ 837 | αναφ 838 | ανα.ε 839 | ανδρων 840 | ανθρωπολ 841 | ανθρωπ 842 | ανθ 843 | ανομ 844 | αντίτ 845 | αντδ 846 | αντιγρ 847 | αντιθ 848 | αντικ 849 | αντιμετάθ 850 | αντων 851 | αντ 852 | ανωτ 853 | ανόργ 854 | ανών 855 | αορ 856 | απαρέμφ 857 | απαρφ 858 | απαρχ 859 | απαρ 860 | απλολ 861 | απλοπ 862 | αποβ 863 | αποηχηροπ 864 | αποθ 865 | αποκρυφ 866 | αποφ 867 | απρμφ 868 | απρφ 869 | απρόσ 870 | απόδ 871 | απόλ 872 | απόσπ 873 | απόφ 874 | αραβοτουρκ 875 | αραβ 876 | αραμ 877 | αρβαν 878 | αργκ 879 | αριθμτ 880 | αριθμ 881 | αριθ 882 | αρκτικόλ 883 | αρκ 884 | αρμεν 885 | αρμ 886 | αρνητ 887 | αρσ 888 | αρχαιολ 889 | αρχιτεκτ 890 | αρχιτ 891 | αρχκ 892 | αρχ 893 | αρωμουν 894 | αρωμ 895 | αρ 896 | αρ.μετρ 897 | αρ.φ 898 | ασσυρ 899 | αστρολ 900 | αστροναυτ 901 | αστρον 902 | αττ 903 | αυστραλ 904 | αυτοπ 905 | αυτ 906 | αφγαν 907 | αφηρ 908 | αφομ 909 | αφρικ 910 | αχώρ 911 | αόρ 912 | α.α 913 | α/α 914 | α0 915 | βαθμ 916 | βαθ 917 | βαπτ 918 | βασκ 919 | βεβαιωτ 920 | βεβ 921 | βεδ 922 | βενετ 923 | βεν 924 | 
βερβερ 925 | βιβλγρ 926 | βιολ 927 | βιομ 928 | βιοχημ 929 | βιοχ 930 | βλάχ 931 | βλ 932 | βλ.λ 933 | βοταν 934 | βοτ 935 | βουλγαρ 936 | βουλγ 937 | βούλ 938 | βραζιλ 939 | βρετον 940 | βόρ 941 | γαλλ 942 | γενικότ 943 | γενοβ 944 | γεν 945 | γερμαν 946 | γερμ 947 | γεωγρ 948 | γεωλ 949 | γεωμετρ 950 | γεωμ 951 | γεωπ 952 | γεωργ 953 | γλυπτ 954 | γλωσσολ 955 | γλωσσ 956 | γλ 957 | γνμδ 958 | γνμ 959 | γνωμ 960 | γοτθ 961 | γραμμ 962 | γραμ 963 | γρμ 964 | γρ 965 | γυμν 966 | δίδες 967 | δίκ 968 | δίφθ 969 | δαν 970 | δεικτ 971 | δεκατ 972 | δηλ 973 | δημογρ 974 | δημοτ 975 | δημώδ 976 | δημ 977 | διάγρ 978 | διάκρ 979 | διάλεξ 980 | διάλ 981 | διάσπ 982 | διαλεκτ 983 | διατρ 984 | διαφ 985 | διαχ 986 | διδα 987 | διεθν 988 | διεθ 989 | δικον 990 | διστ 991 | δισύλλ 992 | δισ 993 | διφθογγοπ 994 | δογμ 995 | δολ 996 | δοτ 997 | δρμ 998 | δρχ 999 | δρ(α) 1000 | δωρ 1001 | δ 1002 | εβρ 1003 | εγκλπ 1004 | εδ 1005 | εθνολ 1006 | εθν 1007 | ειδικότ 1008 | ειδ 1009 | ειδ.β 1010 | εικ 1011 | ειρ 1012 | εισ 1013 | εκατοστμ 1014 | εκατοστ 1015 | εκατστ.2 1016 | εκατστ.3 1017 | εκατ 1018 | εκδ 1019 | εκκλησ 1020 | εκκλ 1021 | εκ 1022 | ελλην 1023 | ελλ 1024 | ελνστ 1025 | ελπ 1026 | εμβ 1027 | εμφ 1028 | εναλλ 1029 | ενδ 1030 | ενεργ 1031 | ενεστ 1032 | ενικ 1033 | ενν 1034 | εν 1035 | εξέλ 1036 | εξακολ 1037 | εξομάλ 1038 | εξ 1039 | εο 1040 | επέκτ 1041 | επίδρ 1042 | επίθ 1043 | επίρρ 1044 | επίσ 1045 | επαγγελμ 1046 | επανάλ 1047 | επανέκδ 1048 | επιθ 1049 | επικ 1050 | επιμ 1051 | επιρρ 1052 | επιστ 1053 | επιτατ 1054 | επιφ 1055 | επών 1056 | επ 1057 | εργ 1058 | ερμ 1059 | ερρινοπ 1060 | ερωτ 1061 | ετρουσκ 1062 | ετυμ 1063 | ετ 1064 | ευφ 1065 | ευχετ 1066 | εφ 1067 | εύχρ 1068 | ε.α 1069 | ε/υ 1070 | ε0 1071 | ζωγρ 1072 | ζωολ 1073 | ηθικ 1074 | ηθ 1075 | ηλεκτρολ 1076 | ηλεκτρον 1077 | ηλεκτρ 1078 | ημίτ 1079 | ημίφ 1080 | ημιφ 1081 | ηχηροπ 1082 | ηχηρ 1083 | ηχομιμ 1084 | ηχ 1085 | η 1086 | θέατρ 1087 | θεολ 1088 | θετ 1089 | θηλ 1090 | θρακ 1091 | θρησκειολ 1092 | θρησκ 1093 | θ 1094 | ιαπων 1095 | ιατρ 1096 | ιδιωμ 1097 | ιδ 1098 | ινδ 1099 | ιραν 1100 | ισπαν 1101 | ιστορ 1102 | ιστ 1103 | ισχυροπ 1104 | ιταλ 1105 | ιχθυολ 1106 | ιων 1107 | κάτ 1108 | καθ 1109 | κακοσ 1110 | καν 1111 | καρ 1112 | κατάλ 1113 | κατατ 1114 | κατωτ 1115 | κατ 1116 | κα 1117 | κελτ 1118 | κεφ 1119 | κινεζ 1120 | κινημ 1121 | κλητ 1122 | κλιτ 1123 | κλπ 1124 | κλ 1125 | κν 1126 | κοινωνιολ 1127 | κοινων 1128 | κοπτ 1129 | κουτσοβλαχ 1130 | κουτσοβλ 1131 | κπ 1132 | κρ.γν 1133 | κτγ 1134 | κτην 1135 | κτητ 1136 | κτλ 1137 | κτ 1138 | κυριολ 1139 | κυρ 1140 | κύρ 1141 | κ 1142 | κ.ά 1143 | κ.ά.π 1144 | κ.α 1145 | κ.εξ 1146 | κ.επ 1147 | κ.ε 1148 | κ.λπ 1149 | κ.λ.π 1150 | κ.ού.κ 1151 | κ.ο.κ 1152 | κ.τ.λ 1153 | κ.τ.τ 1154 | κ.τ.ό 1155 | λέξ 1156 | λαογρ 1157 | λαπ 1158 | λατιν 1159 | λατ 1160 | λαϊκότρ 1161 | λαϊκ 1162 | λετ 1163 | λιθ 1164 | λογιστ 1165 | λογοτ 1166 | λογ 1167 | λουβ 1168 | λυδ 1169 | λόγ 1170 | λ 1171 | λ.χ 1172 | μέλλ 1173 | μέσ 1174 | μαθημ 1175 | μαθ 1176 | μαιευτ 1177 | μαλαισ 1178 | μαλτ 1179 | μαμμων 1180 | μεγεθ 1181 | μεε 1182 | μειωτ 1183 | μελ 1184 | μεξ 1185 | μεσν 1186 | μεσογ 1187 | μεσοπαθ 1188 | μεσοφ 1189 | μετάθ 1190 | μεταβτ 1191 | μεταβ 1192 | μετακ 1193 | μεταπλ 1194 | μεταπτωτ 1195 | μεταρ 1196 | μεταφορ 1197 | μετβ 1198 | μετεπιθ 1199 | μετεπιρρ 1200 | μετεωρολ 1201 | μετεωρ 1202 | μετον 1203 | μετουσ 1204 | μετοχ 1205 | μετρ 1206 | μετ 1207 | μητρων 1208 | μηχανολ 1209 | μηχ 1210 | μικροβιολ 1211 | μογγολ 1212 | μορφολ 1213 | μουσ 1214 | μπενελούξ 1215 | μσνλατ 
1216 | μσν 1217 | μτβ 1218 | μτγν 1219 | μτγ 1220 | μτφρδ 1221 | μτφρ 1222 | μτφ 1223 | μτχ 1224 | μυθ 1225 | μυκην 1226 | μυκ 1227 | μφ 1228 | μ 1229 | μ.ε 1230 | μ.μ 1231 | μ.π.ε 1232 | μ.π.π 1233 | μ0 1234 | ναυτ 1235 | νεοελλ 1236 | νεολατιν 1237 | νεολατ 1238 | νεολ 1239 | νεότ 1240 | νλατ 1241 | νομ 1242 | νορβ 1243 | νοσ 1244 | νότ 1245 | ν 1246 | ξ.λ 1247 | οικοδ 1248 | οικολ 1249 | οικον 1250 | οικ 1251 | ολλανδ 1252 | ολλ 1253 | ομηρ 1254 | ομόρρ 1255 | ονομ 1256 | ον 1257 | οπτ 1258 | ορθογρ 1259 | ορθ 1260 | οριστ 1261 | ορυκτολ 1262 | ορυκτ 1263 | ορ 1264 | οσετ 1265 | οσκ 1266 | ουαλ 1267 | ουγγρ 1268 | ουδ 1269 | ουσιαστικοπ 1270 | ουσιαστ 1271 | ουσ 1272 | πίν 1273 | παθητ 1274 | παθολ 1275 | παθ 1276 | παιδ 1277 | παλαιοντ 1278 | παλαιότ 1279 | παλ 1280 | παππων 1281 | παράγρ 1282 | παράγ 1283 | παράλλ 1284 | παράλ 1285 | παραγ 1286 | παρακ 1287 | παραλ 1288 | παραπ 1289 | παρατ 1290 | παρβ 1291 | παρετυμ 1292 | παροξ 1293 | παρων 1294 | παρωχ 1295 | παρ 1296 | παρ.φρ 1297 | πατριδων 1298 | πατρων 1299 | πβ 1300 | περιθ 1301 | περιλ 1302 | περιφρ 1303 | περσ 1304 | περ 1305 | πιθ 1306 | πληθ 1307 | πληροφ 1308 | ποδ 1309 | ποιητ 1310 | πολιτ 1311 | πολλαπλ 1312 | πολ 1313 | πορτογαλ 1314 | πορτ 1315 | ποσ 1316 | πρακριτ 1317 | πρβλ 1318 | πρβ 1319 | πργ 1320 | πρκμ 1321 | πρκ 1322 | πρλ 1323 | προέλ 1324 | προβηγκ 1325 | προελλ 1326 | προηγ 1327 | προθεμ 1328 | προπαραλ 1329 | προπαροξ 1330 | προπερισπ 1331 | προσαρμ 1332 | προσηγορ 1333 | προσταχτ 1334 | προστ 1335 | προσφών 1336 | προσ 1337 | προτακτ 1338 | προτ.Εισ 1339 | προφ 1340 | προχωρ 1341 | πρτ 1342 | πρόθ 1343 | πρόσθ 1344 | πρόσ 1345 | πρότ 1346 | πρ 1347 | πρ.Εφ 1348 | πτ 1349 | πυ 1350 | π 1351 | π.Χ 1352 | π.μ 1353 | π.χ 1354 | ρήμ 1355 | ρίζ 1356 | ρηματ 1357 | ρητορ 1358 | ριν 1359 | ρουμ 1360 | ρωμ 1361 | ρωσ 1362 | ρ 1363 | σανσκρ 1364 | σαξ 1365 | σελ 1366 | σερβοκρ 1367 | σερβ 1368 | σημασιολ 1369 | σημδ 1370 | σημειολ 1371 | σημερ 1372 | σημιτ 1373 | σημ 1374 | σκανδ 1375 | σκυθ 1376 | σκωπτ 1377 | σλαβ 1378 | σλοβ 1379 | σουηδ 1380 | σουμερ 1381 | σουπ 1382 | σπάν 1383 | σπανιότ 1384 | σπ 1385 | σσ 1386 | στατ 1387 | στερ 1388 | στιγμ 1389 | στιχ 1390 | στρέμ 1391 | στρατιωτ 1392 | στρατ 1393 | στ 1394 | συγγ 1395 | συγκρ 1396 | συγκ 1397 | συμπερ 1398 | συμπλεκτ 1399 | συμπλ 1400 | συμπροφ 1401 | συμφυρ 1402 | συμφ 1403 | συνήθ 1404 | συνίζ 1405 | συναίρ 1406 | συναισθ 1407 | συνδετ 1408 | συνδ 1409 | συνεκδ 1410 | συνηρ 1411 | συνθετ 1412 | συνθ 1413 | συνοπτ 1414 | συντελ 1415 | συντομογρ 1416 | συντ 1417 | συν 1418 | συρ 1419 | σχημ 1420 | σχ 1421 | σύγκρ 1422 | σύμπλ 1423 | σύμφ 1424 | σύνδ 1425 | σύνθ 1426 | σύντμ 1427 | σύντ 1428 | σ 1429 | σ.π 1430 | σ/β 1431 | τακτ 1432 | τελ 1433 | τετρ 1434 | τετρ.μ 1435 | τεχνλ 1436 | τεχνολ 1437 | τεχν 1438 | τεύχ 1439 | τηλεπικ 1440 | τηλεόρ 1441 | τιμ 1442 | τιμ.τομ 1443 | τοΣ 1444 | τον 1445 | τοπογρ 1446 | τοπων 1447 | τοπ 1448 | τοσκ 1449 | τουρκ 1450 | τοχ 1451 | τριτοπρόσ 1452 | τροποπ 1453 | τροπ 1454 | τσεχ 1455 | τσιγγ 1456 | ττ 1457 | τυπ 1458 | τόμ 1459 | τόνν 1460 | τ 1461 | τ.μ 1462 | τ.χλμ 1463 | υβρ 1464 | υπερθ 1465 | υπερσ 1466 | υπερ 1467 | υπεύθ 1468 | υποθ 1469 | υποκορ 1470 | υποκ 1471 | υποσημ 1472 | υποτ 1473 | υποφ 1474 | υποχωρ 1475 | υπόλ 1476 | υπόχρ 1477 | υπ 1478 | υστλατ 1479 | υψόμ 1480 | υψ 1481 | φάκ 1482 | φαρμακολ 1483 | φαρμ 1484 | φιλολ 1485 | φιλοσ 1486 | φιλοτ 1487 | φινλ 1488 | φοινικ 1489 | φράγκ 1490 | φρανκον 1491 | φριζ 1492 | φρ 1493 | φυλλ 1494 | φυσιολ 1495 | φυσ 1496 | φωνηεντ 1497 | φωνητ 1498 | φωνολ 
1499 | φων 1500 | φωτογρ 1501 | φ 1502 | φ.τ.μ 1503 | χαμιτ 1504 | χαρτόσ 1505 | χαρτ 1506 | χασμ 1507 | χαϊδ 1508 | χγφ 1509 | χειλ 1510 | χεττ 1511 | χημ 1512 | χιλ 1513 | χλγρ 1514 | χλγ 1515 | χλμ 1516 | χλμ.2 1517 | χλμ.3 1518 | χλσγρ 1519 | χλστγρ 1520 | χλστμ 1521 | χλστμ.2 1522 | χλστμ.3 1523 | χλ 1524 | χργρ 1525 | χρημ 1526 | χρον 1527 | χρ 1528 | χφ 1529 | χ.ε 1530 | χ.κ 1531 | χ.ο 1532 | χ.σ 1533 | χ.τ 1534 | χ.χ 1535 | ψευδ 1536 | ψυχαν 1537 | ψυχιατρ 1538 | ψυχολ 1539 | ψυχ 1540 | ωκεαν 1541 | όμ 1542 | όν 1543 | όπ.παρ 1544 | όπ.π 1545 | ό.π 1546 | ύψ 1547 | 1Βσ 1548 | 1Εσ 1549 | 1Θσ 1550 | 1Ιν 1551 | 1Κρ 1552 | 1Μκ 1553 | 1Πρ 1554 | 1Πτ 1555 | 1Τμ 1556 | 2Βσ 1557 | 2Εσ 1558 | 2Θσ 1559 | 2Ιν 1560 | 2Κρ 1561 | 2Μκ 1562 | 2Πρ 1563 | 2Πτ 1564 | 2Τμ 1565 | 3Βσ 1566 | 3Ιν 1567 | 3Μκ 1568 | 4Βσ 1569 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.en: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 34 | Adj 35 | Adm 36 | Adv 37 | Asst 38 | Bart 39 | Bldg 40 | Brig 41 | Bros 42 | Capt 43 | Cmdr 44 | Col 45 | Comdr 46 | Con 47 | Corp 48 | Cpl 49 | DR 50 | Dr 51 | Drs 52 | Ens 53 | Gen 54 | Gov 55 | Hon 56 | Hr 57 | Hosp 58 | Insp 59 | Lt 60 | MM 61 | MR 62 | MRS 63 | MS 64 | Maj 65 | Messrs 66 | Mlle 67 | Mme 68 | Mr 69 | Mrs 70 | Ms 71 | Msgr 72 | Op 73 | Ord 74 | Pfc 75 | Ph 76 | Prof 77 | Pvt 78 | Rep 79 | Reps 80 | Res 81 | Rev 82 | Rt 83 | Sen 84 | Sens 85 | Sfc 86 | Sgt 87 | Sr 88 | St 89 | Supt 90 | Surg 91 | 92 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 93 | v 94 | vs 95 | i.e 96 | rev 97 | e.g 98 | 99 | #Numbers only. These should only induce breaks when followed by a numeric sequence 100 | # add NUMERIC_ONLY after the word for this function 101 | #This case is mostly for the english "No." which can either be a sentence of its own, or 102 | #if followed by a number, a non-breaking prefix 103 | No #NUMERIC_ONLY# 104 | Nos 105 | Art #NUMERIC_ONLY# 106 | Nr 107 | pp #NUMERIC_ONLY# 108 | 109 | #month abbreviations 110 | Jan 111 | Feb 112 | Mar 113 | Apr 114 | #May is a full word 115 | Jun 116 | Jul 117 | Aug 118 | Sep 119 | Oct 120 | Nov 121 | Dec 122 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.es: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | # Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm 34 | 35 | A.C 36 | Apdo 37 | Av 38 | Bco 39 | CC.AA 40 | Da 41 | Dep 42 | Dn 43 | Dr 44 | Dra 45 | EE.UU 46 | Excmo 47 | FF.CC 48 | Fil 49 | Gral 50 | J.C 51 | Let 52 | Lic 53 | N.B 54 | P.D 55 | P.V.P 56 | Prof 57 | Pts 58 | Rte 59 | S.A 60 | S.A.R 61 | S.E 62 | S.L 63 | S.R.C 64 | Sr 65 | Sra 66 | Srta 67 | Sta 68 | Sto 69 | T.V.E 70 | Tel 71 | Ud 72 | Uds 73 | V.B 74 | V.E 75 | Vd 76 | Vds 77 | a/c 78 | adj 79 | admón 80 | afmo 81 | apdo 82 | av 83 | c 84 | c.f 85 | c.g 86 | cap 87 | cm 88 | cta 89 | dcha 90 | doc 91 | ej 92 | entlo 93 | esq 94 | etc 95 | f.c 96 | gr 97 | grs 98 | izq 99 | kg 100 | km 101 | mg 102 | mm 103 | núm 104 | núm 105 | p 106 | p.a 107 | p.ej 108 | ptas 109 | pág 110 | págs 111 | pág 112 | págs 113 | q.e.g.e 114 | q.e.s.m 115 | s 116 | s.s.s 117 | vid 118 | vol 119 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.fi: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT 2 | #indicate an end-of-sentence marker. Special cases are included for prefixes 3 | #that ONLY appear before 0-9 numbers. 4 | 5 | #This list is compiled from omorfi database 6 | #by Tommi A Pirinen. 7 | 8 | 9 | #any single upper case letter followed by a period is not a sentence ender 10 | A 11 | B 12 | C 13 | D 14 | E 15 | F 16 | G 17 | H 18 | I 19 | J 20 | K 21 | L 22 | M 23 | N 24 | O 25 | P 26 | Q 27 | R 28 | S 29 | T 30 | U 31 | V 32 | W 33 | X 34 | Y 35 | Z 36 | Å 37 | Ä 38 | Ö 39 | 40 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 41 | alik 42 | alil 43 | amir 44 | apul 45 | apul.prof 46 | arkkit 47 | ass 48 | assist 49 | dipl 50 | dipl.arkkit 51 | dipl.ekon 52 | dipl.ins 53 | dipl.kielenk 54 | dipl.kirjeenv 55 | dipl.kosm 56 | dipl.urk 57 | dos 58 | erikoiseläinl 59 | erikoishammasl 60 | erikoisl 61 | erikoist 62 | ev.luutn 63 | evp 64 | fil 65 | ft 66 | hallinton 67 | hallintot 68 | hammaslääket 69 | jatk 70 | jääk 71 | kansaned 72 | kapt 73 | kapt.luutn 74 | kenr 75 | kenr.luutn 76 | kenr.maj 77 | kers 78 | kirjeenv 79 | kom 80 | kom.kapt 81 | komm 82 | konst 83 | korpr 84 | luutn 85 | maist 86 | maj 87 | Mr 88 | Mrs 89 | Ms 90 | M.Sc 91 | neuv 92 | nimim 93 | Ph.D 94 | prof 95 | puh.joht 96 | pääll 97 | res 98 | san 99 | siht 100 | suom 101 | sähköp 102 | säv 103 | toht 104 | toim 105 | toim.apul 106 | toim.joht 107 | toim.siht 108 | tuom 109 | ups 110 | vänr 111 | vääp 112 | ye.ups 113 | ylik 114 | ylil 115 | ylim 116 | ylimatr 117 | yliop 118 | yliopp 119 | ylip 120 | yliv 121 | 122 | #misc - odd period-ending items that NEVER indicate breaks (p.m. 
does NOT fall 123 | #into this category - it sometimes ends a sentence) 124 | e.g 125 | ent 126 | esim 127 | huom 128 | i.e 129 | ilm 130 | l 131 | mm 132 | myöh 133 | nk 134 | nyk 135 | par 136 | po 137 | t 138 | v 139 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.fr: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | # 4 | #any single upper case letter followed by a period is not a sentence ender 5 | #usually upper case letters are initials in a name 6 | #no French words end in single lower-case letters, so we throw those in too? 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | # Period-final abbreviation list for French 61 | A.C.N 62 | A.M 63 | art 64 | ann 65 | apr 66 | av 67 | auj 68 | lib 69 | B.P 70 | boul 71 | ca 72 | c.-à-d 73 | cf 74 | ch.-l 75 | chap 76 | contr 77 | C.P.I 78 | C.Q.F.D 79 | C.N 80 | C.N.S 81 | C.S 82 | dir 83 | éd 84 | e.g 85 | env 86 | al 87 | etc 88 | E.V 89 | ex 90 | fasc 91 | fém 92 | fig 93 | fr 94 | hab 95 | ibid 96 | id 97 | i.e 98 | inf 99 | LL.AA 100 | LL.AA.II 101 | LL.AA.RR 102 | LL.AA.SS 103 | L.D 104 | LL.EE 105 | LL.MM 106 | LL.MM.II.RR 107 | loc.cit 108 | masc 109 | MM 110 | ms 111 | N.B 112 | N.D.A 113 | N.D.L.R 114 | N.D.T 115 | n/réf 116 | NN.SS 117 | N.S 118 | N.D 119 | N.P.A.I 120 | p.c.c 121 | pl 122 | pp 123 | p.ex 124 | p.j 125 | P.S 126 | R.A.S 127 | R.-V 128 | R.P 129 | R.I.P 130 | SS 131 | S.S 132 | S.A 133 | S.A.I 134 | S.A.R 135 | S.A.S 136 | S.E 137 | sec 138 | sect 139 | sing 140 | S.M 141 | S.M.I.R 142 | sq 143 | sqq 144 | suiv 145 | sup 146 | suppl 147 | tél 148 | T.S.V.P 149 | vb 150 | vol 151 | vs 152 | X.O 153 | Z.I 154 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.hu: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | Á 33 | É 34 | Í 35 | Ó 36 | Ö 37 | Ő 38 | Ú 39 | Ü 40 | Ű 41 | 42 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 43 | Dr 44 | dr 45 | kb 46 | Kb 47 | vö 48 | Vö 49 | pl 50 | Pl 51 | ca 52 | Ca 53 | min 54 | Min 55 | max 56 | Max 57 | ún 58 | Ún 59 | prof 60 | Prof 61 | de 62 | De 63 | du 64 | Du 65 | Szt 66 | St 67 | 68 | #Numbers only. 
These should only induce breaks when followed by a numeric sequence 69 | # add NUMERIC_ONLY after the word for this function 70 | #This case is mostly for the english "No." which can either be a sentence of its own, or 71 | #if followed by a number, a non-breaking prefix 72 | 73 | # Month name abbreviations 74 | jan #NUMERIC_ONLY# 75 | Jan #NUMERIC_ONLY# 76 | Feb #NUMERIC_ONLY# 77 | feb #NUMERIC_ONLY# 78 | márc #NUMERIC_ONLY# 79 | Márc #NUMERIC_ONLY# 80 | ápr #NUMERIC_ONLY# 81 | Ápr #NUMERIC_ONLY# 82 | máj #NUMERIC_ONLY# 83 | Máj #NUMERIC_ONLY# 84 | jún #NUMERIC_ONLY# 85 | Jún #NUMERIC_ONLY# 86 | Júl #NUMERIC_ONLY# 87 | júl #NUMERIC_ONLY# 88 | aug #NUMERIC_ONLY# 89 | Aug #NUMERIC_ONLY# 90 | Szept #NUMERIC_ONLY# 91 | szept #NUMERIC_ONLY# 92 | okt #NUMERIC_ONLY# 93 | Okt #NUMERIC_ONLY# 94 | nov #NUMERIC_ONLY# 95 | Nov #NUMERIC_ONLY# 96 | dec #NUMERIC_ONLY# 97 | Dec #NUMERIC_ONLY# 98 | 99 | # Other abbreviations 100 | tel #NUMERIC_ONLY# 101 | Tel #NUMERIC_ONLY# 102 | Fax #NUMERIC_ONLY# 103 | fax #NUMERIC_ONLY# 104 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.is: -------------------------------------------------------------------------------- 1 | no #NUMERIC_ONLY# 2 | No #NUMERIC_ONLY# 3 | nr #NUMERIC_ONLY# 4 | Nr #NUMERIC_ONLY# 5 | nR #NUMERIC_ONLY# 6 | NR #NUMERIC_ONLY# 7 | a 8 | b 9 | c 10 | d 11 | e 12 | f 13 | g 14 | h 15 | i 16 | j 17 | k 18 | l 19 | m 20 | n 21 | o 22 | p 23 | q 24 | r 25 | s 26 | t 27 | u 28 | v 29 | w 30 | x 31 | y 32 | z 33 | ^ 34 | í 35 | á 36 | ó 37 | æ 38 | A 39 | B 40 | C 41 | D 42 | E 43 | F 44 | G 45 | H 46 | I 47 | J 48 | K 49 | L 50 | M 51 | N 52 | O 53 | P 54 | Q 55 | R 56 | S 57 | T 58 | U 59 | V 60 | W 61 | X 62 | Y 63 | Z 64 | ab.fn 65 | a.fn 66 | afs 67 | al 68 | alm 69 | alg 70 | andh 71 | ath 72 | aths 73 | atr 74 | ao 75 | au 76 | aukaf 77 | áfn 78 | áhrl.s 79 | áhrs 80 | ákv.gr 81 | ákv 82 | bh 83 | bls 84 | dr 85 | e.Kr 86 | et 87 | ef 88 | efn 89 | ennfr 90 | eink 91 | end 92 | e.st 93 | erl 94 | fél 95 | fskj 96 | fh 97 | f.hl 98 | físl 99 | fl 100 | fn 101 | fo 102 | forl 103 | frb 104 | frl 105 | frh 106 | frt 107 | fsl 108 | fsh 109 | fs 110 | fsk 111 | fst 112 | f.Kr 113 | ft 114 | fv 115 | fyrrn 116 | fyrrv 117 | germ 118 | gm 119 | gr 120 | hdl 121 | hdr 122 | hf 123 | hl 124 | hlsk 125 | hljsk 126 | hljv 127 | hljóðv 128 | hr 129 | hv 130 | hvk 131 | holl 132 | Hos 133 | höf 134 | hk 135 | hrl 136 | ísl 137 | kaf 138 | kap 139 | Khöfn 140 | kk 141 | kg 142 | kk 143 | km 144 | kl 145 | klst 146 | kr 147 | kt 148 | kgúrsk 149 | kvk 150 | leturbr 151 | lh 152 | lh.nt 153 | lh.þt 154 | lo 155 | ltr 156 | mlja 157 | mljó 158 | millj 159 | mm 160 | mms 161 | m.fl 162 | miðm 163 | mgr 164 | mst 165 | mín 166 | nf 167 | nh 168 | nhm 169 | nl 170 | nk 171 | nmgr 172 | no 173 | núv 174 | nt 175 | o.áfr 176 | o.m.fl 177 | ohf 178 | o.fl 179 | o.s.frv 180 | ófn 181 | ób 182 | óákv.gr 183 | óákv 184 | pfn 185 | PR 186 | pr 187 | Ritstj 188 | Rvík 189 | Rvk 190 | samb 191 | samhlj 192 | samn 193 | samn 194 | sbr 195 | sek 196 | sérn 197 | sf 198 | sfn 199 | sh 200 | sfn 201 | sh 202 | s.hl 203 | sk 204 | skv 205 | sl 206 | sn 207 | so 208 | ss.us 209 | s.st 210 | samþ 211 | sbr 212 | shlj 213 | sign 214 | skál 215 | st 216 | st.s 217 | stk 218 | sþ 219 | teg 220 | tbl 221 | tfn 222 | tl 223 | tvíhlj 224 | tvt 225 | till 226 | to 227 | umr 228 | uh 229 | us 230 | uppl 231 | útg 232 | vb 233 | Vf 234 | vh 235 | vkf 236 | Vl 237 | vl 238 | vlf 239 | vmf 240 | 8vo 241 | vsk 
242 | vth 243 | þt 244 | þf 245 | þjs 246 | þgf 247 | þlt 248 | þolm 249 | þm 250 | þml 251 | þýð 252 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.it: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 34 | Adj 35 | Adm 36 | Adv 37 | Amn 38 | Arch 39 | Asst 40 | Avv 41 | Bart 42 | Bcc 43 | Bldg 44 | Brig 45 | Bros 46 | C.A.P 47 | C.P 48 | Capt 49 | Cc 50 | Cmdr 51 | Co 52 | Col 53 | Comdr 54 | Con 55 | Corp 56 | Cpl 57 | DR 58 | Dott 59 | Dr 60 | Drs 61 | Egr 62 | Ens 63 | Gen 64 | Geom 65 | Gov 66 | Hon 67 | Hosp 68 | Hr 69 | Id 70 | Ing 71 | Insp 72 | Lt 73 | MM 74 | MR 75 | MRS 76 | MS 77 | Maj 78 | Messrs 79 | Mlle 80 | Mme 81 | Mo 82 | Mons 83 | Mr 84 | Mrs 85 | Ms 86 | Msgr 87 | N.B 88 | Op 89 | Ord 90 | P.S 91 | P.T 92 | Pfc 93 | Ph 94 | Prof 95 | Pvt 96 | RP 97 | RSVP 98 | Rag 99 | Rep 100 | Reps 101 | Res 102 | Rev 103 | Rif 104 | Rt 105 | S.A 106 | S.B.F 107 | S.P.M 108 | S.p.A 109 | S.r.l 110 | Sen 111 | Sens 112 | Sfc 113 | Sgt 114 | Sig 115 | Sigg 116 | Soc 117 | Spett 118 | Sr 119 | St 120 | Supt 121 | Surg 122 | V.P 123 | 124 | # other 125 | a.c 126 | acc 127 | all 128 | banc 129 | c.a 130 | c.c.p 131 | c.m 132 | c.p 133 | c.s 134 | c.v 135 | corr 136 | dott 137 | e.p.c 138 | ecc 139 | es 140 | fatt 141 | gg 142 | int 143 | lett 144 | ogg 145 | on 146 | p.c 147 | p.c.c 148 | p.es 149 | p.f 150 | p.r 151 | p.v 152 | post 153 | pp 154 | racc 155 | ric 156 | s.n.c 157 | seg 158 | sgg 159 | ss 160 | tel 161 | u.s 162 | v.r 163 | v.s 164 | 165 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 166 | v 167 | vs 168 | i.e 169 | rev 170 | e.g 171 | 172 | #Numbers only. These should only induce breaks when followed by a numeric sequence 173 | # add NUMERIC_ONLY after the word for this function 174 | #This case is mostly for the english "No." which can either be a sentence of its own, or 175 | #if followed by a number, a non-breaking prefix 176 | No #NUMERIC_ONLY# 177 | Nos 178 | Art #NUMERIC_ONLY# 179 | Nr 180 | pp #NUMERIC_ONLY# 181 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.lv: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | Ā 8 | B 9 | C 10 | Č 11 | D 12 | E 13 | Ē 14 | F 15 | G 16 | Ģ 17 | H 18 | I 19 | Ī 20 | J 21 | K 22 | Ķ 23 | L 24 | Ļ 25 | M 26 | N 27 | Ņ 28 | O 29 | P 30 | Q 31 | R 32 | S 33 | Š 34 | T 35 | U 36 | Ū 37 | V 38 | W 39 | X 40 | Y 41 | Z 42 | Ž 43 | 44 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 45 | dr 46 | Dr 47 | med 48 | prof 49 | Prof 50 | inž 51 | Inž 52 | ist.loc 53 | Ist.loc 54 | kor.loc 55 | Kor.loc 56 | v.i 57 | vietn 58 | Vietn 59 | 60 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 61 | a.l 62 | t.p 63 | pārb 64 | Pārb 65 | vec 66 | Vec 67 | inv 68 | Inv 69 | sk 70 | Sk 71 | spec 72 | Spec 73 | vienk 74 | Vienk 75 | virz 76 | Virz 77 | māksl 78 | Māksl 79 | mūz 80 | Mūz 81 | akad 82 | Akad 83 | soc 84 | Soc 85 | galv 86 | Galv 87 | vad 88 | Vad 89 | sertif 90 | Sertif 91 | folkl 92 | Folkl 93 | hum 94 | Hum 95 | 96 | #Numbers only. These should only induce breaks when followed by a numeric sequence 97 | # add NUMERIC_ONLY after the word for this function 98 | #This case is mostly for the english "No." which can either be a sentence of its own, or 99 | #if followed by a number, a non-breaking prefix 100 | Nr #NUMERIC_ONLY# 101 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.nl: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | #Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen 4 | # http://nl.wikipedia.org/wiki/Aanspreekvorm 5 | # http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs 6 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 7 | #usually upper case letters are initials in a name 8 | A 9 | B 10 | C 11 | D 12 | E 13 | F 14 | G 15 | H 16 | I 17 | J 18 | K 19 | L 20 | M 21 | N 22 | O 23 | P 24 | Q 25 | R 26 | S 27 | T 28 | U 29 | V 30 | W 31 | X 32 | Y 33 | Z 34 | 35 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 36 | bacc 37 | bc 38 | bgen 39 | c.i 40 | dhr 41 | dr 42 | dr.h.c 43 | drs 44 | drs 45 | ds 46 | eint 47 | fa 48 | Fa 49 | fam 50 | gen 51 | genm 52 | ing 53 | ir 54 | jhr 55 | jkvr 56 | jr 57 | kand 58 | kol 59 | lgen 60 | lkol 61 | Lt 62 | maj 63 | Mej 64 | mevr 65 | Mme 66 | mr 67 | mr 68 | Mw 69 | o.b.s 70 | plv 71 | prof 72 | ritm 73 | tint 74 | Vz 75 | Z.D 76 | Z.D.H 77 | Z.E 78 | Z.Em 79 | Z.H 80 | Z.K.H 81 | Z.K.M 82 | Z.M 83 | z.v 84 | 85 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 86 | #we seem to have a lot of these in dutch i.e.: i.p.v - in plaats van (in stead of) never ends a sentence 87 | a.g.v 88 | bijv 89 | bijz 90 | bv 91 | d.w.z 92 | e.c 93 | e.g 94 | e.k 95 | ev 96 | i.p.v 97 | i.s.m 98 | i.t.t 99 | i.v.m 100 | m.a.w 101 | m.b.t 102 | m.b.v 103 | m.h.o 104 | m.i 105 | m.i.v 106 | v.w.t 107 | 108 | #Numbers only. 
These should only induce breaks when followed by a numeric sequence 109 | # add NUMERIC_ONLY after the word for this function 110 | #This case is mostly for the english "No." which can either be a sentence of its own, or 111 | #if followed by a number, a non-breaking prefix 112 | Nr #NUMERIC_ONLY# 113 | Nrs 114 | nrs 115 | nr #NUMERIC_ONLY# 116 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.pl: -------------------------------------------------------------------------------- 1 | adw 2 | afr 3 | akad 4 | al 5 | Al 6 | am 7 | amer 8 | arch 9 | art 10 | Art 11 | artyst 12 | astr 13 | austr 14 | bałt 15 | bdb 16 | bł 17 | bm 18 | br 19 | bryg 20 | bryt 21 | centr 22 | ces 23 | chem 24 | chiń 25 | chir 26 | c.k 27 | c.o 28 | cyg 29 | cyw 30 | cyt 31 | czes 32 | czw 33 | cd 34 | Cd 35 | czyt 36 | ćw 37 | ćwicz 38 | daw 39 | dcn 40 | dekl 41 | demokr 42 | det 43 | diec 44 | dł 45 | dn 46 | dot 47 | dol 48 | dop 49 | dost 50 | dosł 51 | h.c 52 | ds 53 | dst 54 | duszp 55 | dypl 56 | egz 57 | ekol 58 | ekon 59 | elektr 60 | em 61 | ew 62 | fab 63 | farm 64 | fot 65 | fr 66 | gat 67 | gastr 68 | geogr 69 | geol 70 | gimn 71 | głęb 72 | gm 73 | godz 74 | górn 75 | gosp 76 | gr 77 | gram 78 | hist 79 | hiszp 80 | hr 81 | Hr 82 | hot 83 | id 84 | in 85 | im 86 | iron 87 | jn 88 | kard 89 | kat 90 | katol 91 | k.k 92 | kk 93 | kol 94 | kl 95 | k.p.a 96 | kpc 97 | k.p.c 98 | kpt 99 | kr 100 | k.r 101 | krak 102 | k.r.o 103 | kryt 104 | kult 105 | laic 106 | łac 107 | niem 108 | woj 109 | nb 110 | np 111 | Nb 112 | Np 113 | pol 114 | pow 115 | m.in 116 | pt 117 | ps 118 | Pt 119 | Ps 120 | cdn 121 | jw 122 | ryc 123 | rys 124 | Ryc 125 | Rys 126 | tj 127 | tzw 128 | Tzw 129 | tzn 130 | zob 131 | ang 132 | ub 133 | ul 134 | pw 135 | pn 136 | pl 137 | al 138 | k 139 | n 140 | nr #NUMERIC_ONLY# 141 | Nr #NUMERIC_ONLY# 142 | ww 143 | wł 144 | ur 145 | zm 146 | żyd 147 | żarg 148 | żyw 149 | wył 150 | bp 151 | bp 152 | wyst 153 | tow 154 | Tow 155 | o 156 | sp 157 | Sp 158 | st 159 | spółdz 160 | Spółdz 161 | społ 162 | spółgł 163 | stoł 164 | stow 165 | Stoł 166 | Stow 167 | zn 168 | zew 169 | zewn 170 | zdr 171 | zazw 172 | zast 173 | zaw 174 | zał 175 | zal 176 | zam 177 | zak 178 | zakł 179 | zagr 180 | zach 181 | adw 182 | Adw 183 | lek 184 | Lek 185 | med 186 | mec 187 | Mec 188 | doc 189 | Doc 190 | dyw 191 | dyr 192 | Dyw 193 | Dyr 194 | inż 195 | Inż 196 | mgr 197 | Mgr 198 | dh 199 | dr 200 | Dh 201 | Dr 202 | p 203 | P 204 | red 205 | Red 206 | prof 207 | prok 208 | Prof 209 | Prok 210 | hab 211 | płk 212 | Płk 213 | nadkom 214 | Nadkom 215 | podkom 216 | Podkom 217 | ks 218 | Ks 219 | gen 220 | Gen 221 | por 222 | Por 223 | reż 224 | Reż 225 | przyp 226 | Przyp 227 | śp 228 | św 229 | śW 230 | Śp 231 | Św 232 | ŚW 233 | szer 234 | Szer 235 | pkt #NUMERIC_ONLY# 236 | str #NUMERIC_ONLY# 237 | tab #NUMERIC_ONLY# 238 | Tab #NUMERIC_ONLY# 239 | tel 240 | ust #NUMERIC_ONLY# 241 | par #NUMERIC_ONLY# 242 | poz 243 | pok 244 | oo 245 | oO 246 | Oo 247 | OO 248 | r #NUMERIC_ONLY# 249 | l #NUMERIC_ONLY# 250 | s #NUMERIC_ONLY# 251 | najśw 252 | Najśw 253 | A 254 | B 255 | C 256 | D 257 | E 258 | F 259 | G 260 | H 261 | I 262 | J 263 | K 264 | L 265 | M 266 | N 267 | O 268 | P 269 | Q 270 | R 271 | S 272 | T 273 | U 274 | V 275 | W 276 | X 277 | Y 278 | Z 279 | Ś 280 | Ć 281 | Ż 282 | Ź 283 | Dz 284 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.pt: 
-------------------------------------------------------------------------------- 1 | #File adapted for PT by H. Leal Fontes from the EN & DE versions published with moses-2009-04-13. Last update: 10.11.2009. 2 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 3 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 4 | 5 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 6 | #usually upper case letters are initials in a name 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | 61 | #Roman Numerals. A dot after one of these is not a sentence break in Portuguese. 62 | I 63 | II 64 | III 65 | IV 66 | V 67 | VI 68 | VII 69 | VIII 70 | IX 71 | X 72 | XI 73 | XII 74 | XIII 75 | XIV 76 | XV 77 | XVI 78 | XVII 79 | XVIII 80 | XIX 81 | XX 82 | i 83 | ii 84 | iii 85 | iv 86 | v 87 | vi 88 | vii 89 | viii 90 | ix 91 | x 92 | xi 93 | xii 94 | xiii 95 | xiv 96 | xv 97 | xvi 98 | xvii 99 | xviii 100 | xix 101 | xx 102 | 103 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 104 | Adj 105 | Adm 106 | Adv 107 | Art 108 | Ca 109 | Capt 110 | Cmdr 111 | Col 112 | Comdr 113 | Con 114 | Corp 115 | Cpl 116 | DR 117 | DRA 118 | Dr 119 | Dra 120 | Dras 121 | Drs 122 | Eng 123 | Enga 124 | Engas 125 | Engos 126 | Ex 127 | Exo 128 | Exmo 129 | Fig 130 | Gen 131 | Hosp 132 | Insp 133 | Lda 134 | MM 135 | MR 136 | MRS 137 | MS 138 | Maj 139 | Mrs 140 | Ms 141 | Msgr 142 | Op 143 | Ord 144 | Pfc 145 | Ph 146 | Prof 147 | Pvt 148 | Rep 149 | Reps 150 | Res 151 | Rev 152 | Rt 153 | Sen 154 | Sens 155 | Sfc 156 | Sgt 157 | Sr 158 | Sra 159 | Sras 160 | Srs 161 | Sto 162 | Supt 163 | Surg 164 | adj 165 | adm 166 | adv 167 | art 168 | cit 169 | col 170 | con 171 | corp 172 | cpl 173 | dr 174 | dra 175 | dras 176 | drs 177 | eng 178 | enga 179 | engas 180 | engos 181 | ex 182 | exo 183 | exmo 184 | fig 185 | op 186 | prof 187 | sr 188 | sra 189 | sras 190 | srs 191 | sto 192 | 193 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 194 | v 195 | vs 196 | i.e 197 | rev 198 | e.g 199 | 200 | #Numbers only. These should only induce breaks when followed by a numeric sequence 201 | # add NUMERIC_ONLY after the word for this function 202 | #This case is mostly for the english "No." 
which can either be a sentence of its own, or 203 | #if followed by a number, a non-breaking prefix 204 | No #NUMERIC_ONLY# 205 | Nos 206 | Art #NUMERIC_ONLY# 207 | Nr 208 | p #NUMERIC_ONLY# 209 | pp #NUMERIC_ONLY# 210 | 211 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.ro: -------------------------------------------------------------------------------- 1 | A 2 | B 3 | C 4 | D 5 | E 6 | F 7 | G 8 | H 9 | I 10 | J 11 | K 12 | L 13 | M 14 | N 15 | O 16 | P 17 | Q 18 | R 19 | S 20 | T 21 | U 22 | V 23 | W 24 | X 25 | Y 26 | Z 27 | dpdv 28 | etc 29 | șamd 30 | M.Ap.N 31 | dl 32 | Dl 33 | d-na 34 | D-na 35 | dvs 36 | Dvs 37 | pt 38 | Pt 39 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.ru: -------------------------------------------------------------------------------- 1 | # added Cyrillic uppercase letters [А-Я] 2 | # removed 000D carriage return (this is not removed by chomp in tokenizer.perl, and prevents recognition of the prefixes) 3 | # edited by Kate Young (nspaceanalysis@earthlink.net) 21 May 2013 4 | А 5 | Б 6 | В 7 | Г 8 | Д 9 | Е 10 | Ж 11 | З 12 | И 13 | Й 14 | К 15 | Л 16 | М 17 | Н 18 | О 19 | П 20 | Р 21 | С 22 | Т 23 | У 24 | Ф 25 | Х 26 | Ц 27 | Ч 28 | Ш 29 | Щ 30 | Ъ 31 | Ы 32 | Ь 33 | Э 34 | Ю 35 | Я 36 | A 37 | B 38 | C 39 | D 40 | E 41 | F 42 | G 43 | H 44 | I 45 | J 46 | K 47 | L 48 | M 49 | N 50 | O 51 | P 52 | Q 53 | R 54 | S 55 | T 56 | U 57 | V 58 | W 59 | X 60 | Y 61 | Z 62 | 0гг 63 | 1гг 64 | 2гг 65 | 3гг 66 | 4гг 67 | 5гг 68 | 6гг 69 | 7гг 70 | 8гг 71 | 9гг 72 | 0г 73 | 1г 74 | 2г 75 | 3г 76 | 4г 77 | 5г 78 | 6г 79 | 7г 80 | 8г 81 | 9г 82 | Xвв 83 | Vвв 84 | Iвв 85 | Lвв 86 | Mвв 87 | Cвв 88 | Xв 89 | Vв 90 | Iв 91 | Lв 92 | Mв 93 | Cв 94 | 0м 95 | 1м 96 | 2м 97 | 3м 98 | 4м 99 | 5м 100 | 6м 101 | 7м 102 | 8м 103 | 9м 104 | 0мм 105 | 1мм 106 | 2мм 107 | 3мм 108 | 4мм 109 | 5мм 110 | 6мм 111 | 7мм 112 | 8мм 113 | 9мм 114 | 0см 115 | 1см 116 | 2см 117 | 3см 118 | 4см 119 | 5см 120 | 6см 121 | 7см 122 | 8см 123 | 9см 124 | 0дм 125 | 1дм 126 | 2дм 127 | 3дм 128 | 4дм 129 | 5дм 130 | 6дм 131 | 7дм 132 | 8дм 133 | 9дм 134 | 0л 135 | 1л 136 | 2л 137 | 3л 138 | 4л 139 | 5л 140 | 6л 141 | 7л 142 | 8л 143 | 9л 144 | 0км 145 | 1км 146 | 2км 147 | 3км 148 | 4км 149 | 5км 150 | 6км 151 | 7км 152 | 8км 153 | 9км 154 | 0га 155 | 1га 156 | 2га 157 | 3га 158 | 4га 159 | 5га 160 | 6га 161 | 7га 162 | 8га 163 | 9га 164 | 0кг 165 | 1кг 166 | 2кг 167 | 3кг 168 | 4кг 169 | 5кг 170 | 6кг 171 | 7кг 172 | 8кг 173 | 9кг 174 | 0т 175 | 1т 176 | 2т 177 | 3т 178 | 4т 179 | 5т 180 | 6т 181 | 7т 182 | 8т 183 | 9т 184 | 0г 185 | 1г 186 | 2г 187 | 3г 188 | 4г 189 | 5г 190 | 6г 191 | 7г 192 | 8г 193 | 9г 194 | 0мг 195 | 1мг 196 | 2мг 197 | 3мг 198 | 4мг 199 | 5мг 200 | 6мг 201 | 7мг 202 | 8мг 203 | 9мг 204 | бульв 205 | в 206 | вв 207 | г 208 | га 209 | гг 210 | гл 211 | гос 212 | д 213 | дм 214 | доп 215 | др 216 | е 217 | ед 218 | ед 219 | зам 220 | и 221 | инд 222 | исп 223 | Исп 224 | к 225 | кап 226 | кг 227 | кв 228 | кл 229 | км 230 | кол 231 | комн 232 | коп 233 | куб 234 | л 235 | лиц 236 | лл 237 | м 238 | макс 239 | мг 240 | мин 241 | мл 242 | млн 243 | млрд 244 | мм 245 | н 246 | наб 247 | нач 248 | неуд 249 | ном 250 | о 251 | обл 252 | обр 253 | общ 254 | ок 255 | ост 256 | отл 257 | п 258 | пер 259 | перераб 260 | пл 261 | пос 262 | пр 263 | просп 264 | проф 265 | р 266 | ред 267 | руб 268 | с 269 | сб 270 | св 271 | см 
272 | соч 273 | ср 274 | ст 275 | стр 276 | т 277 | тел 278 | Тел 279 | тех 280 | тт 281 | туп 282 | тыс 283 | уд 284 | ул 285 | уч 286 | физ 287 | х 288 | хор 289 | ч 290 | чел 291 | шт 292 | экз 293 | э 294 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.sk: -------------------------------------------------------------------------------- 1 | Bc 2 | Mgr 3 | RNDr 4 | PharmDr 5 | PhDr 6 | JUDr 7 | PaedDr 8 | ThDr 9 | Ing 10 | MUDr 11 | MDDr 12 | MVDr 13 | Dr 14 | ThLic 15 | PhD 16 | ArtD 17 | ThDr 18 | Dr 19 | DrSc 20 | CSs 21 | prof 22 | obr 23 | Obr 24 | Č 25 | č 26 | absol 27 | adj 28 | admin 29 | adr 30 | Adr 31 | adv 32 | advok 33 | afr 34 | ak 35 | akad 36 | akc 37 | akuz 38 | et 39 | al 40 | alch 41 | amer 42 | anat 43 | angl 44 | Angl 45 | anglosas 46 | anorg 47 | ap 48 | apod 49 | arch 50 | archeol 51 | archit 52 | arg 53 | art 54 | astr 55 | astrol 56 | astron 57 | atp 58 | atď 59 | austr 60 | Austr 61 | aut 62 | belg 63 | Belg 64 | bibl 65 | Bibl 66 | biol 67 | bot 68 | bud 69 | bás 70 | býv 71 | cest 72 | chem 73 | cirk 74 | csl 75 | čs 76 | Čs 77 | dat 78 | dep 79 | det 80 | dial 81 | diaľ 82 | dipl 83 | distrib 84 | dokl 85 | dosl 86 | dopr 87 | dram 88 | duš 89 | dv 90 | dvojčl 91 | dór 92 | ekol 93 | ekon 94 | el 95 | elektr 96 | elektrotech 97 | energet 98 | epic 99 | est 100 | etc 101 | etonym 102 | eufem 103 | európ 104 | Európ 105 | ev 106 | evid 107 | expr 108 | fa 109 | fam 110 | farm 111 | fem 112 | feud 113 | fil 114 | filat 115 | filoz 116 | fi 117 | fon 118 | form 119 | fot 120 | fr 121 | Fr 122 | franc 123 | Franc 124 | fraz 125 | fut 126 | fyz 127 | fyziol 128 | garb 129 | gen 130 | genet 131 | genpor 132 | geod 133 | geogr 134 | geol 135 | geom 136 | germ 137 | gr 138 | Gr 139 | gréc 140 | Gréc 141 | gréckokat 142 | hebr 143 | herald 144 | hist 145 | hlav 146 | hosp 147 | hromad 148 | hud 149 | hypok 150 | ident 151 | i.e 152 | ident 153 | imp 154 | impf 155 | indoeur 156 | inf 157 | inform 158 | instr 159 | int 160 | interj 161 | inšt 162 | inštr 163 | iron 164 | jap 165 | Jap 166 | jaz 167 | jedn 168 | juhoamer 169 | juhových 170 | juhozáp 171 | juž 172 | kanad 173 | Kanad 174 | kanc 175 | kapit 176 | kpt 177 | kart 178 | katastr 179 | knih 180 | kniž 181 | komp 182 | konj 183 | konkr 184 | kozmet 185 | krajč 186 | kresť 187 | kt 188 | kuch 189 | lat 190 | latinskoamer 191 | lek 192 | lex 193 | lingv 194 | lit 195 | litur 196 | log 197 | lok 198 | max 199 | Max 200 | maď 201 | Maď 202 | medzinár 203 | mest 204 | metr 205 | mil 206 | Mil 207 | min 208 | Min 209 | miner 210 | ml 211 | mld 212 | mn 213 | mod 214 | mytol 215 | napr 216 | nar 217 | Nar 218 | nasl 219 | nedok 220 | neg 221 | negat 222 | neklas 223 | nem 224 | Nem 225 | neodb 226 | neos 227 | neskl 228 | nesklon 229 | nespis 230 | nespráv 231 | neved 232 | než 233 | niekt 234 | niž 235 | nom 236 | náb 237 | nákl 238 | námor 239 | nár 240 | obch 241 | obj 242 | obv 243 | obyč 244 | obč 245 | občian 246 | odb 247 | odd 248 | ods 249 | ojed 250 | okr 251 | Okr 252 | opt 253 | opyt 254 | org 255 | os 256 | osob 257 | ot 258 | ovoc 259 | par 260 | part 261 | pejor 262 | pers 263 | pf 264 | Pf 265 | P.f 266 | p.f 267 | pl 268 | Plk 269 | pod 270 | podst 271 | pokl 272 | polit 273 | politol 274 | polygr 275 | pomn 276 | popl 277 | por 278 | porad 279 | porov 280 | posch 281 | potrav 282 | použ 283 | poz 284 | pozit 285 | poľ 286 | poľno 287 | poľnohosp 288 | poľov 289 | pošt 290 | pož 291 | prac 292 | predl 293 | pren 294 | 
prep 295 | preuk 296 | priezv 297 | Priezv 298 | privl 299 | prof 300 | práv 301 | príd 302 | príj 303 | prík 304 | príp 305 | prír 306 | prísl 307 | príslov 308 | príč 309 | psych 310 | publ 311 | pís 312 | písm 313 | pôv 314 | refl 315 | reg 316 | rep 317 | resp 318 | rozk 319 | rozlič 320 | rozpráv 321 | roč 322 | Roč 323 | ryb 324 | rádiotech 325 | rím 326 | samohl 327 | semest 328 | sev 329 | severoamer 330 | severových 331 | severozáp 332 | sg 333 | skr 334 | skup 335 | sl 336 | Sloven 337 | soc 338 | soch 339 | sociol 340 | sp 341 | spol 342 | Spol 343 | spoloč 344 | spoluhl 345 | správ 346 | spôs 347 | st 348 | star 349 | starogréc 350 | starorím 351 | s.r.o 352 | stol 353 | stor 354 | str 355 | stredoamer 356 | stredoškol 357 | subj 358 | subst 359 | superl 360 | sv 361 | sz 362 | súkr 363 | súp 364 | súvzť 365 | tal 366 | Tal 367 | tech 368 | tel 369 | Tel 370 | telef 371 | teles 372 | telev 373 | teol 374 | trans 375 | turist 376 | tuzem 377 | typogr 378 | tzn 379 | tzv 380 | ukaz 381 | ul 382 | Ul 383 | umel 384 | univ 385 | ust 386 | ved 387 | vedľ 388 | verb 389 | veter 390 | vin 391 | viď 392 | vl 393 | vod 394 | vodohosp 395 | pnl 396 | vulg 397 | vyj 398 | vys 399 | vysokoškol 400 | vzťaž 401 | vôb 402 | vých 403 | výd 404 | výrob 405 | výsk 406 | výsl 407 | výtv 408 | výtvar 409 | význ 410 | včel 411 | vš 412 | všeob 413 | zahr 414 | zar 415 | zariad 416 | zast 417 | zastar 418 | zastaráv 419 | zb 420 | zdravot 421 | združ 422 | zjemn 423 | zlat 424 | zn 425 | Zn 426 | zool 427 | zr 428 | zried 429 | zv 430 | záhr 431 | zák 432 | zákl 433 | zám 434 | záp 435 | západoeur 436 | zázn 437 | územ 438 | účt 439 | čast 440 | čes 441 | Čes 442 | čl 443 | čísl 444 | živ 445 | pr 446 | fak 447 | Kr 448 | p.n.l 449 | A 450 | B 451 | C 452 | D 453 | E 454 | F 455 | G 456 | H 457 | I 458 | J 459 | K 460 | L 461 | M 462 | N 463 | O 464 | P 465 | Q 466 | R 467 | S 468 | T 469 | U 470 | V 471 | W 472 | X 473 | Y 474 | Z 475 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.sl: -------------------------------------------------------------------------------- 1 | dr 2 | Dr 3 | itd 4 | itn 5 | št #NUMERIC_ONLY# 6 | Št #NUMERIC_ONLY# 7 | d 8 | jan 9 | Jan 10 | feb 11 | Feb 12 | mar 13 | Mar 14 | apr 15 | Apr 16 | jun 17 | Jun 18 | jul 19 | Jul 20 | avg 21 | Avg 22 | sept 23 | Sept 24 | sep 25 | Sep 26 | okt 27 | Okt 28 | nov 29 | Nov 30 | dec 31 | Dec 32 | tj 33 | Tj 34 | npr 35 | Npr 36 | sl 37 | Sl 38 | op 39 | Op 40 | gl 41 | Gl 42 | oz 43 | Oz 44 | prev 45 | dipl 46 | ing 47 | prim 48 | Prim 49 | cf 50 | Cf 51 | gl 52 | Gl 53 | A 54 | B 55 | C 56 | D 57 | E 58 | F 59 | G 60 | H 61 | I 62 | J 63 | K 64 | L 65 | M 66 | N 67 | O 68 | P 69 | Q 70 | R 71 | S 72 | T 73 | U 74 | V 75 | W 76 | X 77 | Y 78 | Z 79 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.sv: -------------------------------------------------------------------------------- 1 | #single upper case letter are usually initials 2 | A 3 | B 4 | C 5 | D 6 | E 7 | F 8 | G 9 | H 10 | I 11 | J 12 | K 13 | L 14 | M 15 | N 16 | O 17 | P 18 | Q 19 | R 20 | S 21 | T 22 | U 23 | V 24 | W 25 | X 26 | Y 27 | Z 28 | #misc abbreviations 29 | AB 30 | G 31 | VG 32 | dvs 33 | etc 34 | from 35 | iaf 36 | jfr 37 | kl 38 | kr 39 | mao 40 | mfl 41 | mm 42 | osv 43 | pga 44 | tex 45 | tom 46 | vs 47 | -------------------------------------------------------------------------------- 
/data/nonbreaking_prefixes/nonbreaking_prefix.ta: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | அ 7 | ஆ 8 | இ 9 | ஈ 10 | உ 11 | ஊ 12 | எ 13 | ஏ 14 | ஐ 15 | ஒ 16 | ஓ 17 | ஔ 18 | ஃ 19 | க 20 | கா 21 | கி 22 | கீ 23 | கு 24 | கூ 25 | கெ 26 | கே 27 | கை 28 | கொ 29 | கோ 30 | கௌ 31 | க் 32 | ச 33 | சா 34 | சி 35 | சீ 36 | சு 37 | சூ 38 | செ 39 | சே 40 | சை 41 | சொ 42 | சோ 43 | சௌ 44 | ச் 45 | ட 46 | டா 47 | டி 48 | டீ 49 | டு 50 | டூ 51 | டெ 52 | டே 53 | டை 54 | டொ 55 | டோ 56 | டௌ 57 | ட் 58 | த 59 | தா 60 | தி 61 | தீ 62 | து 63 | தூ 64 | தெ 65 | தே 66 | தை 67 | தொ 68 | தோ 69 | தௌ 70 | த் 71 | ப 72 | பா 73 | பி 74 | பீ 75 | பு 76 | பூ 77 | பெ 78 | பே 79 | பை 80 | பொ 81 | போ 82 | பௌ 83 | ப் 84 | ற 85 | றா 86 | றி 87 | றீ 88 | று 89 | றூ 90 | றெ 91 | றே 92 | றை 93 | றொ 94 | றோ 95 | றௌ 96 | ற் 97 | ய 98 | யா 99 | யி 100 | யீ 101 | யு 102 | யூ 103 | யெ 104 | யே 105 | யை 106 | யொ 107 | யோ 108 | யௌ 109 | ய் 110 | ர 111 | ரா 112 | ரி 113 | ரீ 114 | ரு 115 | ரூ 116 | ரெ 117 | ரே 118 | ரை 119 | ரொ 120 | ரோ 121 | ரௌ 122 | ர் 123 | ல 124 | லா 125 | லி 126 | லீ 127 | லு 128 | லூ 129 | லெ 130 | லே 131 | லை 132 | லொ 133 | லோ 134 | லௌ 135 | ல் 136 | வ 137 | வா 138 | வி 139 | வீ 140 | வு 141 | வூ 142 | வெ 143 | வே 144 | வை 145 | வொ 146 | வோ 147 | வௌ 148 | வ் 149 | ள 150 | ளா 151 | ளி 152 | ளீ 153 | ளு 154 | ளூ 155 | ளெ 156 | ளே 157 | ளை 158 | ளொ 159 | ளோ 160 | ளௌ 161 | ள் 162 | ழ 163 | ழா 164 | ழி 165 | ழீ 166 | ழு 167 | ழூ 168 | ழெ 169 | ழே 170 | ழை 171 | ழொ 172 | ழோ 173 | ழௌ 174 | ழ் 175 | ங 176 | ஙா 177 | ஙி 178 | ஙீ 179 | ஙு 180 | ஙூ 181 | ஙெ 182 | ஙே 183 | ஙை 184 | ஙொ 185 | ஙோ 186 | ஙௌ 187 | ங் 188 | ஞ 189 | ஞா 190 | ஞி 191 | ஞீ 192 | ஞு 193 | ஞூ 194 | ஞெ 195 | ஞே 196 | ஞை 197 | ஞொ 198 | ஞோ 199 | ஞௌ 200 | ஞ் 201 | ண 202 | ணா 203 | ணி 204 | ணீ 205 | ணு 206 | ணூ 207 | ணெ 208 | ணே 209 | ணை 210 | ணொ 211 | ணோ 212 | ணௌ 213 | ண் 214 | ந 215 | நா 216 | நி 217 | நீ 218 | நு 219 | நூ 220 | நெ 221 | நே 222 | நை 223 | நொ 224 | நோ 225 | நௌ 226 | ந் 227 | ம 228 | மா 229 | மி 230 | மீ 231 | மு 232 | மூ 233 | மெ 234 | மே 235 | மை 236 | மொ 237 | மோ 238 | மௌ 239 | ம் 240 | ன 241 | னா 242 | னி 243 | னீ 244 | னு 245 | னூ 246 | னெ 247 | னே 248 | னை 249 | னொ 250 | னோ 251 | னௌ 252 | ன் 253 | 254 | 255 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 256 | திரு 257 | திருமதி 258 | வண 259 | கௌரவ 260 | 261 | 262 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 263 | உ.ம் 264 | #கா.ம் 265 | #எ.ம் 266 | 267 | 268 | #Numbers only. These should only induce breaks when followed by a numeric sequence 269 | # add NUMERIC_ONLY after the word for this function 270 | #This case is mostly for the english "No." 
which can either be a sentence of its own, or 271 | #if followed by a number, a non-breaking prefix 272 | No #NUMERIC_ONLY# 273 | Nos 274 | Art #NUMERIC_ONLY# 275 | Nr 276 | pp #NUMERIC_ONLY# 277 | -------------------------------------------------------------------------------- /data/preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script preprocesses bitext with Byte Pair Encoding for NMT. 3 | # Executes the following steps: 4 | # 1. Tokenize source and target side of bitext 5 | # 2. Learn BPE-codes for both source and target side 6 | # 3. Encode source and target side using the codes learned 7 | # 4. Shuffle bitext for SGD 8 | # 5. Build source and target dictionaries 9 | 10 | if [ "$#" -ne 4 ]; then 11 | echo "" 12 | echo "Usage: $0 src trg path_to_data path_to_subword" 13 | echo "" 14 | exit 1 15 | fi 16 | 17 | if [ -z $PYTHON ]; then 18 | if [ -n `which python3` ]; then 19 | export PYTHON=python3 20 | else 21 | if [ -n `which python2` ]; then 22 | export PYTHON=python2 23 | else 24 | if [ -n `which python` ]; then 25 | export PYTHON=python 26 | fi 27 | fi 28 | fi 29 | fi 30 | 31 | if [ -z $PYTHON ]; then 32 | echo "Please set PYTHON to a Python interpreter" 33 | exit 1 34 | fi 35 | 36 | echo "Using $PYTHON" 37 | 38 | # number of merge ops (codes) for bpe 39 | SRC_CODE_SIZE=20000 40 | TRG_CODE_SIZE=20000 41 | 42 | # source language (example: fr) 43 | S=$1 44 | # target language (example: en) 45 | T=$2 46 | 47 | # path to dl4mt/data 48 | P1=$3 49 | 50 | # path to subword NMT scripts (can be downloaded from https://github.com/rsennrich/subword-nmt) 51 | P2=$4 52 | 53 | 54 | # merge all parallel corpora 55 | ./merge.sh $1 $2 $3 56 | 57 | # tokenize training and validation data 58 | perl $P1/tokenizer.perl -threads 5 -l $S < ${P1}/all_${S}-${T}.${S} > ${P1}/all_${S}-${T}.${S}.tok 59 | perl $P1/tokenizer.perl -threads 5 -l $T < ${P1}/all_${S}-${T}.${T} > ${P1}/all_${S}-${T}.${T}.tok 60 | perl $P1/tokenizer.perl -threads 5 -l $S < ${P1}/test2011/newstest2011.${S} > ${P1}/newstest2011.${S}.tok 61 | perl $P1/tokenizer.perl -threads 5 -l $T < ${P1}/test2011/newstest2011.${T} > ${P1}/newstest2011.${T}.tok 62 | 63 | # BPE 64 | if [ ! -f "${S}.bpe" ]; then 65 | $PYTHON $P2/learn_bpe.py -s ${SRC_CODE_SIZE} < all_${S}-${T}.${S}.tok > ${S}.bpe 66 | fi 67 | if [ ! -f "${T}.bpe" ]; then 68 | $PYTHON $P2/learn_bpe.py -s ${TRG_CODE_SIZE} < all_${S}-${T}.${T}.tok > ${T}.bpe 69 | fi 70 | 71 | # utility function to encode a file with bpe 72 | encode () { 73 | if [ ! 
-f "$3" ]; then 74 | $PYTHON $P2/apply_bpe.py -c $1 < $2 > $3 75 | else 76 | echo "$3 exists, pass" 77 | fi 78 | } 79 | 80 | # apply bpe to training data 81 | encode ${S}.bpe ${P1}/all_${S}-${T}.${S}.tok ${P1}/all_${S}-${T}.${S}.tok.bpe 82 | encode ${T}.bpe ${P1}/all_${S}-${T}.${T}.tok ${P1}/all_${S}-${T}.${T}.tok.bpe 83 | encode ${S}.bpe ${P1}/newstest2011.${S}.tok ${P1}/newstest2011.${S}.tok.bpe 84 | encode ${T}.bpe ${P1}/newstest2011.${T}.tok ${P1}/newstest2011.${T}.tok.bpe 85 | 86 | # shuffle 87 | $PYTHON $P1/shuffle.py all_${S}-${T}.${S}.tok.bpe all_${S}-${T}.${T}.tok.bpe 88 | 89 | # build dictionary 90 | $PYTHON $P1/build_dictionary.py all_${S}-${T}.${S}.tok.bpe 91 | $PYTHON $P1/build_dictionary.py all_${S}-${T}.${T}.tok.bpe 92 | 93 | -------------------------------------------------------------------------------- /data/scan_example.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy 4 | import theano 5 | 6 | from theano import tensor 7 | 8 | 9 | # some numbers 10 | n_steps = 10 11 | n_samples = 5 12 | dim = 10 13 | input_dim = 20 14 | output_dim = 2 15 | 16 | 17 | # one step function that will be used by scan 18 | def oneStep(x_t, h_tm1, W_x, W_h, W_o): 19 | 20 | h_t = tensor.tanh(tensor.dot(x_t, W_x) + 21 | tensor.dot(h_tm1, W_h)) 22 | o_t = tensor.dot(h_t, W_o) 23 | 24 | return h_t, o_t 25 | 26 | # spawn theano tensor variable, our symbolic input 27 | # a 3D tensor (n_steps, n_samples, dim) 28 | x = tensor.tensor3(dtype='float32') 29 | 30 | # initial state of our rnn 31 | init_state = tensor.alloc(0., n_samples, dim) 32 | 33 | # create parameters that we will use, 34 | # note that, parameters are theano shared variables 35 | 36 | # parameters for input to hidden states 37 | W_x_ = numpy.random.randn(input_dim, dim).astype('float32') 38 | W_x = theano.shared(W_x_) 39 | 40 | # parameters for hidden state transition 41 | W_h_ = numpy.random.randn(dim, dim).astype('float32') 42 | W_h = theano.shared(W_h_) 43 | 44 | # parameters from hidden state to output 45 | W_o_ = numpy.random.randn(dim, output_dim).astype('float32') 46 | W_o = theano.shared(W_o_) 47 | 48 | # scan function 49 | ([h_vals, o_vals], updates) = theano.scan( 50 | fn=oneStep, 51 | sequences=[x], 52 | outputs_info=[init_state, None], 53 | non_sequences=[W_x, W_h, W_o], 54 | n_steps=n_steps, 55 | strict=True) 56 | 57 | # let us now compile a function to get the output 58 | f = theano.function([x], [h_vals, o_vals]) 59 | 60 | # now we will call the compiled function with actual input 61 | actual_input = numpy.random.randn( 62 | n_steps, n_samples, input_dim).astype('float32') 63 | h_vals_, o_vals_ = f(actual_input) 64 | 65 | # print the shapes 66 | print('shape of input :', actual_input.shape) 67 | print('shape of h_vals:', h_vals_.shape) 68 | print('shape of o_vals:', o_vals_.shape) 69 | -------------------------------------------------------------------------------- /data/setup_cluster_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script sets up development and data environments for 3 | # fionn cluster, copy under your home directory and run. 
4 | 5 | if [ -z $PYTHON ]; then 6 | if [ -n `which python3` ]; then 7 | export PYTHON=python3 8 | else 9 | if [ -n `which python2` ]; then 10 | export PYTHON=python2 11 | else 12 | if [ -n `which python` ]; then 13 | export PYTHON=python 14 | fi 15 | fi 16 | fi 17 | fi 18 | 19 | if [ -z $PYTHON ]; then 20 | echo "Please set PYTHON to a Python interpreter" 21 | exit 1 22 | fi 23 | 24 | echo "Using $PYTHON" 25 | 26 | # this file is for the dependencies 27 | LOCAL_INSTALL_FILE=/ichec/work/dl4mt_data/local_install.tgz 28 | 29 | # code directory for cloned repositories 30 | CODE_DIR=${HOME}/codes/dl4mt-material 31 | 32 | # code repository 33 | CODE_CENTRAL=https://github.com/nyu-dl/dl4mt-tutorial 34 | 35 | # reference files directory 36 | REF_DATA_DIR=/ichec/work/dl4mt_data/nec_files 37 | 38 | # our input files will reside here 39 | DATA_DIR=${HOME}/data 40 | 41 | # our trained models will be saved here 42 | MODELS_DIR=${HOME}/models 43 | 44 | # theano repository 45 | THEANO_GIT=https://github.com/Theano/Theano.git 46 | 47 | # theano install dir 48 | THEANO_DIR=${HOME}/repo/Theano 49 | 50 | # move to home directory 51 | cd 52 | 53 | # copy dependency file to your local and extract 54 | echo "Copying and extracting dependency file" 55 | rsync --bwlimit=20000 -Pavz ${LOCAL_INSTALL_FILE} ${HOME} 56 | tar zxvf ${HOME}/local_install.tgz 57 | 58 | # clone the repository from github into code directory 59 | echo "Cloning lab repository" 60 | if [ ! -d "${CODE_DIR}" ]; then 61 | mkdir -p ${CODE_DIR} 62 | fi 63 | git clone ${CODE_CENTRAL} ${CODE_DIR} 64 | 65 | # copy corpora, dictionaries, etc. for training and dev 66 | echo "Copying data" 67 | if [ ! -d "${DATA_DIR}" ]; then 68 | mkdir -p ${DATA_DIR} 69 | fi 70 | rsync --bwlimit=20000 -Pavz ${REF_DATA_DIR}/all.* ${DATA_DIR} 71 | rsync --bwlimit=20000 -Pavz ${REF_DATA_DIR}/news* ${DATA_DIR} 72 | 73 | # create model output directory if it does not exist 74 | if [ ! -d "${MODELS_DIR}" ]; then 75 | mkdir -p ${MODELS_DIR} 76 | fi 77 | 78 | # clone and install Theano 79 | echo "Cloning/installing Theano" 80 | mkdir -p ${THEANO_DIR} 81 | git clone ${THEANO_GIT} ${THEANO_DIR} 82 | cd ${THEANO_DIR} 83 | $PYTHON setup.py install --user 84 | 85 | # check if theano is working 86 | $PYTHON -c "from __future__ import print_function; import theano; print('theano available!')" 87 | 88 | -------------------------------------------------------------------------------- /data/setup_local_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script sets up development and data environments for 3 | # a local machine, copy under your home directory and run. 4 | # Note that Theano is NOT installed by this script. 5 | # To use Byte Pair Encoding, simply pass -b argument. 
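# Usage: ./setup_local_env.sh [-b]
#   -b  preprocess with Byte Pair Encoding: clones rsennrich/subword-nmt and
#       runs preprocess.sh instead of the plain tokenization pipeline below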
6 | 7 | BPE=false 8 | 9 | while getopts ':b' flag; do 10 | case "${flag}" in 11 | b) BPE=true 12 | echo "Using Byte Pair Encoding" ;; 13 | *) 14 | echo "" 15 | echo "Usage: $0 [-b]" 16 | echo "" 17 | exit 1 ;; 18 | esac 19 | done 20 | 21 | if [ -z $PYTHON ]; then 22 | if [ -n `which python3` ]; then 23 | export PYTHON=python3 24 | else 25 | if [ -n `which python2` ]; then 26 | export PYTHON=python2 27 | else 28 | if [ -n `which python` ]; then 29 | export PYTHON=python 30 | fi 31 | fi 32 | fi 33 | fi 34 | 35 | if [ -z $PYTHON ]; then 36 | echo "Please set PYTHON to a Python interpreter" 37 | exit 1 38 | fi 39 | 40 | echo "Using $PYTHON" 41 | 42 | # code directory for cloned repositories 43 | SCRIPT_DIR=$( dirname "${BASH_SOURCE[0]}" ) 44 | CODE_DIR=${SCRIPT_DIR}/.. 45 | 46 | # code repository 47 | CODE_CENTRAL=https://github.com/nyu-dl/dl4mt-tutorial 48 | 49 | # our input files will reside here 50 | DATA_DIR=${CODE_DIR}/data 51 | 52 | # our trained models will be saved here 53 | MODELS_DIR=${HOME}/models 54 | 55 | 56 | # clone the repository from github into code directory 57 | if [ ! -d "${CODE_DIR}" ]; then 58 | echo "Cloning central ..." 59 | mkdir -p ${CODE_DIR} 60 | git clone ${CODE_CENTRAL} ${CODE_DIR} 61 | fi 62 | 63 | # download the europarl v7 and validation sets and extract 64 | if [ ! -f ${DATA_DIR}/train_data.tgz ]; then 65 | curl -o ${DATA_DIR}/train_data.tgz http://www.statmt.org/europarl/v7/fr-en.tgz 66 | else 67 | echo "${DATA_DIR}/train_data.tgz exists" 68 | fi 69 | if [ ! -f ${DATA_DIR}/valid_data.tgz ]; then 70 | curl -o ${DATA_DIR}/valid_data.tgz http://matrix.statmt.org/test_sets/newstest2011.tgz 71 | else 72 | echo "${DATA_DIR}/valid_data.tgz exists" 73 | fi 74 | $PYTHON ${CODE_DIR}/data/extract_files.py \ 75 | -s='fr' -t='en' \ 76 | --source-dev=newstest2011.fr \ 77 | --target-dev=newstest2011.en \ 78 | --outdir=${DATA_DIR} 79 | 80 | if [ "$BPE" = true ] ; then 81 | 82 | BPE_DIR=${HOME}/codes/subword-nmt 83 | BPE_CENTRAL=https://github.com/rsennrich/subword-nmt 84 | 85 | # clone subword-nmt repository 86 | if [ ! -d "${BPE_DIR}" ]; then 87 | echo "Cloning BPE central ..." 88 | mkdir -p ${BPE_DIR} 89 | git clone ${BPE_CENTRAL} ${BPE_DIR} 90 | fi 91 | 92 | # follow the preprocessing pipeline for BPE 93 | ./preprocess.sh 'fr' 'en' ${DATA_DIR} ${BPE_DIR} 94 | 95 | else 96 | 97 | # tokenize corresponding files 98 | perl ${CODE_DIR}/data/tokenizer.perl -l 'fr' < ${DATA_DIR}/test2011/newstest2011.fr > ${DATA_DIR}/newstest2011.fr.tok 99 | perl ${CODE_DIR}/data/tokenizer.perl -l 'en' < ${DATA_DIR}/test2011/newstest2011.en > ${DATA_DIR}/newstest2011.en.tok 100 | perl ${CODE_DIR}/data/tokenizer.perl -l 'fr' < ${DATA_DIR}/europarl-v7.fr-en.fr > ${DATA_DIR}/europarl-v7.fr-en.fr.tok 101 | perl ${CODE_DIR}/data/tokenizer.perl -l 'en' < ${DATA_DIR}/europarl-v7.fr-en.en > ${DATA_DIR}/europarl-v7.fr-en.en.tok 102 | 103 | # extract dictionaries 104 | $PYTHON ${CODE_DIR}/data/build_dictionary.py ${DATA_DIR}/europarl-v7.fr-en.fr.tok 105 | $PYTHON ${CODE_DIR}/data/build_dictionary.py ${DATA_DIR}/europarl-v7.fr-en.en.tok 106 | 107 | # shuffle training data 108 | $PYTHON ${CODE_DIR}/data/shuffle.py ${DATA_DIR}/europarl-v7.fr-en.en.tok ${DATA_DIR}/europarl-v7.fr-en.fr.tok 109 | fi 110 | 111 | # create model output directory if it does not exist 112 | if [ ! 
-d "${MODELS_DIR}" ]; then 113 | mkdir -p ${MODELS_DIR} 114 | fi 115 | 116 | # check if theano is working 117 | $PYTHON -c "from __future__ import print_function; import theano; print('theano available!')" 118 | -------------------------------------------------------------------------------- /data/shuffle.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | import sys 5 | import random 6 | 7 | from tempfile import mkstemp 8 | from subprocess import call 9 | 10 | 11 | 12 | def main(files): 13 | 14 | tf_os, tpath = mkstemp() 15 | tf = open(tpath, 'w') 16 | 17 | fds = [open(ff) for ff in files] 18 | 19 | for l in fds[0]: 20 | lines = [l.strip()] + [ff.readline().strip() for ff in fds[1:]] 21 | print("|||".join(lines), file=tf) 22 | 23 | [ff.close() for ff in fds] 24 | tf.close() 25 | 26 | tf = open(tpath, 'r') 27 | lines = tf.readlines() 28 | random.shuffle(lines) 29 | 30 | fds = [open(ff+'.shuf','w') for ff in files] 31 | 32 | for l in lines: 33 | s = l.strip().split('|||') 34 | for ii, fd in enumerate(fds): 35 | print(s[ii], file=fd) 36 | 37 | [ff.close() for ff in fds] 38 | 39 | os.remove(tpath) 40 | 41 | if __name__ == '__main__': 42 | main(sys.argv[1:]) 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /data/strip_sgml.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import sys 4 | import re 5 | 6 | 7 | def main(): 8 | fin = sys.stdin 9 | fout = sys.stdout 10 | for l in fin: 11 | line = l.strip() 12 | text = re.sub('<[^<]+>', "", line).strip() 13 | if len(text) == 0: 14 | continue 15 | print(text, file=fout) 16 | 17 | 18 | if __name__ == "__main__": 19 | main() 20 | 21 | -------------------------------------------------------------------------------- /data/tokenize_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for F in `ls ./training/* | grep -v pkl | grep -v tok` 4 | do 5 | echo "perl ./tokenizer.perl -l ${F:(-2)} < $F > $F.tok" 6 | perl ./tokenizer.perl -l ${F:(-2)} < $F > $F.tok 7 | done 8 | 9 | for F in `ls ./dev/*.?? | grep -v tok` 10 | do 11 | echo "perl ./tokenizer.perl -l ${F:(-2)} < $F > $F.tok" 12 | perl ./tokenizer.perl -l ${F:(-2)} < $F > $F.tok 13 | done 14 | -------------------------------------------------------------------------------- /data/tokenizer.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 
5 | 6 | use warnings; 7 | 8 | # Sample Tokenizer 9 | ### Version 1.1 10 | # written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn 11 | # Version 1.1 updates: 12 | # (1) add multithreading option "-threads NUM_THREADS" (default is 1); 13 | # (2) add a timing option "-time" to calculate the average speed of this tokenizer; 14 | # (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed); 15 | ### Version 1.0 16 | # $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $ 17 | # written by Josh Schroeder, based on code by Philipp Koehn 18 | 19 | binmode(STDIN, ":utf8"); 20 | binmode(STDOUT, ":utf8"); 21 | 22 | use warnings; 23 | use FindBin qw($RealBin); 24 | use strict; 25 | use Time::HiRes; 26 | 27 | if (eval {require Thread;1;}) { 28 | #module loaded 29 | Thread->import(); 30 | } 31 | 32 | my $mydir = "$RealBin/nonbreaking_prefixes"; 33 | 34 | my %NONBREAKING_PREFIX = (); 35 | my @protected_patterns = (); 36 | my $protected_patterns_file = ""; 37 | my $language = "en"; 38 | my $QUIET = 0; 39 | my $HELP = 0; 40 | my $AGGRESSIVE = 0; 41 | my $SKIP_XML = 0; 42 | my $TIMING = 0; 43 | my $NUM_THREADS = 1; 44 | my $NUM_SENTENCES_PER_THREAD = 2000; 45 | my $PENN = 0; 46 | my $NO_ESCAPING = 0; 47 | while (@ARGV) 48 | { 49 | $_ = shift; 50 | /^-b$/ && ($| = 1, next); 51 | /^-l$/ && ($language = shift, next); 52 | /^-q$/ && ($QUIET = 1, next); 53 | /^-h$/ && ($HELP = 1, next); 54 | /^-x$/ && ($SKIP_XML = 1, next); 55 | /^-a$/ && ($AGGRESSIVE = 1, next); 56 | /^-time$/ && ($TIMING = 1, next); 57 | # Option to add list of regexps to be protected 58 | /^-protected/ && ($protected_patterns_file = shift, next); 59 | /^-threads$/ && ($NUM_THREADS = int(shift), next); 60 | /^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next); 61 | /^-penn$/ && ($PENN = 1, next); 62 | /^-no-escape/ && ($NO_ESCAPING = 1, next); 63 | } 64 | 65 | # for time calculation 66 | my $start_time; 67 | if ($TIMING) 68 | { 69 | $start_time = [ Time::HiRes::gettimeofday( ) ]; 70 | } 71 | 72 | # print help message 73 | if ($HELP) 74 | { 75 | print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n"; 76 | print "Options:\n"; 77 | print " -q ... quiet.\n"; 78 | print " -a ... aggressive hyphen splitting.\n"; 79 | print " -b ... disable Perl buffering.\n"; 80 | print " -time ... enable processing time calculation.\n"; 81 | print " -penn ... use Penn treebank-like tokenization.\n"; 82 | print " -protected FILE ... specify file with patters to be protected in tokenisation.\n"; 83 | print " -no-escape ... 
don't perform HTML escaping on apostrophy, quotes, etc.\n"; 84 | exit; 85 | } 86 | 87 | if (!$QUIET) 88 | { 89 | print STDERR "Tokenizer Version 1.1\n"; 90 | print STDERR "Language: $language\n"; 91 | print STDERR "Number of threads: $NUM_THREADS\n"; 92 | } 93 | 94 | # load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes 95 | load_prefixes($language,\%NONBREAKING_PREFIX); 96 | 97 | if (scalar(%NONBREAKING_PREFIX) eq 0) 98 | { 99 | print STDERR "Warning: No known abbreviations for language '$language'\n"; 100 | } 101 | 102 | # Load protected patterns 103 | if ($protected_patterns_file) 104 | { 105 | open(PP,$protected_patterns_file) || die "Unable to open $protected_patterns_file"; 106 | while() { 107 | chomp; 108 | push @protected_patterns, $_; 109 | } 110 | } 111 | 112 | my @batch_sentences = (); 113 | my @thread_list = (); 114 | my $count_sentences = 0; 115 | 116 | if ($NUM_THREADS > 1) 117 | {# multi-threading tokenization 118 | while() 119 | { 120 | $count_sentences = $count_sentences + 1; 121 | push(@batch_sentences, $_); 122 | if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS)) 123 | { 124 | # assign each thread work 125 | for (my $i=0; $i<$NUM_THREADS; $i++) 126 | { 127 | my $start_index = $i*$NUM_SENTENCES_PER_THREAD; 128 | my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1; 129 | my @subbatch_sentences = @batch_sentences[$start_index..$end_index]; 130 | my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences; 131 | push(@thread_list, $new_thread); 132 | } 133 | foreach (@thread_list) 134 | { 135 | my $tokenized_list = $_->join; 136 | foreach (@$tokenized_list) 137 | { 138 | print $_; 139 | } 140 | } 141 | # reset for the new run 142 | @thread_list = (); 143 | @batch_sentences = (); 144 | } 145 | } 146 | # the last batch 147 | if (scalar(@batch_sentences)>0) 148 | { 149 | # assign each thread work 150 | for (my $i=0; $i<$NUM_THREADS; $i++) 151 | { 152 | my $start_index = $i*$NUM_SENTENCES_PER_THREAD; 153 | if ($start_index >= scalar(@batch_sentences)) 154 | { 155 | last; 156 | } 157 | my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1; 158 | if ($end_index >= scalar(@batch_sentences)) 159 | { 160 | $end_index = scalar(@batch_sentences)-1; 161 | } 162 | my @subbatch_sentences = @batch_sentences[$start_index..$end_index]; 163 | my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences; 164 | push(@thread_list, $new_thread); 165 | } 166 | foreach (@thread_list) 167 | { 168 | my $tokenized_list = $_->join; 169 | foreach (@$tokenized_list) 170 | { 171 | print $_; 172 | } 173 | } 174 | } 175 | } 176 | else 177 | {# single thread only 178 | while() 179 | { 180 | if (($SKIP_XML && /^<.+>$/) || /^\s*$/) 181 | { 182 | #don't try to tokenize XML/HTML tag lines 183 | print $_; 184 | } 185 | else 186 | { 187 | print &tokenize($_); 188 | } 189 | } 190 | } 191 | 192 | if ($TIMING) 193 | { 194 | my $duration = Time::HiRes::tv_interval( $start_time ); 195 | print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n"); 196 | print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." 
milliseconds/line\n"); 197 | } 198 | 199 | ##################################################################################### 200 | # subroutines afterward 201 | 202 | # tokenize a batch of texts saved in an array 203 | # input: an array containing a batch of texts 204 | # return: another array containing a batch of tokenized texts for the input array 205 | sub tokenize_batch 206 | { 207 | my(@text_list) = @_; 208 | my(@tokenized_list) = (); 209 | foreach (@text_list) 210 | { 211 | if (($SKIP_XML && /^<.+>$/) || /^\s*$/) 212 | { 213 | #don't try to tokenize XML/HTML tag lines 214 | push(@tokenized_list, $_); 215 | } 216 | else 217 | { 218 | push(@tokenized_list, &tokenize($_)); 219 | } 220 | } 221 | return \@tokenized_list; 222 | } 223 | 224 | # the actual tokenize function which tokenizes one input string 225 | # input: one string 226 | # return: the tokenized string for the input string 227 | sub tokenize 228 | { 229 | my($text) = @_; 230 | 231 | if ($PENN) { 232 | return tokenize_penn($text); 233 | } 234 | 235 | chomp($text); 236 | $text = " $text "; 237 | 238 | # remove ASCII junk 239 | $text =~ s/\s+/ /g; 240 | $text =~ s/[\000-\037]//g; 241 | 242 | # Find protected patterns 243 | my @protected = (); 244 | foreach my $protected_pattern (@protected_patterns) { 245 | my $t = $text; 246 | while ($t =~ /($protected_pattern)(.*)$/) { 247 | push @protected, $1; 248 | $t = $2; 249 | } 250 | } 251 | 252 | for (my $i = 0; $i < scalar(@protected); ++$i) { 253 | my $subst = sprintf("THISISPROTECTED%.3d", $i); 254 | $text =~ s,\Q$protected[$i], $subst ,g; 255 | } 256 | $text =~ s/ +/ /g; 257 | $text =~ s/^ //g; 258 | $text =~ s/ $//g; 259 | 260 | # seperate out all "other" special characters 261 | $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g; 262 | 263 | # aggressive hyphen splitting 264 | if ($AGGRESSIVE) 265 | { 266 | $text =~ s/([\p{IsAlnum}])\-(?=[\p{IsAlnum}])/$1 \@-\@ /g; 267 | } 268 | 269 | #multi-dots stay together 270 | $text =~ s/\.([\.]+)/ DOTMULTI$1/g; 271 | while($text =~ /DOTMULTI\./) 272 | { 273 | $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g; 274 | $text =~ s/DOTMULTI\./DOTDOTMULTI/g; 275 | } 276 | 277 | # seperate out "," except if within numbers (5,300) 278 | #$text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; 279 | 280 | # separate out "," except if within numbers (5,300) 281 | # previous "global" application skips some: A,B,C,D,E > A , B,C , D,E 282 | # first application uses up B so rule can't see B,C 283 | # two-step version here may create extra spaces but these are removed later 284 | # will also space digit,letter or letter,digit forms (redundant with next section) 285 | $text =~ s/([^\p{IsN}])[,]/$1 , /g; 286 | $text =~ s/[,]([^\p{IsN}])/ , $1/g; 287 | 288 | # separate , pre and post number 289 | #$text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; 290 | #$text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g; 291 | 292 | # turn `into ' 293 | #$text =~ s/\`/\'/g; 294 | 295 | #turn '' into " 296 | #$text =~ s/\'\'/ \" /g; 297 | 298 | if ($language eq "en") 299 | { 300 | #split contractions right 301 | $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 302 | $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g; 303 | $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 304 | $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g; 305 | #special case for "1990's" 306 | $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g; 307 | } 308 | elsif (($language eq "fr") or ($language eq "it")) 309 | { 310 | #split contractions left 311 | $text =~ 
s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 312 | $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g; 313 | $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 314 | $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g; 315 | } 316 | else 317 | { 318 | $text =~ s/\'/ \' /g; 319 | } 320 | 321 | #word token method 322 | my @words = split(/\s/,$text); 323 | $text = ""; 324 | for (my $i=0;$i<(scalar(@words));$i++) 325 | { 326 | my $word = $words[$i]; 327 | if ( $word =~ /^(\S+)\.$/) 328 | { 329 | my $pre = $1; 330 | if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i/\>/g; # xml 371 | $text =~ s/\'/\'/g; # xml 372 | $text =~ s/\"/\"/g; # xml 373 | $text =~ s/\[/\[/g; # syntax non-terminal 374 | $text =~ s/\]/\]/g; # syntax non-terminal 375 | } 376 | 377 | #ensure final line break 378 | $text .= "\n" unless $text =~ /\n$/; 379 | 380 | return $text; 381 | } 382 | 383 | sub tokenize_penn 384 | { 385 | # Improved compatibility with Penn Treebank tokenization. Useful if 386 | # the text is to later be parsed with a PTB-trained parser. 387 | # 388 | # Adapted from Robert MacIntyre's sed script: 389 | # http://www.cis.upenn.edu/~treebank/tokenizer.sed 390 | 391 | my($text) = @_; 392 | chomp($text); 393 | 394 | # remove ASCII junk 395 | $text =~ s/\s+/ /g; 396 | $text =~ s/[\000-\037]//g; 397 | 398 | # attempt to get correct directional quotes 399 | $text =~ s/^``/`` /g; 400 | $text =~ s/^"/`` /g; 401 | $text =~ s/^`([^`])/` $1/g; 402 | $text =~ s/^'/` /g; 403 | $text =~ s/([ ([{<])"/$1 `` /g; 404 | $text =~ s/([ ([{<])``/$1 `` /g; 405 | $text =~ s/([ ([{<])`([^`])/$1 ` $2/g; 406 | $text =~ s/([ ([{<])'/$1 ` /g; 407 | # close quotes handled at end 408 | 409 | $text =~ s=\.\.\.= _ELLIPSIS_ =g; 410 | 411 | # separate out "," except if within numbers (5,300) 412 | $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; 413 | # separate , pre and post number 414 | $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; 415 | $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g; 416 | 417 | #$text =~ s=([;:@#\$%&\p{IsSc}])= $1 =g; 418 | $text =~ s=([;:@#\$%&\p{IsSc}\p{IsSo}])= $1 =g; 419 | 420 | # Separate out intra-token slashes. PTB tokenization doesn't do this, so 421 | # the tokens should be merged prior to parsing with a PTB-trained parser 422 | # (see syntax-hyphen-splitting.perl). 423 | $text =~ s/([\p{IsAlnum}])\/([\p{IsAlnum}])/$1 \@\/\@ $2/g; 424 | 425 | # Assume sentence tokenization has been done first, so split FINAL periods 426 | # only. 427 | $text =~ s=([^.])([.])([\]\)}>"']*) ?$=$1 $2$3 =g; 428 | # however, we may as well split ALL question marks and exclamation points, 429 | # since they shouldn't have the abbrev.-marker ambiguity problem 430 | $text =~ s=([?!])= $1 =g; 431 | 432 | # parentheses, brackets, etc. 433 | $text =~ s=([\]\[\(\){}<>])= $1 =g; 434 | $text =~ s/\(/-LRB-/g; 435 | $text =~ s/\)/-RRB-/g; 436 | $text =~ s/\[/-LSB-/g; 437 | $text =~ s/\]/-RSB-/g; 438 | $text =~ s/{/-LCB-/g; 439 | $text =~ s/}/-RCB-/g; 440 | 441 | $text =~ s=--= -- =g; 442 | 443 | # First off, add a space to the beginning and end of each line, to reduce 444 | # necessary number of regexps. 
445 | $text =~ s=$= =; 446 | $text =~ s=^= =; 447 | 448 | $text =~ s="= '' =g; 449 | # possessive or close-single-quote 450 | $text =~ s=([^'])' =$1 ' =g; 451 | # as in it's, I'm, we'd 452 | $text =~ s='([sSmMdD]) = '$1 =g; 453 | $text =~ s='ll = 'll =g; 454 | $text =~ s='re = 're =g; 455 | $text =~ s='ve = 've =g; 456 | $text =~ s=n't = n't =g; 457 | $text =~ s='LL = 'LL =g; 458 | $text =~ s='RE = 'RE =g; 459 | $text =~ s='VE = 'VE =g; 460 | $text =~ s=N'T = N'T =g; 461 | 462 | $text =~ s= ([Cc])annot = $1an not =g; 463 | $text =~ s= ([Dd])'ye = $1' ye =g; 464 | $text =~ s= ([Gg])imme = $1im me =g; 465 | $text =~ s= ([Gg])onna = $1on na =g; 466 | $text =~ s= ([Gg])otta = $1ot ta =g; 467 | $text =~ s= ([Ll])emme = $1em me =g; 468 | $text =~ s= ([Mm])ore'n = $1ore 'n =g; 469 | $text =~ s= '([Tt])is = '$1 is =g; 470 | $text =~ s= '([Tt])was = '$1 was =g; 471 | $text =~ s= ([Ww])anna = $1an na =g; 472 | 473 | #word token method 474 | my @words = split(/\s/,$text); 475 | $text = ""; 476 | for (my $i=0;$i<(scalar(@words));$i++) 477 | { 478 | my $word = $words[$i]; 479 | if ( $word =~ /^(\S+)\.$/) 480 | { 481 | my $pre = $1; 482 | if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i/\>/g; # xml 511 | $text =~ s/\'/\'/g; # xml 512 | $text =~ s/\"/\"/g; # xml 513 | $text =~ s/\[/\[/g; # syntax non-terminal 514 | $text =~ s/\]/\]/g; # syntax non-terminal 515 | 516 | #ensure final line break 517 | $text .= "\n" unless $text =~ /\n$/; 518 | 519 | return $text; 520 | } 521 | 522 | sub load_prefixes 523 | { 524 | my ($language, $PREFIX_REF) = @_; 525 | 526 | my $prefixfile = "$mydir/nonbreaking_prefix.$language"; 527 | 528 | #default back to English if we don't have a language-specific prefix file 529 | if (!(-e $prefixfile)) 530 | { 531 | $prefixfile = "$mydir/nonbreaking_prefix.en"; 532 | print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n"; 533 | die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile); 534 | } 535 | 536 | if (-e "$prefixfile") 537 | { 538 | open(PREFIX, "<:utf8", "$prefixfile"); 539 | while () 540 | { 541 | my $item = $_; 542 | chomp($item); 543 | if (($item) && (substr($item,0,1) ne "#")) 544 | { 545 | if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) 546 | { 547 | $PREFIX_REF->{$1} = 2; 548 | } 549 | else 550 | { 551 | $PREFIX_REF->{$item} = 1; 552 | } 553 | } 554 | } 555 | close(PREFIX); 556 | } 557 | } 558 | -------------------------------------------------------------------------------- /docs/cgru.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nyu-dl/dl4mt-tutorial/c476acbe325b8f2709d64649a052573219410fae/docs/cgru.pdf -------------------------------------------------------------------------------- /docs/cgru.tex: -------------------------------------------------------------------------------- 1 | \documentclass[11pt, oneside]{article} 2 | \usepackage{geometry} 3 | %\geometry{letterpaper} 4 | \usepackage[parfill]{parskip} 5 | \usepackage{graphicx} 6 | \usepackage{amssymb} 7 | \usepackage{amsthm} 8 | \usepackage{mathtools} 9 | \usepackage{enumerate} 10 | 11 | \theoremstyle{definition} 12 | \newtheorem*{ans*}{Answer} 13 | 14 | \newcommand{\obs}{\text{obs}} 15 | \newcommand{\mis}{\text{mis}} 16 | 17 | \newcommand{\qt}[1]{\left<#1\right>} 18 | \newcommand{\ql}[1]{\left[#1\right]} 19 | \newcommand{\hess}{\mathbf{H}} 20 | \newcommand{\jacob}{\mathbf{J}} 21 | 
%\newcommand{\hl}{HL} 22 | \newcommand{\cost}{\mathcal{L}} 23 | \newcommand{\lout}{\mathbf{r}} 24 | \newcommand{\louti}{r} 25 | \newcommand{\outi}{y} 26 | \newcommand{\out}{\mathbf{y}} 27 | \newcommand{\gauss}{\mathbf{G_N}} 28 | \newcommand{\eye}{\mathbf{I}} 29 | \newcommand{\softmax}{\phi} 30 | \newcommand{\targ}{\mathbf{t}} 31 | \newcommand{\metric}{\mathbf{G}} 32 | \newcommand{\sample}{\mathbf{z}} 33 | \newcommand{\f}{\text{f}} 34 | %\newcommand{\log}{\text{log}} 35 | 36 | \newcommand{\bmx}[0]{\begin{bmatrix}} 37 | \newcommand{\emx}[0]{\end{bmatrix}} 38 | \newcommand{\qexp}[1]{\left<#1\right>} 39 | \newcommand{\vect}[1]{\mathbf{#1}} 40 | \newcommand{\vects}[1]{\boldsymbol{#1}} 41 | \newcommand{\matr}[1]{\mathbf{#1}} 42 | \newcommand{\var}[0]{\operatorname{Var}} 43 | \newcommand{\std}[0]{\operatorname{std}} 44 | \newcommand{\cov}[0]{\operatorname{Cov}} 45 | \newcommand{\diag}[0]{\operatorname{diag}} 46 | \newcommand{\matrs}[1]{\boldsymbol{#1}} 47 | \newcommand{\va}[0]{\vect{a}} 48 | \newcommand{\vb}[0]{\vect{b}} 49 | \newcommand{\vc}[0]{\vect{c}} 50 | \newcommand{\ve}[0]{\vect{e}} 51 | 52 | \newcommand{\vh}[0]{\vect{h}} 53 | \newcommand{\vv}[0]{\vect{v}} 54 | \newcommand{\vx}[0]{\vect{x}} 55 | \newcommand{\vz}[0]{\vect{z}} 56 | \newcommand{\vw}[0]{\vect{w}} 57 | \newcommand{\vs}[0]{\vect{s}} 58 | \newcommand{\vf}[0]{\vect{f}} 59 | \newcommand{\vi}[0]{\vect{i}} 60 | \newcommand{\vo}[0]{\vect{o}} 61 | \newcommand{\vy}[0]{\vect{y}} 62 | \newcommand{\vg}[0]{\vect{g}} 63 | \newcommand{\vm}[0]{\vect{m}} 64 | \newcommand{\vu}[0]{\vect{u}} 65 | \newcommand{\vL}[0]{\vect{L}} 66 | \newcommand{\vr}[0]{\vect{r}} 67 | \newcommand{\vp}[0]{\vect{p}} 68 | \newcommand{\mW}[0]{\matr{W}} 69 | \newcommand{\mP}[0]{\matr{P}} 70 | 71 | \newcommand{\mE}[0]{\matr{E}} 72 | \newcommand{\mG}[0]{\matr{G}} 73 | \newcommand{\mX}[0]{\matr{X}} 74 | \newcommand{\mQ}[0]{\matr{Q}} 75 | \newcommand{\mU}[0]{\matr{U}} 76 | \newcommand{\mF}[0]{\matr{F}} 77 | \newcommand{\mV}[0]{\matr{V}} 78 | \newcommand{\mA}{\matr{A}} 79 | \newcommand{\mC}{\matr{C}} 80 | \newcommand{\mD}{\matr{D}} 81 | \newcommand{\mS}{\matr{S}} 82 | \newcommand{\mI}{\matr{I}} 83 | \newcommand{\td}[0]{\text{d}} 84 | \newcommand{\TT}[0]{\vects{\theta}} 85 | \newcommand{\vsig}[0]{\vects{\sigma}} 86 | \newcommand{\valpha}[0]{\vects{\alpha}} 87 | \newcommand{\vmu}[0]{\vects{\mu}} 88 | \newcommand{\vzero}[0]{\vect{0}} 89 | \newcommand{\tf}[0]{\text{m}} 90 | \newcommand{\tdf}[0]{\text{dm}} 91 | \newcommand{\grad}[0]{\nabla} 92 | \newcommand{\alert}[1]{\textcolor{red}{#1}} 93 | \newcommand{\N}[0]{\mathcal{N}} 94 | \newcommand{\LL}[0]{\mathcal{L}} 95 | \newcommand{\HH}[0]{\mathcal{H}} 96 | \newcommand{\RR}[0]{\mathbb{R}} 97 | \newcommand{\II}[0]{\mathbb{I}} 98 | \newcommand{\Scal}[0]{\mathcal{S}} 99 | \newcommand{\sigmoid}{\sigma} 100 | \newcommand{\E}[0]{\mathbb{E}} 101 | \newcommand{\enabla}[0]{\ensuremath{% 102 | \overset{\raisebox{-0.3ex}[0.5ex][0ex]{% 103 | \ensuremath{\scriptscriptstyle e}}}{\nabla}}} 104 | \newcommand{\enhnabla}[0]{\nabla_{\hspace{-0.5mm}e}\,} 105 | 106 | 107 | \newcommand{\todo}[1]{{\Large\textcolor{red}{#1}}} 108 | \newcommand{\done}[1]{{\Large\textcolor{green}{#1}}} 109 | \newcommand{\dd}[1]{\ensuremath{\mbox{d}#1}} 110 | 111 | \DeclareMathOperator*{\argmax}{\arg \max} 112 | \DeclareMathOperator*{\argmin}{\arg \min} 113 | \newcommand{\newln}{\\&\quad\quad{}} 114 | 115 | \newcommand{\Ax}{\mathcal{A}_x} 116 | \newcommand{\Ay}{\mathcal{A}_y} 117 | \newcommand{\ola}{\overleftarrow} 118 | \newcommand{\ora}{\overrightarrow} 119 | 
\newcommand{\ov}{\overline}
120 | \newcommand{\ts}{\rule{0pt}{2.6ex}} % Top strut
121 | \newcommand{\ms}{\rule{0pt}{0ex}} % Middle strut
122 | \newcommand{\bs}{\rule[-1.2ex]{0pt}{0pt}} % Bottom strut
123 | \newcommand{\specialcell}[2][c]{%
124 | \begin{tabular}[#1]{@{}c@{}}#2\end{tabular}}
125 | 
126 | \newcommand\codeHighlight[1]{\textcolor[rgb]{1,0,0}{#1}}
127 | 
128 | \title{DL4MT-Tutorial: \\Conditional Gated Recurrent Unit with Attention Mechanism}
129 | \author{Orhan Firat \and Kyunghyun Cho}
130 | \date{May 15, 2016}
131 | 
132 | \begin{document}
133 | 
134 | \maketitle
135 | 
136 | This document describes the $gru\_cond\_layer$ used in Session 2 and Session 3.
137 | 
138 | Given a source sequence $(x_1, \dots,x_{T_x})$ of length $T_x$ and a target
139 | sequence $(y_1,\dots,y_{T_y})$, let $\vh_i$ be the annotation of the source symbol
140 | at position $i$, obtained by concatenating the forward and backward encoder RNN
141 | hidden states, $\vh_i = [ \ora{\vh}_i; \ola{\vh}_i ]$. A conditional GRU with attention
142 | mechanism, cGRU$_{\text{att}}$, uses its previous hidden state $\vs_{j-1}$, the
143 | whole set of source annotations $\text{C}=\lbrace\vh_1, \dots, \vh_{T_x}\rbrace$, and
144 | the previously decoded symbol $y_{j-1}$ in order to update its hidden state $\vs_j$,
145 | which is further used to decode the symbol $y_j$ at position $j$,
146 | 
147 | \begin{equation}
148 | \vs_j = \text{cGRU}_{\text{att}}\left( \vs_{j-1}, y_{j-1}, \text{C} \right).
149 | \end{equation}
150 | 
151 | \paragraph{Internals} The conditional GRU layer with attention mechanism,
152 | cGRU$_{\text{att}}$, consists of three components: two
153 | recurrent cells and an attention mechanism ATT in between.
154 | The first recurrent cell, $\text{REC}_1$, combines the previously decoded symbol $y_{j-1}$
155 | and the previous hidden state $\vs_{j-1}$ in order to generate an intermediate
156 | representation $\vs^{\prime}_j$ with the following formulation:
157 | 
158 | \vspace{-10px}
159 | \begin{align}
160 | \vs_j^{\prime} = \text{REC}_1 & \left( y_{j-1}, \vs_{j-1} \right) = (1 - \vz_j^{\prime}) \odot \underline{\vs}_j^{\prime} + \vz_j^{\prime} \odot \vs_{j-1}, \\
161 | \underline{\vs}_j^{\prime} =& ~\text{tanh} \left( \mW^{\prime} \mE[y_{j-1}] + \vr_j^{\prime} \odot (\mU^{\prime}\vs_{j-1}) \right), \\
162 | \vr_j^{\prime} =& ~ \sigma \left( \mW_r^{\prime} \mE[y_{j-1}] + \mU_r^{\prime} \vs_{j-1} \right), \\
163 | \vz_j^{\prime} =& ~ \sigma \left( \mW_z^{\prime} \mE[y_{j-1}] + \mU_z^{\prime} \vs_{j-1} \right),
164 | \end{align}
165 | 
166 | \noindent where $\mE$ is the target word embedding matrix,
167 | $\underline{\vs}_j^{\prime}$ is the proposal intermediate representation, and $\vr_j^{\prime}$
168 | and $\vz_j^{\prime}$ are the reset and update gate activations. In this formulation,
169 | $\mW^{\prime}$, $\mU^{\prime}$, $\mW_r^{\prime}$, $\mU_r^{\prime}$,
170 | $\mW_z^{\prime}$, $\mU_z^{\prime}$ are trained model parameters,\footnote{All
171 | the biases are omitted for simplicity.} and tanh and $\sigma$ are the hyperbolic tangent
172 | and logistic sigmoid activation functions, respectively.
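% --- Illustrative aside (editorial, not part of the original source) ------
% The REC_1 equations above translate almost line-for-line into numpy. A
% minimal sketch under assumed toy sizes: parameter names mirror the symbols
% above, weights are random placeholders, and biases are omitted as in the
% footnote. The actual Theano implementation is the gru_cond_layer in
% session2/nmt.py and session3/nmt.py.
%
%   import numpy as np
%
%   def sigmoid(a):
%       return 1.0 / (1.0 + np.exp(-a))
%
%   n_words, dim_word, dim = 10, 4, 5                # toy sizes (assumed)
%   rng = np.random.RandomState(1234)
%   E = rng.randn(n_words, dim_word)                 # embedding matrix E
%   Wp, Wr, Wz = [rng.randn(dim_word, dim) for _ in range(3)]  # W', W'_r, W'_z
%   Up, Ur, Uz = [rng.randn(dim, dim) for _ in range(3)]       # U', U'_r, U'_z
%
%   def rec1(y_prev, s_prev):
%       """One REC_1 step: intermediate state s'_j from y_{j-1} and s_{j-1}."""
%       emb = E[y_prev]                                        # E[y_{j-1}]
%       r = sigmoid(np.dot(emb, Wr) + np.dot(s_prev, Ur))      # reset gate r'_j
%       z = sigmoid(np.dot(emb, Wz) + np.dot(s_prev, Uz))      # update gate z'_j
%       s_bar = np.tanh(np.dot(emb, Wp) + r * np.dot(s_prev, Up))  # proposal
%       return (1.0 - z) * s_bar + z * s_prev                  # s'_j
%
%   s_prime = rec1(3, np.zeros(dim))
%
% The ATT and REC_2 components described next compose with this in the same
% way: s'_j feeds the attention that yields c_j, and REC_2 combines s'_j with
% c_j to produce the final state s_j.
% ---------------------------------------------------------------------------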
173 | 
174 | The attention mechanism, ATT, takes as input the entire context set C along with the
175 | intermediate hidden state $\vs_j^{\prime}$ in order to compute the context vector
176 | $\vc_j$ as follows:
177 | 
178 | \begin{align}
179 | \vc_j =& \text{ATT} \left( \text{C}, \vs_j^{\prime} \right) = \sum_{i=1}^{T_x} \alpha_{ij} \vh_i , \\
180 | \alpha_{ij} =& \frac{\text{exp}(e_{ij})}{\sum_{k=1}^{T_x} \text{exp}(e_{kj}) } ,\\
181 | e_{ij} =& \vv_a^{\intercal} \text{tanh} \left( \mU_a \vs_j^{\prime} + \mW_a \vh_i \right) ,
182 | \end{align}
183 | 
184 | \noindent where $\alpha_{ij}$ is the normalized alignment weight between the source
185 | symbol at position $i$ and the target symbol at position $j$, and $\vv_a, \mU_a, \mW_a$
186 | are the trained model parameters.
187 | 
188 | Finally, the second recurrent cell, $\text{REC}_2$, generates $\vs_j$, the hidden state of
189 | the $\text{cGRU}_{\text{att}}$, by looking at the intermediate representation
190 | $\vs_j^{\prime}$ and the context vector $\vc_j$ with the following formulation:
191 | 
192 | \begin{align}
193 | \vs_j = \text{REC}_2 & \left( \vs_j^{\prime}, \vc_j \right) = (1 - \vz_j) \odot \underline{\vs}_j + \vz_j \odot \vs_j^{\prime}, \\
194 | \underline{\vs}_j =& \text{tanh} \left( \mW \vc_j + \vr_j \odot (\mU \vs_j^{\prime} ) \right) ,\\
195 | \vr_j =& \sigma \left( \mW_r \vc_j + \mU_r \vs_j^{\prime} \right), \\
196 | \vz_j =& \sigma \left( \mW_z \vc_j + \mU_z \vs_j^{\prime} \right),
197 | \end{align}
198 | 
199 | \noindent where, similarly, $\underline{\vs}_j$ is the proposal hidden state, and
200 | $\vr_j$ and $\vz_j$ are the reset and update gate activations, with the
201 | trained model parameters $\mW, \mU, \mW_r, \mU_r,
202 | \mW_z, \mU_z$.
203 | 
204 | \end{document}
205 | 
--------------------------------------------------------------------------------
/session0/data_iterator.py:
--------------------------------------------------------------------------------
1 | import cPickle as pkl
2 | import gzip
3 | 
4 | 
5 | class TextIterator:
6 |     def __init__(self, source,
7 |                  source_dict,
8 |                  batch_size=128,
9 |                  maxlen=100,
10 |                  n_words_source=-1):
11 |         if source.endswith('.gz'):
12 |             self.source = gzip.open(source, 'r')
13 |         else:
14 |             self.source = open(source, 'r')
15 |         with open(source_dict, 'rb') as f:
16 |             self.source_dict = pkl.load(f)
17 | 
18 |         self.batch_size = batch_size
19 |         self.maxlen = maxlen
20 | 
21 |         self.n_words_source = n_words_source
22 | 
23 |         self.end_of_data = False
24 | 
25 |     def __iter__(self):
26 |         return self
27 | 
28 |     def reset(self):
29 |         self.source.seek(0)
30 | 
31 |     def next(self):
32 |         if self.end_of_data:
33 |             self.end_of_data = False
34 |             self.reset()
35 |             raise StopIteration
36 | 
37 |         source = []
38 | 
39 |         try:
40 | 
41 |             # actual work here
42 |             while True:
43 |                 ss = self.source.readline()
44 |                 if ss == "":
45 |                     raise IOError
46 |                 ss = ss.strip().split()
47 |                 ss = [self.source_dict[w] if w in self.source_dict else 1
48 |                       for w in ss]
49 |                 if self.n_words_source > 0:
50 |                     ss = [w if w < self.n_words_source else 1 for w in ss]
51 | 
52 |                 if len(ss) > self.maxlen:
53 |                     continue
54 | 
55 |                 source.append(ss)
56 | 
57 |                 if len(source) >= self.batch_size:
58 |                     break
59 |         except IOError:
60 |             self.end_of_data = True
61 | 
62 |         if len(source) <= 0:
63 |             self.end_of_data = False
64 |             self.reset()
65 |             raise StopIteration
66 | 
67 |         return source
--------------------------------------------------------------------------------
/session0/lm.py:
--------------------------------------------------------------------------------
1 | '''
2 | Build a simple neural
language model using GRU units 3 | ''' 4 | import theano 5 | import theano.tensor as tensor 6 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 7 | 8 | import cPickle as pkl 9 | import ipdb 10 | import numpy 11 | import copy 12 | 13 | import os 14 | import warnings 15 | import sys 16 | import time 17 | 18 | from collections import OrderedDict 19 | 20 | from data_iterator import TextIterator 21 | 22 | profile = False 23 | 24 | 25 | # push parameters to Theano shared variables 26 | def zipp(params, tparams): 27 | for kk, vv in params.iteritems(): 28 | tparams[kk].set_value(vv) 29 | 30 | 31 | # pull parameters from Theano shared variables 32 | def unzip(zipped): 33 | new_params = OrderedDict() 34 | for kk, vv in zipped.iteritems(): 35 | new_params[kk] = vv.get_value() 36 | return new_params 37 | 38 | 39 | # get the list of parameters: Note that tparams must be OrderedDict 40 | def itemlist(tparams): 41 | return [vv for kk, vv in tparams.iteritems()] 42 | 43 | 44 | # dropout 45 | def dropout_layer(state_before, use_noise, trng): 46 | proj = tensor.switch( 47 | use_noise, 48 | state_before * trng.binomial(state_before.shape, p=0.5, n=1, 49 | dtype=state_before.dtype), 50 | state_before * 0.5) 51 | return proj 52 | 53 | 54 | # make prefix-appended name 55 | def _p(pp, name): 56 | return '%s_%s' % (pp, name) 57 | 58 | 59 | # initialize Theano shared variables according to the initial parameters 60 | def init_tparams(params): 61 | tparams = OrderedDict() 62 | for kk, pp in params.iteritems(): 63 | tparams[kk] = theano.shared(params[kk], name=kk) 64 | return tparams 65 | 66 | 67 | # load parameters 68 | def load_params(path, params): 69 | pp = numpy.load(path) 70 | for kk, vv in params.iteritems(): 71 | if kk not in pp: 72 | warnings.warn('%s is not in the archive' % kk) 73 | continue 74 | params[kk] = pp[kk] 75 | 76 | return params 77 | 78 | 79 | # layers: 'name': ('parameter initializer', 'feedforward') 80 | layers = {'ff': ('param_init_fflayer', 'fflayer'), 81 | 'gru': ('param_init_gru', 'gru_layer'), 82 | } 83 | 84 | 85 | def get_layer(name): 86 | fns = layers[name] 87 | return (eval(fns[0]), eval(fns[1])) 88 | 89 | 90 | # orthogonal initialization for weights 91 | # see Saxe et al. ICLR'14 92 | def ortho_weight(ndim): 93 | W = numpy.random.randn(ndim, ndim) 94 | u, s, v = numpy.linalg.svd(W) 95 | return u.astype('float32') 96 | 97 | 98 | # weight initializer, normal by default 99 | def norm_weight(nin, nout=None, scale=0.01, ortho=True): 100 | if nout is None: 101 | nout = nin 102 | if nout == nin and ortho: 103 | W = ortho_weight(nin) 104 | else: 105 | W = scale * numpy.random.randn(nin, nout) 106 | return W.astype('float32') 107 | 108 | 109 | def tanh(x): 110 | return tensor.tanh(x) 111 | 112 | 113 | def linear(x): 114 | return x 115 | 116 | 117 | def concatenate(tensor_list, axis=0): 118 | """ 119 | Alternative implementation of `theano.tensor.concatenate`. 120 | This function does exactly the same thing, but contrary to Theano's own 121 | implementation, the gradient is implemented on the GPU. 122 | Backpropagating through `theano.tensor.concatenate` yields slowdowns 123 | because the inverse operation (splitting) needs to be done on the CPU. 124 | This implementation does not have that problem. 125 | :usage: 126 | >>> x, y = theano.tensor.matrices('x', 'y') 127 | >>> c = concatenate([x, y], axis=1) 128 | :parameters: 129 | - tensor_list : list 130 | list of Theano tensor expressions that should be concatenated. 
131 |     - axis : int
132 |         the tensors will be joined along this axis.
133 |     :returns:
134 |     - out : tensor
135 |         the concatenated tensor expression.
136 |     """
137 |     concat_size = sum(tt.shape[axis] for tt in tensor_list)
138 | 
139 |     output_shape = ()
140 |     for k in range(axis):
141 |         output_shape += (tensor_list[0].shape[k],)
142 |     output_shape += (concat_size,)
143 |     for k in range(axis + 1, tensor_list[0].ndim):
144 |         output_shape += (tensor_list[0].shape[k],)
145 | 
146 |     out = tensor.zeros(output_shape)
147 |     offset = 0
148 |     for tt in tensor_list:
149 |         indices = ()
150 |         for k in range(axis):
151 |             indices += (slice(None),)
152 |         indices += (slice(offset, offset + tt.shape[axis]),)
153 |         for k in range(axis + 1, tensor_list[0].ndim):
154 |             indices += (slice(None),)
155 | 
156 |         out = tensor.set_subtensor(out[indices], tt)
157 |         offset += tt.shape[axis]
158 | 
159 |     return out
160 | 
161 | 
162 | # batch preparation, returns padded batch and mask
163 | def prepare_data(seqs_x, maxlen=None, n_words=30000):
164 |     # x: a list of sentences
165 |     lengths_x = [len(s) for s in seqs_x]
166 | 
167 |     # filter according to maxlen
168 |     if maxlen is not None:
169 |         new_seqs_x = []
170 |         new_lengths_x = []
171 |         for l_x, s_x in zip(lengths_x, seqs_x):
172 |             if l_x < maxlen:
173 |                 new_seqs_x.append(s_x)
174 |                 new_lengths_x.append(l_x)
175 |         lengths_x = new_lengths_x
176 |         seqs_x = new_seqs_x
177 | 
178 |         if len(lengths_x) < 1:
179 |             return None, None
180 | 
181 |     n_samples = len(seqs_x)
182 |     maxlen_x = numpy.max(lengths_x) + 1
183 | 
184 |     x = numpy.zeros((maxlen_x, n_samples)).astype('int64')
185 |     x_mask = numpy.zeros((maxlen_x, n_samples)).astype('float32')
186 |     for idx, s_x in enumerate(seqs_x):
187 |         x[:lengths_x[idx], idx] = s_x
188 |         x_mask[:lengths_x[idx]+1, idx] = 1.
189 | 190 | return x, x_mask 191 | 192 | 193 | # feedforward layer: affine transformation + point-wise nonlinearity 194 | def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None, 195 | ortho=True): 196 | if nin is None: 197 | nin = options['dim_proj'] 198 | if nout is None: 199 | nout = options['dim_proj'] 200 | params[_p(prefix, 'W')] = norm_weight(nin, nout, scale=0.01, ortho=ortho) 201 | params[_p(prefix, 'b')] = numpy.zeros((nout,)).astype('float32') 202 | 203 | return params 204 | 205 | 206 | def fflayer(tparams, state_below, options, prefix='rconv', 207 | activ='lambda x: tensor.tanh(x)', **kwargs): 208 | return eval(activ)( 209 | tensor.dot(state_below, tparams[_p(prefix, 'W')]) + 210 | tparams[_p(prefix, 'b')]) 211 | 212 | 213 | # GRU layer 214 | def param_init_gru(options, params, prefix='gru', nin=None, dim=None): 215 | 216 | if nin is None: 217 | nin = options['dim_proj'] 218 | if dim is None: 219 | dim = options['dim_proj'] 220 | 221 | # embedding to gates transformation weights, biases 222 | W = numpy.concatenate([norm_weight(nin, dim), 223 | norm_weight(nin, dim)], axis=1) 224 | params[_p(prefix, 'W')] = W 225 | params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32') 226 | 227 | # recurrent transformation weights for gates 228 | U = numpy.concatenate([ortho_weight(dim), 229 | ortho_weight(dim)], axis=1) 230 | params[_p(prefix, 'U')] = U 231 | 232 | # embedding to hidden state proposal weights, biases 233 | Wx = norm_weight(nin, dim) 234 | params[_p(prefix, 'Wx')] = Wx 235 | params[_p(prefix, 'bx')] = numpy.zeros((dim,)).astype('float32') 236 | 237 | # recurrent transformation weights for hidden state proposal 238 | Ux = ortho_weight(dim) 239 | params[_p(prefix, 'Ux')] = Ux 240 | 241 | return params 242 | 243 | 244 | def gru_layer(tparams, state_below, options, prefix='gru', 245 | mask=None, one_step=False, init_state=None, **kwargs): 246 | if one_step: 247 | assert init_state, 'previous state must be provided' 248 | 249 | nsteps = state_below.shape[0] 250 | 251 | if state_below.ndim == 3: 252 | n_samples = state_below.shape[1] 253 | else: 254 | n_samples = state_below.shape[0] 255 | 256 | dim = tparams[_p(prefix, 'Ux')].shape[1] 257 | 258 | if mask is None: 259 | mask = tensor.alloc(1., state_below.shape[0], 1) 260 | 261 | # utility function to slice a tensor 262 | def _slice(_x, n, dim): 263 | if _x.ndim == 3: 264 | return _x[:, :, n*dim:(n+1)*dim] 265 | return _x[:, n*dim:(n+1)*dim] 266 | 267 | # state_below is the input word embeddings 268 | # input to the gates, concatenated 269 | state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \ 270 | tparams[_p(prefix, 'b')] 271 | # input to compute the hidden state proposal 272 | state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + \ 273 | tparams[_p(prefix, 'bx')] 274 | 275 | # step function to be used by scan 276 | # arguments | sequences |outputs-info| non-seqs 277 | def _step_slice(m_, x_, xx_, h_, U, Ux): 278 | preact = tensor.dot(h_, U) 279 | preact += x_ 280 | 281 | # reset and update gates 282 | r = tensor.nnet.sigmoid(_slice(preact, 0, dim)) 283 | u = tensor.nnet.sigmoid(_slice(preact, 1, dim)) 284 | 285 | # compute the hidden state proposal 286 | preactx = tensor.dot(h_, Ux) 287 | preactx = preactx * r 288 | preactx = preactx + xx_ 289 | 290 | # hidden state proposal 291 | h = tensor.tanh(preactx) 292 | 293 | # leaky integrate and obtain next hidden state 294 | h = u * h_ + (1. - u) * h 295 | h = m_[:, None] * h + (1. 
- m_)[:, None] * h_ 296 | 297 | return h 298 | 299 | # prepare scan arguments 300 | seqs = [mask, state_below_, state_belowx] 301 | _step = _step_slice 302 | shared_vars = [tparams[_p(prefix, 'U')], 303 | tparams[_p(prefix, 'Ux')]] 304 | 305 | # set initial state to all zeros 306 | if init_state is None: 307 | init_state = tensor.unbroadcast(tensor.alloc(0., n_samples, dim), 0) 308 | 309 | if one_step: # sampling 310 | rval = _step(*(seqs+[init_state]+shared_vars)) 311 | else: # training 312 | rval, updates = theano.scan(_step, 313 | sequences=seqs, 314 | outputs_info=[init_state], 315 | non_sequences=shared_vars, 316 | name=_p(prefix, '_layers'), 317 | n_steps=nsteps, 318 | profile=profile, 319 | strict=True) 320 | rval = [rval] 321 | return rval 322 | 323 | 324 | # initialize all parameters 325 | def init_params(options): 326 | params = OrderedDict() 327 | # embedding 328 | params['Wemb'] = norm_weight(options['n_words'], options['dim_word']) 329 | params = get_layer(options['encoder'])[0](options, params, 330 | prefix='encoder', 331 | nin=options['dim_word'], 332 | dim=options['dim']) 333 | # readout 334 | params = get_layer('ff')[0](options, params, prefix='ff_logit_lstm', 335 | nin=options['dim'], nout=options['dim_word'], 336 | ortho=False) 337 | params = get_layer('ff')[0](options, params, prefix='ff_logit_prev', 338 | nin=options['dim_word'], 339 | nout=options['dim_word'], ortho=False) 340 | params = get_layer('ff')[0](options, params, prefix='ff_logit', 341 | nin=options['dim_word'], 342 | nout=options['n_words']) 343 | 344 | return params 345 | 346 | 347 | # build a training model 348 | def build_model(tparams, options): 349 | opt_ret = dict() 350 | 351 | trng = RandomStreams(1234) 352 | use_noise = theano.shared(numpy.float32(0.)) 353 | 354 | # description string: #words x #samples 355 | x = tensor.matrix('x', dtype='int64') 356 | x_mask = tensor.matrix('x_mask', dtype='float32') 357 | 358 | n_timesteps = x.shape[0] 359 | n_samples = x.shape[1] 360 | 361 | # input 362 | emb = tparams['Wemb'][x.flatten()] 363 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word']]) 364 | emb_shifted = tensor.zeros_like(emb) 365 | emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1]) 366 | emb = emb_shifted 367 | opt_ret['emb'] = emb 368 | 369 | # pass through gru layer, recurrence here 370 | proj = get_layer(options['encoder'])[1](tparams, emb, options, 371 | prefix='encoder', 372 | mask=x_mask) 373 | proj_h = proj[0] 374 | opt_ret['proj_h'] = proj_h 375 | 376 | # compute word probabilities 377 | logit_lstm = get_layer('ff')[1](tparams, proj_h, options, 378 | prefix='ff_logit_lstm', activ='linear') 379 | logit_prev = get_layer('ff')[1](tparams, emb, options, 380 | prefix='ff_logit_prev', activ='linear') 381 | logit = tensor.tanh(logit_lstm+logit_prev) 382 | logit = get_layer('ff')[1](tparams, logit, options, prefix='ff_logit', 383 | activ='linear') 384 | logit_shp = logit.shape 385 | probs = tensor.nnet.softmax( 386 | logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]])) 387 | 388 | # cost 389 | x_flat = x.flatten() 390 | x_flat_idx = tensor.arange(x_flat.shape[0]) * options['n_words'] + x_flat 391 | cost = -tensor.log(probs.flatten()[x_flat_idx]) 392 | cost = cost.reshape([x.shape[0], x.shape[1]]) 393 | opt_ret['cost_per_sample'] = cost 394 | cost = (cost * x_mask).sum(0) 395 | 396 | return trng, use_noise, x, x_mask, opt_ret, cost 397 | 398 | 399 | # build a sampler 400 | def build_sampler(tparams, options, trng): 401 | # x: 1 x 1 402 | y = tensor.vector('y_sampler', 
dtype='int64')
403 |     init_state = tensor.matrix('init_state', dtype='float32')
404 | 
405 |     # if it's the first word, emb should be all zero
406 |     emb = tensor.switch(y[:, None] < 0,
407 |                         tensor.alloc(0., 1, tparams['Wemb'].shape[1]),
408 |                         tparams['Wemb'][y])
409 | 
410 |     # apply one step of gru layer
411 |     proj = get_layer(options['encoder'])[1](tparams, emb, options,
412 |                                             prefix='encoder',
413 |                                             mask=None,
414 |                                             one_step=True,
415 |                                             init_state=init_state)
416 |     next_state = proj[0]
417 | 
418 |     # compute the output probability dist and sample
419 |     logit_lstm = get_layer('ff')[1](tparams, next_state, options,
420 |                                     prefix='ff_logit_lstm', activ='linear')
421 |     logit_prev = get_layer('ff')[1](tparams, emb, options,
422 |                                     prefix='ff_logit_prev', activ='linear')
423 |     logit = tensor.tanh(logit_lstm+logit_prev)
424 |     logit = get_layer('ff')[1](tparams, logit, options,
425 |                                prefix='ff_logit', activ='linear')
426 |     next_probs = tensor.nnet.softmax(logit)
427 |     next_sample = trng.multinomial(pvals=next_probs).argmax(1)
428 | 
429 |     # next word probability
430 |     print 'Building f_next..',
431 |     inps = [y, init_state]
432 |     outs = [next_probs, next_sample, next_state]
433 |     f_next = theano.function(inps, outs, name='f_next', profile=profile)
434 |     print 'Done'
435 | 
436 |     return f_next
437 | 
438 | 
439 | # generate sample
440 | def gen_sample(tparams, f_next, options, trng=None, maxlen=30, argmax=False):
441 | 
442 |     sample = []
443 |     sample_score = 0
444 | 
445 |     # initial token is indicated by a -1 and initial state is zero
446 |     next_w = -1 * numpy.ones((1,)).astype('int64')
447 |     next_state = numpy.zeros((1, options['dim'])).astype('float32')
448 | 
449 |     for ii in xrange(maxlen):
450 |         inps = [next_w, next_state]
451 |         ret = f_next(*inps)
452 |         next_p, next_w, next_state = ret[0], ret[1], ret[2]
453 | 
454 |         if argmax:
455 |             nw = next_p[0].argmax()
456 |         else:
457 |             nw = next_w[0]
458 |         sample.append(nw)
459 |         sample_score += next_p[0, nw]
460 |         if nw == 0:
461 |             break
462 | 
463 |     return sample, sample_score
464 | 
465 | 
466 | # calculate the log probabilities on a given corpus using the language model
467 | def pred_probs(f_log_probs, prepare_data, options, iterator, verbose=True):
468 |     probs = []
469 | 
470 |     n_done = 0
471 | 
472 |     for x in iterator:
473 |         n_done += len(x)
474 | 
475 |         x, x_mask = prepare_data(x, n_words=options['n_words'])
476 | 
477 |         pprobs = f_log_probs(x, x_mask)
478 |         for pp in pprobs:
479 |             probs.append(pp)
480 | 
481 |         if numpy.isnan(numpy.mean(probs)):
482 |             ipdb.set_trace()
483 | 
484 |         if verbose:
485 |             print >>sys.stderr, '%d samples computed' % (n_done)
486 | 
487 |     return numpy.array(probs)
488 | 
489 | 
490 | # optimizers
491 | # name(hyperp, tparams, grads, inputs (list), cost) = f_grad_shared, f_update
492 | def adam(lr, tparams, grads, inp, cost, beta1=0.9, beta2=0.999, e=1e-8):
493 | 
494 |     gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
495 |                for k, p in tparams.iteritems()]
496 |     gsup = [(gs, g) for gs, g in zip(gshared, grads)]
497 | 
498 |     f_grad_shared = theano.function(inp, cost, updates=gsup, profile=profile)
499 | 
500 |     updates = []
501 | 
502 |     t_prev = theano.shared(numpy.float32(0.))
503 |     t = t_prev + 1.
504 |     lr_t = lr * tensor.sqrt(1. - beta2**t) / (1. - beta1**t)
505 | 
506 |     for p, g in zip(tparams.values(), gshared):
507 |         m = theano.shared(p.get_value() * 0., p.name + '_mean')
508 |         v = theano.shared(p.get_value() * 0., p.name + '_variance')
509 |         m_t = beta1 * m + (1. - beta1) * g
510 |         v_t = beta2 * v + (1.
- beta2) * g**2
511 |         step = lr_t * m_t / (tensor.sqrt(v_t) + e)
512 |         p_t = p - step
513 |         updates.append((m, m_t))
514 |         updates.append((v, v_t))
515 |         updates.append((p, p_t))
516 |     updates.append((t_prev, t))  # single time-step update, outside the loop
517 | 
518 |     f_update = theano.function([lr], [], updates=updates,
519 |                                on_unused_input='ignore', profile=profile)
520 | 
521 |     return f_grad_shared, f_update
522 | 
523 | 
524 | def adadelta(lr, tparams, grads, inp, cost):
525 |     zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
526 |                                   name='%s_grad' % k)
527 |                     for k, p in tparams.iteritems()]
528 |     running_up2 = [theano.shared(p.get_value() * numpy.float32(0.),
529 |                                  name='%s_rup2' % k)
530 |                    for k, p in tparams.iteritems()]
531 |     running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
532 |                                     name='%s_rgrad2' % k)
533 |                       for k, p in tparams.iteritems()]
534 | 
535 |     zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
536 |     rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
537 |              for rg2, g in zip(running_grads2, grads)]
538 | 
539 |     f_grad_shared = theano.function(inp, cost, updates=zgup+rg2up,
540 |                                     profile=profile)
541 | 
542 |     updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
543 |              for zg, ru2, rg2 in zip(zipped_grads,
544 |                                      running_up2,
545 |                                      running_grads2)]
546 |     ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
547 |              for ru2, ud in zip(running_up2, updir)]
548 |     param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)]
549 | 
550 |     f_update = theano.function([lr], [], updates=ru2up+param_up,
551 |                                on_unused_input='ignore', profile=profile)
552 | 
553 |     return f_grad_shared, f_update
554 | 
555 | 
556 | def rmsprop(lr, tparams, grads, inp, cost):
557 |     zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
558 |                                   name='%s_grad' % k)
559 |                     for k, p in tparams.iteritems()]
560 |     running_grads = [theano.shared(p.get_value() * numpy.float32(0.),
561 |                                    name='%s_rgrad' % k)
562 |                      for k, p in tparams.iteritems()]
563 |     running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
564 |                                     name='%s_rgrad2' % k)
565 |                       for k, p in tparams.iteritems()]
566 | 
567 |     zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
568 |     rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
569 |     rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
570 |              for rg2, g in zip(running_grads2, grads)]
571 | 
572 |     f_grad_shared = theano.function(inp, cost, updates=zgup+rgup+rg2up,
573 |                                     profile=profile)
574 | 
575 |     updir = [theano.shared(p.get_value() * numpy.float32(0.),
576 |                            name='%s_updir' % k)
577 |              for k, p in tparams.iteritems()]
578 |     updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
579 |                  for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
580 |                                             running_grads2)]
581 |     param_up = [(p, p + udn[1])
582 |                 for p, udn in zip(itemlist(tparams), updir_new)]
583 |     f_update = theano.function([lr], [], updates=updir_new+param_up,
584 |                                on_unused_input='ignore', profile=profile)
585 | 
586 |     return f_grad_shared, f_update
587 | 
588 | 
589 | def sgd(lr, tparams, grads, inp, cost):
590 | 
591 |     # allocate gradients and set them all to zero
592 |     gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
593 |                for k, p in tparams.iteritems()]
594 | 
595 |     # create gradient copying list,
596 |     # from grads (tensor variable) to gshared (shared variable)
597 |     gsup = [(gs, g) for gs, g in zip(gshared, grads)]
598 | 
599 |     # compile theano function to compute cost and copy gradients
600 |     f_grad_shared = theano.function(inp, cost, updates=gsup,
601 |                                     profile=profile)
602 | 
603 |     # define the update step rule
604 |     pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)]
605 | 
606 |     # compile a function for update
607 |     f_update = theano.function([lr], [], updates=pup, profile=profile)
608 | 
609 |     return f_grad_shared, f_update
610 | 
611 | 
612 | def train(dim_word=100,  # word vector dimensionality
613 |           dim=1000,  # the number of GRU units
614 |           encoder='gru',
615 |           patience=10,  # early stopping patience
616 |           max_epochs=5000,
617 |           finish_after=10000000,  # finish after this many updates
618 |           dispFreq=100,
619 |           decay_c=0.,  # L2 weight decay penalty
620 |           lrate=0.01,
621 |           n_words=100000,  # vocabulary size
622 |           maxlen=100,  # maximum length of a sequence
623 |           optimizer='rmsprop',
624 |           batch_size=16,
625 |           valid_batch_size=16,
626 |           saveto='model.npz',
627 |           validFreq=1000,
628 |           saveFreq=1000,  # save the parameters after every saveFreq updates
629 |           sampleFreq=100,  # generate some samples after every sampleFreq
630 |           dataset='/data/lisatmp3/chokyun/wikipedia/extracted/wiki.tok.txt.gz',
631 |           valid_dataset='../data/dev/newstest2011.en.tok',
632 |           dictionary='/data/lisatmp3/chokyun/wikipedia/extracted/'
633 |                      'wiki.tok.txt.gz.pkl',
634 |           use_dropout=False,
635 |           reload_=False):
636 | 
637 |     # Model options
638 |     model_options = locals().copy()
639 | 
640 |     # load dictionary
641 |     with open(dictionary, 'rb') as f:
642 |         worddicts = pkl.load(f)
643 | 
644 |     # invert dictionary
645 |     worddicts_r = dict()
646 |     for kk, vv in worddicts.iteritems():
647 |         worddicts_r[vv] = kk
648 | 
649 |     # reload options
650 |     if reload_ and os.path.exists(saveto):
651 |         with open('%s.pkl' % saveto, 'rb') as f:
652 |             model_options = pkl.load(f)
653 | 
654 |     print 'Loading data'
655 |     train = TextIterator(dataset,
656 |                          dictionary,
657 |                          n_words_source=n_words,
658 |                          batch_size=batch_size,
659 |                          maxlen=maxlen)
660 |     valid = TextIterator(valid_dataset,
661 |                          dictionary,
662 |                          n_words_source=n_words,
663 |                          batch_size=valid_batch_size,
664 |                          maxlen=maxlen)
665 | 
666 |     print 'Building model'
667 |     params = init_params(model_options)
668 | 
669 |     # reload parameters
670 |     if reload_ and os.path.exists(saveto):
671 |         params = load_params(saveto, params)
672 | 
673 |     # create shared variables for parameters
674 |     tparams = init_tparams(params)
675 | 
676 |     # build the symbolic computational graph
677 |     trng, use_noise, \
678 |         x, x_mask, \
679 |         opt_ret, \
680 |         cost = \
681 |         build_model(tparams, model_options)
682 |     inps = [x, x_mask]
683 | 
684 |     print 'Building sampler'
685 |     f_next = build_sampler(tparams, model_options, trng)
686 | 
687 |     # before any regularizer
688 |     print 'Building f_log_probs...',
689 |     f_log_probs = theano.function(inps, cost, profile=profile)
690 |     print 'Done'
691 | 
692 |     cost = cost.mean()
693 | 
694 |     # apply L2 regularization on weights
695 |     if decay_c > 0.:
696 |         decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
697 |         weight_decay = 0.
698 | for kk, vv in tparams.iteritems(): 699 | weight_decay += (vv ** 2).sum() 700 | weight_decay *= decay_c 701 | cost += weight_decay 702 | 703 | # after any regularizer - compile the computational graph for cost 704 | print 'Building f_cost...', 705 | f_cost = theano.function(inps, cost, profile=profile) 706 | print 'Done' 707 | 708 | print 'Computing gradient...', 709 | grads = tensor.grad(cost, wrt=itemlist(tparams)) 710 | print 'Done' 711 | 712 | # compile the optimizer, the actual computational graph is compiled here 713 | lr = tensor.scalar(name='lr') 714 | print 'Building optimizers...', 715 | f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) 716 | print 'Done' 717 | 718 | print 'Optimization' 719 | 720 | history_errs = [] 721 | # reload history 722 | if reload_ and os.path.exists(saveto): 723 | history_errs = list(numpy.load(saveto)['history_errs']) 724 | best_p = None 725 | bad_count = 0 726 | 727 | if validFreq == -1: 728 | validFreq = len(train[0])/batch_size 729 | if saveFreq == -1: 730 | saveFreq = len(train[0])/batch_size 731 | if sampleFreq == -1: 732 | sampleFreq = len(train[0])/batch_size 733 | 734 | # Training loop 735 | uidx = 0 736 | estop = False 737 | bad_counter = 0 738 | for eidx in xrange(max_epochs): 739 | n_samples = 0 740 | 741 | for x in train: 742 | n_samples += len(x) 743 | uidx += 1 744 | use_noise.set_value(1.) 745 | 746 | # pad batch and create mask 747 | x, x_mask = prepare_data(x, maxlen=maxlen, n_words=n_words) 748 | 749 | if x is None: 750 | print 'Minibatch with zero sample under length ', maxlen 751 | uidx -= 1 752 | continue 753 | 754 | ud_start = time.time() 755 | 756 | # compute cost, grads and copy grads to shared variables 757 | cost = f_grad_shared(x, x_mask) 758 | 759 | # do the update on parameters 760 | f_update(lrate) 761 | 762 | ud = time.time() - ud_start 763 | 764 | # check for bad numbers 765 | if numpy.isnan(cost) or numpy.isinf(cost): 766 | print 'NaN detected' 767 | return 1. 768 | 769 | # verbose 770 | if numpy.mod(uidx, dispFreq) == 0: 771 | print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud 772 | 773 | # save the best model so far 774 | if numpy.mod(uidx, saveFreq) == 0: 775 | print 'Saving...', 776 | 777 | if best_p is not None: 778 | params = best_p 779 | else: 780 | params = unzip(tparams) 781 | numpy.savez(saveto, history_errs=history_errs, **params) 782 | pkl.dump(model_options, open('%s.pkl' % saveto, 'wb')) 783 | print 'Done' 784 | 785 | # generate some samples with the model and display them 786 | if numpy.mod(uidx, sampleFreq) == 0: 787 | # FIXME: random selection? 788 | for jj in xrange(5): 789 | sample, score = gen_sample(tparams, f_next, 790 | model_options, trng=trng, 791 | maxlen=30, argmax=False) 792 | print 'Sample ', jj, ': ', 793 | ss = sample 794 | for vv in ss: 795 | if vv == 0: 796 | break 797 | if vv in worddicts_r: 798 | print worddicts_r[vv], 799 | else: 800 | print 'UNK', 801 | print 802 | 803 | # validate model on validation set and early stop if necessary 804 | if numpy.mod(uidx, validFreq) == 0: 805 | use_noise.set_value(0.) 
806 | valid_errs = pred_probs(f_log_probs, prepare_data, 807 | model_options, valid) 808 | valid_err = valid_errs.mean() 809 | history_errs.append(valid_err) 810 | 811 | if uidx == 0 or valid_err <= numpy.array(history_errs).min(): 812 | best_p = unzip(tparams) 813 | bad_counter = 0 814 | if len(history_errs) > patience and valid_err >= \ 815 | numpy.array(history_errs)[:-patience].min(): 816 | bad_counter += 1 817 | if bad_counter > patience: 818 | print 'Early Stop!' 819 | estop = True 820 | break 821 | 822 | if numpy.isnan(valid_err): 823 | ipdb.set_trace() 824 | 825 | print 'Valid ', valid_err 826 | 827 | # finish after this many updates 828 | if uidx >= finish_after: 829 | print 'Finishing after %d iterations!' % uidx 830 | estop = True 831 | break 832 | 833 | print 'Seen %d samples' % n_samples 834 | 835 | if estop: 836 | break 837 | 838 | if best_p is not None: 839 | zipp(best_p, tparams) 840 | 841 | use_noise.set_value(0.) 842 | valid_err = pred_probs(f_log_probs, prepare_data, 843 | model_options, valid).mean() 844 | 845 | print 'Valid ', valid_err 846 | 847 | params = copy.copy(best_p) 848 | numpy.savez(saveto, zipped_params=best_p, 849 | history_errs=history_errs, 850 | **params) 851 | 852 | return valid_err 853 | 854 | 855 | if __name__ == '__main__': 856 | pass 857 | -------------------------------------------------------------------------------- /session0/train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=20 3 | #PBS -l walltime=48:00:00 4 | #PBS -N session1_default 5 | #PBS -A course 6 | #PBS -q GpuQ 7 | 8 | export THEANO_FLAGS=device=gpu,floatX=float32 9 | 10 | cd $PBS_O_WORKDIR 11 | python ./train_lm.py 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /session0/train_lm.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from lm import train 4 | 5 | 6 | def main(job_id, params): 7 | print params 8 | validerr = train( 9 | saveto=params['model'][0], 10 | reload_=params['reload'][0], 11 | dim_word=params['dim_word'][0], 12 | dim=params['dim'][0], 13 | n_words=params['n-words'][0], 14 | decay_c=params['decay-c'][0], 15 | lrate=params['learning-rate'][0], 16 | optimizer=params['optimizer'][0], 17 | maxlen=30, 18 | batch_size=32, 19 | valid_batch_size=16, 20 | validFreq=5000, 21 | dispFreq=10, 22 | saveFreq=1000, 23 | sampleFreq=1000, 24 | dataset='/ichec/work/dl4mt_data/nec_files/wiki.tok.txt.gz', 25 | valid_dataset='/ichec/work/dl4mt_data/nec_files/newstest2011.en.tok', 26 | dictionary='/ichec/work/dl4mt_data/nec_files/wiki.tok.txt.gz.pkl', 27 | use_dropout=params['use-dropout'][0]) 28 | return validerr 29 | 30 | if __name__ == '__main__': 31 | main(0, { 32 | 'model': ['/ichec/home/users/%s/models/model_session0.npz' % 33 | os.environ['USER']], 34 | 'dim_word': [512], 35 | 'dim': [1024], 36 | 'n-words': [30000], 37 | 'optimizer': ['adadelta'], 38 | 'decay-c': [0.], 39 | 'use-dropout': [False], 40 | 'learning-rate': [0.0001], 41 | 'reload': [False]}) 42 | -------------------------------------------------------------------------------- /session1/README.md: -------------------------------------------------------------------------------- 1 | Simple encoder-decoder model for machine translation 2 | 3 | -------------------------------------------------------------------------------- /session1/data_iterator.py: -------------------------------------------------------------------------------- 1 | 
import numpy 2 | 3 | import cPickle as pkl 4 | import gzip 5 | 6 | 7 | def fopen(filename, mode='r'): 8 | if filename.endswith('.gz'): 9 | return gzip.open(filename, mode) 10 | return open(filename, mode) 11 | 12 | 13 | class TextIterator: 14 | """Simple Bitext iterator.""" 15 | def __init__(self, source, target, 16 | source_dict, target_dict, 17 | batch_size=128, 18 | maxlen=100, 19 | n_words_source=-1, 20 | n_words_target=-1): 21 | self.source = fopen(source, 'r') 22 | self.target = fopen(target, 'r') 23 | with open(source_dict, 'rb') as f: 24 | self.source_dict = pkl.load(f) 25 | with open(target_dict, 'rb') as f: 26 | self.target_dict = pkl.load(f) 27 | 28 | self.batch_size = batch_size 29 | self.maxlen = maxlen 30 | 31 | self.n_words_source = n_words_source 32 | self.n_words_target = n_words_target 33 | 34 | self.source_buffer = [] 35 | self.target_buffer = [] 36 | self.k = batch_size * 20 37 | 38 | self.end_of_data = False 39 | 40 | def __iter__(self): 41 | return self 42 | 43 | def reset(self): 44 | self.source.seek(0) 45 | self.target.seek(0) 46 | 47 | def next(self): 48 | if self.end_of_data: 49 | self.end_of_data = False 50 | self.reset() 51 | raise StopIteration 52 | 53 | source = [] 54 | target = [] 55 | 56 | # fill buffer, if it's empty 57 | assert len(self.source_buffer) == len(self.target_buffer), 'Buffer size mismatch!' 58 | 59 | if len(self.source_buffer) == 0: 60 | for k_ in xrange(self.k): 61 | ss = self.source.readline() 62 | if ss == "": 63 | break 64 | tt = self.target.readline() 65 | if tt == "": 66 | break 67 | 68 | self.source_buffer.append(ss.strip().split()) 69 | self.target_buffer.append(tt.strip().split()) 70 | 71 | # sort by target buffer 72 | tlen = numpy.array([len(t) for t in self.target_buffer]) 73 | tidx = tlen.argsort() 74 | 75 | _sbuf = [self.source_buffer[i] for i in tidx] 76 | _tbuf = [self.target_buffer[i] for i in tidx] 77 | 78 | self.source_buffer = _sbuf 79 | self.target_buffer = _tbuf 80 | 81 | if len(self.source_buffer) == 0 or len(self.target_buffer) == 0: 82 | self.end_of_data = False 83 | self.reset() 84 | raise StopIteration 85 | 86 | try: 87 | 88 | # actual work here 89 | while True: 90 | 91 | # read from source file and map to word index 92 | try: 93 | ss = self.source_buffer.pop() 94 | except IndexError: 95 | break 96 | ss = [self.source_dict[w] if w in self.source_dict else 1 97 | for w in ss] 98 | if self.n_words_source > 0: 99 | ss = [w if w < self.n_words_source else 1 for w in ss] 100 | 101 | # read from source file and map to word index 102 | tt = self.target_buffer.pop() 103 | tt = [self.target_dict[w] if w in self.target_dict else 1 104 | for w in tt] 105 | if self.n_words_target > 0: 106 | tt = [w if w < self.n_words_target else 1 for w in tt] 107 | 108 | if len(ss) > self.maxlen and len(tt) > self.maxlen: 109 | continue 110 | 111 | source.append(ss) 112 | target.append(tt) 113 | 114 | if len(source) >= self.batch_size or \ 115 | len(target) >= self.batch_size: 116 | break 117 | except IOError: 118 | self.end_of_data = True 119 | 120 | if len(source) <= 0 or len(target) <= 0: 121 | self.end_of_data = False 122 | self.reset() 123 | raise StopIteration 124 | 125 | return source, target 126 | -------------------------------------------------------------------------------- /session1/encode.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Encode a source file using the encoder of a trained translation model. 
3 | '''
4 | import argparse
5 | 
6 | import numpy
7 | import cPickle as pkl
8 | 
9 | from nmt import (build_sampler, gen_sample, load_params,
10 |                  init_params, init_tparams)
11 | 
12 | from multiprocessing import Process, Queue
13 | 
14 | 
15 | def encode_model(queue, rqueue, pid, model, options):
16 | 
17 |     from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
18 |     trng = RandomStreams(1234)
19 | 
20 |     # allocate model parameters
21 |     params = init_params(options)
22 | 
23 |     # load model parameters and set theano shared variables
24 |     params = load_params(model, params)
25 |     tparams = init_tparams(params)
26 | 
27 |     # word index
28 |     f_init, f_next = build_sampler(tparams, options, trng)
29 | 
30 |     def _encode(seq):
31 |         # encode the source sentence
32 |         code = f_init(numpy.array(seq).reshape([len(seq), 1]))[1]
33 |         return code
34 | 
35 |     while True:
36 |         req = queue.get()
37 |         if req is None:
38 |             break
39 | 
40 |         idx, x = req[0], req[1]
41 |         print pid, '-', idx
42 |         cod = _encode(x)
43 | 
44 |         rqueue.put((idx, cod))
45 | 
46 |     return
47 | 
48 | 
49 | def main(model, dictionary, source_file, saveto,
50 |          n_process=5, chr_level=False):
51 | 
52 |     # load model options
53 |     with open('%s.pkl' % model, 'rb') as f:
54 |         options = pkl.load(f)
55 | 
56 |     # load source dictionary and invert
57 |     with open(dictionary, 'rb') as f:
58 |         word_dict = pkl.load(f)
59 |     word_idict = dict()
60 |     for kk, vv in word_dict.iteritems():
61 |         word_idict[vv] = kk
62 |     word_idict[0] = '<eos>'
63 |     word_idict[1] = 'UNK'
64 | 
65 |     # create input and output queues for processes
66 |     queue = Queue()
67 |     rqueue = Queue()
68 |     processes = [None] * n_process
69 |     for midx in xrange(n_process):
70 |         processes[midx] = Process(
71 |             target=encode_model,
72 |             args=(queue, rqueue, midx, model, options,))
73 |         processes[midx].start()
74 | 
75 |     def _send_jobs(fname):
76 |         with open(fname, 'r') as f:
77 |             for idx, line in enumerate(f):
78 |                 if chr_level:
79 |                     words = list(line.decode('utf-8').strip())
80 |                 else:
81 |                     words = line.strip().split()
82 |                 x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
83 |                 x = map(lambda ii: ii if ii < options['n_words'] else 1, x)
84 |                 x += [0]
85 |                 queue.put((idx, x))
86 |         return idx+1
87 | 
88 |     def _finish_processes():
89 |         for midx in xrange(n_process):
90 |             queue.put(None)
91 | 
92 |     def _retrieve_jobs(n_samples):
93 |         codes = [None] * n_samples
94 |         for idx in xrange(n_samples):
95 |             resp = rqueue.get()
96 |             codes[resp[0]] = resp[1]
97 |             if numpy.mod(idx, 10) == 0:
98 |                 print 'Sample ', (idx+1), '/', n_samples, ' Done'
99 |         return codes
100 | 
101 |     print 'Encoding ', source_file, '...'
102 | n_samples = _send_jobs(source_file) 103 | codes = numpy.array(_retrieve_jobs(n_samples)) 104 | _finish_processes() 105 | if not saveto.endswith('npy'): 106 | saveto = saveto + '.npy' 107 | numpy.save(saveto, codes) 108 | print 'Done' 109 | 110 | 111 | if __name__ == "__main__": 112 | parser = argparse.ArgumentParser() 113 | parser.add_argument('-c', action="store_true", default=False) 114 | parser.add_argument('-p', type=int, default=4) 115 | parser.add_argument('model', type=str) 116 | parser.add_argument('dictionary', type=str) 117 | parser.add_argument('source', type=str) 118 | parser.add_argument('saveto', type=str) 119 | 120 | args = parser.parse_args() 121 | 122 | main(args.model, args.dictionary, args.source, 123 | args.saveto, n_process=args.p, chr_level=args.c) 124 | -------------------------------------------------------------------------------- /session1/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=24 3 | #PBS -l walltime=4:00:00 4 | #PBS -N session1_default 5 | #PBS -A course 6 | #PBS -q ShortQ 7 | 8 | export THEANO_FLAGS=device=cpu,floatX=float32 9 | 10 | cd $PBS_O_WORKDIR 11 | python ./translate.py -n -p 20 \ 12 | $HOME/models/model_session1.npz \ 13 | $HOME/data/europarl-v7.fr-en.en.tok.pkl \ 14 | $HOME/data/europarl-v7.fr-en.fr.tok.pkl \ 15 | $HOME/data/newstest2011.en.tok \ 16 | ./newstest2011.trans.fr.tok 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /session1/train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=20 3 | #PBS -l walltime=168:00:00 4 | #PBS -N session1_default 5 | #PBS -A course 6 | #PBS -q GpuQ 7 | 8 | export THEANO_FLAGS=device=gpu,floatX=float32 9 | 10 | cd $PBS_O_WORKDIR 11 | python ./train_nmt.py 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /session1/train_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=20 3 | #PBS -l walltime=168:00:00 4 | #PBS -N session1_default 5 | #PBS -A course 6 | #PBS -q GpuQ 7 | 8 | export THEANO_FLAGS=device=gpu,floatX=float32 9 | 10 | cd $PBS_O_WORKDIR 11 | python ./train_nmt_all.py 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /session1/train_nmt.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import os 3 | 4 | from nmt import train 5 | 6 | def main(job_id, params): 7 | print params 8 | validerr = train(saveto=params['model'][0], 9 | reload_=params['reload'][0], 10 | dim_word=params['dim_word'][0], 11 | dim=params['dim'][0], 12 | n_words=params['n-words'][0], 13 | n_words_src=params['n-words'][0], 14 | decay_c=params['decay-c'][0], 15 | lrate=params['learning-rate'][0], 16 | optimizer=params['optimizer'][0], 17 | maxlen=50, 18 | batch_size=32, 19 | valid_batch_size=32, 20 | datasets=['/ichec/home/users/%s/data/europarl-v7.fr-en.en.tok'%os.environ['USER'], 21 | '/ichec/home/users/%s/data/europarl-v7.fr-en.fr.tok'%os.environ['USER']], 22 | valid_datasets=['/ichec/home/users/%s/data/newstest2011.en.tok'%os.environ['USER'], 23 | '/ichec/home/users/%s/data/newstest2011.fr.tok'%os.environ['USER']], 24 | dictionaries=['/ichec/home/users/%s/data/europarl-v7.fr-en.en.tok.pkl'%os.environ['USER'], 25 | 
'/ichec/home/users/%s/data/europarl-v7.fr-en.fr.tok.pkl'%os.environ['USER']], 26 | validFreq=5000, 27 | dispFreq=10, 28 | saveFreq=5000, 29 | sampleFreq=1000, 30 | use_dropout=params['use-dropout'][0], 31 | overwrite=False) 32 | return validerr 33 | 34 | if __name__ == '__main__': 35 | main(0, { 36 | 'model': ['/ichec/home/users/%s/models/model_session1.npz'%os.environ['USER']], 37 | 'dim_word': [500], 38 | 'dim': [1024], 39 | 'n-words': [30000], 40 | 'optimizer': ['adadelta'], 41 | 'decay-c': [0.], 42 | 'use-dropout': [False], 43 | 'learning-rate': [0.0001], 44 | 'reload': [False]}) 45 | 46 | 47 | -------------------------------------------------------------------------------- /session1/train_nmt_all.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from nmt import train 4 | 5 | 6 | def main(job_id, params): 7 | print params 8 | username = os.environ['USER'] 9 | validerr = train( 10 | saveto=params['model'][0], 11 | reload_=params['reload'][0], 12 | dim_word=params['dim_word'][0], 13 | dim=params['dim'][0], 14 | n_words=params['n-words'][0], 15 | n_words_src=params['n-words'][0], 16 | decay_c=params['decay-c'][0], 17 | lrate=params['learning-rate'][0], 18 | optimizer=params['optimizer'][0], 19 | maxlen=50, 20 | batch_size=32, 21 | valid_batch_size=32, 22 | datasets=[ 23 | '/ichec/home/users/%s/data/all.en.concat.shuf.gz' % username, 24 | '/ichec/home/users/%s/data/all.fr.concat.shuf.gz' % username], 25 | valid_datasets=[ 26 | '/ichec/home/users/%s/data/newstest2011.en.tok' % username, 27 | '/ichec/home/users/%s/data/newstest2011.fr.tok' % username], 28 | dictionaries=[ 29 | '/ichec/home/users/%s/data/all.en.concat.gz.pkl' % username, 30 | '/ichec/home/users/%s/data/all.fr.concat.gz.pkl' % username], 31 | validFreq=5000, 32 | dispFreq=10, 33 | saveFreq=5000, 34 | sampleFreq=1000, 35 | use_dropout=params['use-dropout'][0], 36 | overwrite=False) 37 | return validerr 38 | 39 | if __name__ == '__main__': 40 | main(0, { 41 | 'model': [ 42 | '/ichec/home/users/%s/models/model_session1_all.npz' % 43 | os.environ['USER']], 44 | 'dim_word': [500], 45 | 'dim': [1024], 46 | 'n-words': [30000], 47 | 'optimizer': ['adadelta'], 48 | 'decay-c': [0.], 49 | 'use-dropout': [False], 50 | 'learning-rate': [0.0001], 51 | 'reload': [False]}) 52 | -------------------------------------------------------------------------------- /session1/translate.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Translates a source file using a translation model. 
3 | '''
4 | import argparse
5 | 
6 | import numpy
7 | import cPickle as pkl
8 | 
9 | from nmt import (build_sampler, gen_sample, load_params,
10 |                  init_params, init_tparams)
11 | 
12 | from multiprocessing import Process, Queue
13 | 
14 | 
15 | def translate_model(queue, rqueue, pid, model, options, k, normalize):
16 | 
17 |     from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
18 |     from theano import shared
19 |     trng = RandomStreams(1234)
20 |     use_noise = shared(numpy.float32(0.))
21 | 
22 |     # allocate model parameters
23 |     params = init_params(options)
24 | 
25 |     # load model parameters and set theano shared variables
26 |     params = load_params(model, params)
27 |     tparams = init_tparams(params)
28 | 
29 |     # word index
30 |     f_init, f_next = build_sampler(tparams, options, trng, use_noise)
31 | 
32 |     def _translate(seq):
33 |         # sample given an input sequence and obtain scores
34 |         sample, score = gen_sample(tparams, f_init, f_next,
35 |                                    numpy.array(seq).reshape([len(seq), 1]),
36 |                                    options, trng=trng, k=k, maxlen=200,
37 |                                    stochastic=False)
38 | 
39 |         # normalize scores according to sequence lengths
40 |         if normalize:
41 |             lengths = numpy.array([len(s) for s in sample])
42 |             score = score / lengths
43 |         sidx = numpy.argmin(score)
44 |         return sample[sidx]
45 | 
46 |     while True:
47 |         req = queue.get()
48 |         if req is None:
49 |             break
50 | 
51 |         idx, x = req[0], req[1]
52 |         print pid, '-', idx
53 |         seq = _translate(x)
54 | 
55 |         rqueue.put((idx, seq))
56 | 
57 |     return
58 | 
59 | 
60 | def main(model, dictionary, dictionary_target, source_file, saveto, k=5,
61 |          normalize=False, n_process=5, chr_level=False):
62 | 
63 |     # load model options
64 |     with open('%s.pkl' % model, 'rb') as f:
65 |         options = pkl.load(f)
66 | 
67 |     # load source dictionary and invert
68 |     with open(dictionary, 'rb') as f:
69 |         word_dict = pkl.load(f)
70 |     word_idict = dict()
71 |     for kk, vv in word_dict.iteritems():
72 |         word_idict[vv] = kk
73 |     word_idict[0] = '<eos>'
74 |     word_idict[1] = 'UNK'
75 | 
76 |     # load target dictionary and invert
77 |     with open(dictionary_target, 'rb') as f:
78 |         word_dict_trg = pkl.load(f)
79 |     word_idict_trg = dict()
80 |     for kk, vv in word_dict_trg.iteritems():
81 |         word_idict_trg[vv] = kk
82 |     word_idict_trg[0] = '<eos>'
83 |     word_idict_trg[1] = 'UNK'
84 | 
85 |     # create input and output queues for processes
86 |     queue = Queue()
87 |     rqueue = Queue()
88 |     processes = [None] * n_process
89 |     for midx in xrange(n_process):
90 |         processes[midx] = Process(
91 |             target=translate_model,
92 |             args=(queue, rqueue, midx, model, options, k, normalize,))
93 |         processes[midx].start()
94 | 
95 |     # utility function
96 |     def _seqs2words(caps):
97 |         capsw = []
98 |         for cc in caps:
99 |             ww = []
100 |             for w in cc:
101 |                 if w == 0:
102 |                     break
103 |                 ww.append(word_idict_trg[w])
104 |             capsw.append(' '.join(ww))
105 |         return capsw
106 | 
107 |     def _send_jobs(fname):
108 |         with open(fname, 'r') as f:
109 |             for idx, line in enumerate(f):
110 |                 if chr_level:
111 |                     words = list(line.decode('utf-8').strip())
112 |                 else:
113 |                     words = line.strip().split()
114 |                 x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
115 |                 x = map(lambda ii: ii if ii < options['n_words'] else 1, x)
116 |                 x += [0]
117 |                 queue.put((idx, x))
118 |         return idx+1
119 | 
120 |     def _finish_processes():
121 |         for midx in xrange(n_process):
122 |             queue.put(None)
123 | 
124 |     def _retrieve_jobs(n_samples):
125 |         trans = [None] * n_samples
126 |         for idx in xrange(n_samples):
127 |             resp = rqueue.get()
128 |             trans[resp[0]] = resp[1]
129 |             if
numpy.mod(idx, 10) == 0: 130 | print 'Sample ', (idx+1), '/', n_samples, ' Done' 131 | return trans 132 | 133 | print 'Translating ', source_file, '...' 134 | n_samples = _send_jobs(source_file) 135 | trans = _seqs2words(_retrieve_jobs(n_samples)) 136 | _finish_processes() 137 | with open(saveto, 'w') as f: 138 | print >>f, '\n'.join(trans) 139 | print 'Done' 140 | 141 | 142 | if __name__ == "__main__": 143 | parser = argparse.ArgumentParser() 144 | parser.add_argument('-k', type=int, default=5) 145 | parser.add_argument('-p', type=int, default=5) 146 | parser.add_argument('-n', action="store_true", default=False) 147 | parser.add_argument('-c', action="store_true", default=False) 148 | parser.add_argument('model', type=str) 149 | parser.add_argument('dictionary', type=str) 150 | parser.add_argument('dictionary_target', type=str) 151 | parser.add_argument('source', type=str) 152 | parser.add_argument('saveto', type=str) 153 | 154 | args = parser.parse_args() 155 | 156 | main(args.model, args.dictionary, args.dictionary_target, args.source, 157 | args.saveto, k=args.k, normalize=args.n, n_process=args.p, 158 | chr_level=args.c) 159 | -------------------------------------------------------------------------------- /session2/README.md: -------------------------------------------------------------------------------- 1 | Attention-based encoder-decoder model for machine translation 2 | 3 | ## Training 4 | Change the hard-coded paths to data in `nmt.py` then run 5 | ``` 6 | THEANO_FLAGS=device=gpu,floatX=float32 python train_nmt.py 7 | ``` 8 | -------------------------------------------------------------------------------- /session2/data_iterator.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | import cPickle as pkl 4 | import gzip 5 | 6 | 7 | def fopen(filename, mode='r'): 8 | if filename.endswith('.gz'): 9 | return gzip.open(filename, mode) 10 | return open(filename, mode) 11 | 12 | 13 | class TextIterator: 14 | """Simple Bitext iterator.""" 15 | def __init__(self, source, target, 16 | source_dict, target_dict, 17 | batch_size=128, 18 | maxlen=100, 19 | n_words_source=-1, 20 | n_words_target=-1): 21 | self.source = fopen(source, 'r') 22 | self.target = fopen(target, 'r') 23 | with open(source_dict, 'rb') as f: 24 | self.source_dict = pkl.load(f) 25 | with open(target_dict, 'rb') as f: 26 | self.target_dict = pkl.load(f) 27 | 28 | self.batch_size = batch_size 29 | self.maxlen = maxlen 30 | 31 | self.n_words_source = n_words_source 32 | self.n_words_target = n_words_target 33 | 34 | self.source_buffer = [] 35 | self.target_buffer = [] 36 | self.k = batch_size * 20 37 | 38 | self.end_of_data = False 39 | 40 | def __iter__(self): 41 | return self 42 | 43 | def reset(self): 44 | self.source.seek(0) 45 | self.target.seek(0) 46 | 47 | def next(self): 48 | if self.end_of_data: 49 | self.end_of_data = False 50 | self.reset() 51 | raise StopIteration 52 | 53 | source = [] 54 | target = [] 55 | 56 | # fill buffer, if it's empty 57 | assert len(self.source_buffer) == len(self.target_buffer), 'Buffer size mismatch!' 

        if len(self.source_buffer) == 0:
            for k_ in xrange(self.k):
                ss = self.source.readline()
                if ss == "":
                    break
                tt = self.target.readline()
                if tt == "":
                    break

                self.source_buffer.append(ss.strip().split())
                self.target_buffer.append(tt.strip().split())

            # sort by target buffer
            tlen = numpy.array([len(t) for t in self.target_buffer])
            tidx = tlen.argsort()

            _sbuf = [self.source_buffer[i] for i in tidx]
            _tbuf = [self.target_buffer[i] for i in tidx]

            self.source_buffer = _sbuf
            self.target_buffer = _tbuf

        if len(self.source_buffer) == 0 or len(self.target_buffer) == 0:
            self.end_of_data = False
            self.reset()
            raise StopIteration

        try:

            # actual work here
            while True:

                # read from source buffer and map to word index
                try:
                    ss = self.source_buffer.pop()
                except IndexError:
                    break
                ss = [self.source_dict[w] if w in self.source_dict else 1
                      for w in ss]
                if self.n_words_source > 0:
                    ss = [w if w < self.n_words_source else 1 for w in ss]

                # read from target buffer and map to word index
                tt = self.target_buffer.pop()
                tt = [self.target_dict[w] if w in self.target_dict else 1
                      for w in tt]
                if self.n_words_target > 0:
                    tt = [w if w < self.n_words_target else 1 for w in tt]

                # skip sentence pairs where either side exceeds maxlen
                if len(ss) > self.maxlen or len(tt) > self.maxlen:
                    continue

                source.append(ss)
                target.append(tt)

                if len(source) >= self.batch_size or \
                        len(target) >= self.batch_size:
                    break
        except IOError:
            self.end_of_data = True

        if len(source) <= 0 or len(target) <= 0:
            self.end_of_data = False
            self.reset()
            raise StopIteration

        return source, target
--------------------------------------------------------------------------------
/session2/test.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#PBS -l nodes=1:ppn=24
#PBS -l walltime=24:00:00
#PBS -N session2_default
#PBS -A course
#PBS -q ShortQ

export THEANO_FLAGS=device=cpu,floatX=float32

cd $PBS_O_WORKDIR
python ./translate.py -n -p 8 \
    $HOME/models/model_session2.npz \
    $HOME/data/europarl-v7.fr-en.en.tok.pkl \
    $HOME/data/europarl-v7.fr-en.fr.tok.pkl \
    $HOME/data/newstest2011.en.tok \
    ./newstest2011.trans.fr.tok
--------------------------------------------------------------------------------
/session2/train.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#PBS -l nodes=1:ppn=20
#PBS -l walltime=168:00:00
#PBS -N session2_default
#PBS -A course
#PBS -q GpuQ

export THEANO_FLAGS=device=gpu,floatX=float32

cd $PBS_O_WORKDIR
python ./train_nmt.py
--------------------------------------------------------------------------------
/session2/train_all.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#PBS -l nodes=1:ppn=20
#PBS -l walltime=168:00:00
#PBS -N session2_default
#PBS -A course
#PBS -q GpuQ

export THEANO_FLAGS=device=gpu,floatX=float32

cd $PBS_O_WORKDIR
python ./train_nmt_all.py
--------------------------------------------------------------------------------
/session2/train_nmt.py:
--------------------------------------------------------------------------------
import numpy
import os

from nmt import train

def main(job_id, params):
    print params
    validerr = train(saveto=params['model'][0],
                     reload_=params['reload'][0],
                     dim_word=params['dim_word'][0],
                     dim=params['dim'][0],
                     n_words=params['n-words'][0],
                     n_words_src=params['n-words'][0],
                     decay_c=params['decay-c'][0],
                     clip_c=params['clip-c'][0],
                     lrate=params['learning-rate'][0],
                     optimizer=params['optimizer'][0],
                     patience=1000,
                     maxlen=50,
                     batch_size=32,
                     valid_batch_size=32,
                     validFreq=100,
                     dispFreq=10,
                     saveFreq=100,
                     sampleFreq=100,
                     datasets=['../data/hal/train/tok/en',
                               '../data/hal/train/tok/fr'],
                     valid_datasets=['../data/hal/dev/tok/en',
                                     '../data/hal/dev/tok/fr'],
                     dictionaries=['../data/hal/train/tok/en.pkl',
                                   '../data/hal/train/tok/fr.pkl'],
                     use_dropout=params['use-dropout'][0],
                     overwrite=False)
    return validerr

if __name__ == '__main__':
    main(0, {
        'model': ['model_hal.npz'],
        'dim_word': [512],
        'dim': [1024],
        'n-words': [30000],
        'optimizer': ['adadelta'],
        'decay-c': [0.],
        'clip-c': [1.],
        'use-dropout': [False],
        'learning-rate': [0.0001],
        'reload': [True]})
--------------------------------------------------------------------------------
/session2/train_nmt_all.py:
--------------------------------------------------------------------------------
import numpy
import os

from nmt import train

def main(job_id, params):
    print params
    validerr = train(saveto=params['model'][0],
                     reload_=params['reload'][0],
                     dim_word=params['dim_word'][0],
                     dim=params['dim'][0],
                     n_words=params['n-words'][0],
                     n_words_src=params['n-words'][0],
                     decay_c=params['decay-c'][0],
                     clip_c=params['clip-c'][0],
                     lrate=params['learning-rate'][0],
                     optimizer=params['optimizer'][0],
                     maxlen=50,
                     batch_size=32,
                     valid_batch_size=32,
                     datasets=['/ichec/home/users/%s/data/all.en.concat.shuf.gz'%os.environ['USER'],
                               '/ichec/home/users/%s/data/all.fr.concat.shuf.gz'%os.environ['USER']],
                     valid_datasets=['/ichec/home/users/%s/data/newstest2011.en.tok'%os.environ['USER'],
                                     '/ichec/home/users/%s/data/newstest2011.fr.tok'%os.environ['USER']],
                     dictionaries=['/ichec/home/users/%s/data/all.en.concat.gz.pkl'%os.environ['USER'],
                                   '/ichec/home/users/%s/data/all.fr.concat.gz.pkl'%os.environ['USER']],
                     validFreq=5000,
                     dispFreq=10,
                     saveFreq=5000,
                     sampleFreq=1000,
                     use_dropout=params['use-dropout'][0],
                     overwrite=False)
    return validerr

if __name__ == '__main__':
    main(0, {
        'model': ['/ichec/home/users/%s/models/model_session2_all.npz'%os.environ['USER']],
        'dim_word': [500],
        'dim': [1024],
        'n-words': [30000],
        'optimizer': ['adadelta'],
        'decay-c': [0.],
        'clip-c': [1.],
        'use-dropout': [False],
        'learning-rate': [0.0001],
        'reload': [False]})
--------------------------------------------------------------------------------
/session2/translate.py:
--------------------------------------------------------------------------------
'''
Translates a source file using a translation model.
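
Usage:
    python translate.py [-k beam_size] [-p n_processes] [-n] [-c] \
        model dictionary dictionary_target source saveto

-n normalizes model scores by hypothesis length and -c operates at the
character level; `dictionary` and `dictionary_target` are the pickled
vocabularies built during preprocessing.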
'''
import argparse

import numpy
import cPickle as pkl

from nmt import (build_sampler, gen_sample, load_params,
                 init_params, init_tparams)

from multiprocessing import Process, Queue


def translate_model(queue, rqueue, pid, model, options, k, normalize):

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    # build the sampling graph (f_init encodes, f_next decodes one step)
    f_init, f_next = build_sampler(tparams, options, trng, use_noise)

    def _translate(seq):
        # sample given an input sequence and obtain scores
        sample, score = gen_sample(tparams, f_init, f_next,
                                   numpy.array(seq).reshape([len(seq), 1]),
                                   options, trng=trng, k=k, maxlen=200,
                                   stochastic=False, argmax=False)

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        sidx = numpy.argmin(score)
        return sample[sidx]

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        print pid, '-', idx
        seq = _translate(x)

        rqueue.put((idx, seq))

    return


def main(model, dictionary, dictionary_target, source_file, saveto, k=5,
         normalize=False, n_process=5, chr_level=False):

    # load model options
    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    # load source dictionary and invert
    with open(dictionary, 'rb') as f:
        word_dict = pkl.load(f)
    word_idict = dict()
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # load target dictionary and invert
    with open(dictionary_target, 'rb') as f:
        word_dict_trg = pkl.load(f)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # create input and output queues for processes
    queue = Queue()
    rqueue = Queue()
    processes = [None] * n_process
    for midx in xrange(n_process):
        processes[midx] = Process(
            target=translate_model,
            args=(queue, rqueue, midx, model, options, k, normalize))
        processes[midx].start()

    # utility function
    def _seqs2words(caps):
        capsw = []
        for cc in caps:
            ww = []
            for w in cc:
                if w == 0:
                    break
                ww.append(word_idict_trg[w])
            capsw.append(' '.join(ww))
        return capsw

    def _send_jobs(fname):
        with open(fname, 'r') as f:
            for idx, line in enumerate(f):
                if chr_level:
                    words = list(line.decode('utf-8').strip())
                else:
                    words = line.strip().split()
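                # map words to indices: out-of-vocabulary words become
                # 1 (UNK), indices beyond the training vocabulary are
                # clipped to UNK, and 0 (<eos>) terminates the sequence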
                x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
                x = map(lambda ii: ii if ii < options['n_words_src'] else 1, x)
                x += [0]
                queue.put((idx, x))
        return idx+1

    def _finish_processes():
        for midx in xrange(n_process):
            queue.put(None)

    def _retrieve_jobs(n_samples):
        trans = [None] * n_samples
        for idx in xrange(n_samples):
            resp = rqueue.get()
            # results arrive out of order; index by sentence id
            trans[resp[0]] = resp[1]
            if numpy.mod(idx, 10) == 0:
                print 'Sample ', (idx+1), '/', n_samples, ' Done'
        return trans

    print 'Translating ', source_file, '...'
    n_samples = _send_jobs(source_file)
    trans = _seqs2words(_retrieve_jobs(n_samples))
    _finish_processes()
    with open(saveto, 'w') as f:
        print >>f, '\n'.join(trans)
    print 'Done'


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-k', type=int, default=5)
    parser.add_argument('-p', type=int, default=5)
    parser.add_argument('-n', action="store_true", default=False)
    parser.add_argument('-c', action="store_true", default=False)
    parser.add_argument('model', type=str)
    parser.add_argument('dictionary', type=str)
    parser.add_argument('dictionary_target', type=str)
    parser.add_argument('source', type=str)
    parser.add_argument('saveto', type=str)

    args = parser.parse_args()

    main(args.model, args.dictionary, args.dictionary_target, args.source,
         args.saveto, k=args.k, normalize=args.n, n_process=args.p,
         chr_level=args.c)
--------------------------------------------------------------------------------
/session3/README.md:
--------------------------------------------------------------------------------
Rescoring an attention-based encoder-decoder model with an RNN language model

## Training
Change the hard-coded paths to data in `nmt.py`, then run
```
THEANO_FLAGS=device=gpu,floatX=float32 python train_nmt.py
```
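
## Rescoring
A minimal sketch of the full pipeline (file names below are placeholders;
`score_nbest.sh` and `test.sh` contain the cluster versions with real paths):

```bash
# 1. decode with the translation model, keeping a 3-best list per sentence
python ./translate.py -n -p 1 -b 3 \
    model_session3.npz src_dict.pkl trg_dict.pkl input.tok nbest.txt

# 2. rescore the n-best list with the session0 language model, keeping
#    the best hypothesis per sentence (beta weights the LM score)
python ./rescore_with_lm.py -n -b 0.5 \
    model_session0.npz model_session0.npz.pkl \
    lm_dict.pkl trg_dict.pkl nbest.txt output.tok
```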
| tt = [w if w < self.n_words_target else 1 for w in tt] 74 | 75 | if len(ss) > self.maxlen and len(tt) > self.maxlen: 76 | continue 77 | 78 | source.append(ss) 79 | target.append(tt) 80 | 81 | if len(source) >= self.batch_size or \ 82 | len(target) >= self.batch_size: 83 | break 84 | except IOError: 85 | self.end_of_data = True 86 | 87 | if len(source) <= 0 or len(target) <= 0: 88 | self.end_of_data = False 89 | self.reset() 90 | raise StopIteration 91 | 92 | return source, target 93 | -------------------------------------------------------------------------------- /session3/rescore_with_lm.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import cPickle as pkl 3 | import numpy 4 | import theano 5 | 6 | from collections import OrderedDict 7 | from theano import tensor 8 | 9 | import lm 10 | 11 | profile = False 12 | 13 | 14 | # load parameters 15 | def load_params(path): 16 | pp = numpy.load(path) 17 | params = OrderedDict() 18 | for kk, vv in pp.iteritems(): 19 | params[kk] = vv 20 | 21 | return params 22 | 23 | 24 | # Loads a pickled dictionary 25 | def load_dictionary(filename): 26 | with open(filename, 'rb') as f: 27 | word_dict = pkl.load(f) 28 | return word_dict 29 | 30 | 31 | # Inverts a dictionary and ensures special tokens 32 | def invert_dictionary(word_dict): 33 | word_idict = dict() 34 | for kk, vv in word_dict.iteritems(): 35 | word_idict[vv] = kk 36 | word_idict[0] = '' 37 | word_idict[1] = 'UNK' 38 | return word_idict 39 | 40 | 41 | # initialize Theano shared variables according to the initial parameters 42 | def init_tparams(params): 43 | tparams = OrderedDict() 44 | for kk, pp in params.iteritems(): 45 | tparams[kk] = theano.shared(params[kk], name=kk) 46 | return tparams 47 | 48 | 49 | # layers: 'name': ('parameter initializer', 'feedforward') 50 | layers = {'ff': ('lm.param_init_fflayer', 'lm.fflayer'), 51 | 'gru': ('lm.param_init_gru', 'lm.gru_layer'), 52 | } 53 | 54 | 55 | # Utility function to get layer props 56 | def get_layer(name): 57 | fns = layers[name] 58 | return (eval(fns[0]), eval(fns[1])) 59 | 60 | 61 | # build a sampler 62 | def build_sampler(tparams, options): 63 | # x: 1 x 1 64 | y = tensor.vector('y_sampler', dtype='int64') 65 | init_state = tensor.matrix('init_state', dtype='float32') 66 | 67 | # if it's the first word, emb should be all zero 68 | emb = tensor.switch(y[:, None] < 0, 69 | tensor.alloc(0., 1, tparams['Wemb'].shape[1]), 70 | tparams['Wemb'][y]) 71 | proj = get_layer(options['encoder'])[1](tparams, emb, options, 72 | prefix='encoder', 73 | mask=None, 74 | one_step=True, 75 | init_state=init_state) 76 | next_state = proj[0] 77 | 78 | logit_lstm = get_layer('ff')[1](tparams, next_state, options, 79 | prefix='ff_logit_lstm', activ='linear') 80 | logit_prev = get_layer('ff')[1](tparams, emb, options, 81 | prefix='ff_logit_prev', activ='linear') 82 | logit = tensor.tanh(logit_lstm+logit_prev) 83 | logit = get_layer('ff')[1](tparams, logit, options, 84 | prefix='ff_logit', activ='linear') 85 | next_probs = tensor.nnet.softmax(logit) 86 | 87 | # next word probability 88 | print 'Building f_next..', 89 | inps = [y, init_state] 90 | outs = [next_probs, next_state] 91 | f_next = theano.function(inps, outs, name='f_next', profile=profile) 92 | print 'Done' 93 | 94 | return f_next 95 | 96 | 97 | # Scores a given sequence with the language model 98 | def score_seq(seq, f_next, options, normalize): 99 | 100 | next_w = -1 * numpy.ones((1,)).astype('int64') 101 | next_state = numpy.zeros((1, 
options['dim'])).astype('float32') 102 | 103 | seq_len = len(seq) 104 | sample_score = 0 105 | for ii in xrange(seq_len): 106 | inps = [next_w, next_state] 107 | ret = f_next(*inps) 108 | next_p, next_state = ret[0], ret[1] 109 | 110 | # accumulate nll for each token 111 | sample_score -= numpy.log(next_p[0, seq[ii]]) 112 | 113 | if normalize: 114 | sample_score /= seq_len 115 | 116 | return sample_score 117 | 118 | 119 | # Linearly interpolate between two scores using beta 120 | def shallow_fusion(score_lm, score_tm, beta, convex_comb): 121 | if convex_comb: 122 | return (1 - beta) * score_tm + (beta * score_lm) 123 | return score_tm + (beta * score_lm) 124 | 125 | 126 | def main(model, model_options, dictionary_lm, dictionary_tm, 127 | source, saveto, normalize=False, chr_level=False, 128 | beta=0.5, convex_comb=False): 129 | 130 | # load model options 131 | model_options = pkl.load(open(model_options)) 132 | 133 | # reload parameters 134 | print 'Loading language model..', 135 | params = load_params(model) 136 | tparams = init_tparams(params) 137 | print 'Done' 138 | 139 | print 'Loading LM dictionary..', 140 | word_dict_lm = load_dictionary(dictionary_lm) 141 | print 'Done' 142 | 143 | print 'Loading TM dictionary..', 144 | word_dict_tm = load_dictionary(dictionary_tm) 145 | word_idict_tm = invert_dictionary(word_dict_tm) 146 | print 'Done' 147 | 148 | f_next = build_sampler(tparams, model_options) 149 | 150 | # Create a cross dictionary from tm to lm 151 | tm2lm_idx = {} 152 | for idx, word in word_idict_tm.items(): 153 | tm2lm_idx[idx] = word_dict_lm.get(word, 1) 154 | tm2lm_idx[0] = 0 # 155 | tm2lm_idx[1] = 1 # UNK 156 | 157 | # Iterate over the n-best list generated by TM 158 | print 'Rescoring..', 159 | new_trans = [] 160 | nbest_idx = 0 161 | with open(source, 'r') as f: 162 | scores_in_nbest = [] 163 | trans_in_nbest = [] 164 | for idx, line in enumerate(f): 165 | line_idx, trans, score_tm = line.strip().split('|||') 166 | if chr_level: 167 | words = list(trans.decode('utf-8').strip()) 168 | else: 169 | words = line.strip().split() 170 | x = map(lambda w: word_dict_tm[w] 171 | if w in word_dict_tm else 1, words) 172 | x = map(lambda ii: ii if ii < model_options['n_words'] else 1, x) 173 | x += [0] 174 | 175 | # Score the sequence with LM 176 | x_lm = [tm2lm_idx[xx] for xx in x] 177 | score_lm = score_seq(x_lm, f_next, model_options, normalize) 178 | 179 | # Take linear interpolation with Beta 180 | new_score = shallow_fusion(score_lm, float(score_tm), 181 | beta, convex_comb) 182 | if int(line_idx) > nbest_idx: 183 | new_trans.append( 184 | trans_in_nbest[numpy.argmin(scores_in_nbest)]) 185 | 186 | scores_in_nbest = [] 187 | trans_in_nbest = [] 188 | nbest_idx += 1 189 | else: 190 | scores_in_nbest.append(new_score) 191 | trans_in_nbest.append(trans) 192 | 193 | print 'Done' 194 | print 'Saving to %s' % saveto 195 | with open(saveto, 'w') as f: 196 | print >>f, '\n'.join(new_trans) 197 | print 'Done' 198 | return 199 | 200 | 201 | if __name__ == "__main__": 202 | parser = argparse.ArgumentParser() 203 | parser.add_argument('-b', '--beta', type=float, default=1., 204 | help="Weight for language model score") 205 | parser.add_argument('-n', action="store_true", default=False, 206 | help="Normalize wrt sequence length") 207 | parser.add_argument('-c', action="store_true", default=False, 208 | help="Character level") 209 | parser.add_argument('-x', action="store_true", default=False, 210 | help="Take convex combination using beta") 211 | parser.add_argument('model', type=str) 212 | 
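# For example (with assumed numbers): if score_tm = 2.1 and
# score_lm = 3.4 (negative log-probabilities, lower is better), then
# beta = 0.5 gives 2.1 + 0.5 * 3.4 = 3.8 for the default combination
# and 0.5 * 2.1 + 0.5 * 3.4 = 2.75 for the convex combination.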

def main(model, model_options, dictionary_lm, dictionary_tm,
         source, saveto, normalize=False, chr_level=False,
         beta=0.5, convex_comb=False):

    # load model options
    model_options = pkl.load(open(model_options))

    # reload parameters
    print 'Loading language model..',
    params = load_params(model)
    tparams = init_tparams(params)
    print 'Done'

    print 'Loading LM dictionary..',
    word_dict_lm = load_dictionary(dictionary_lm)
    print 'Done'

    print 'Loading TM dictionary..',
    word_dict_tm = load_dictionary(dictionary_tm)
    word_idict_tm = invert_dictionary(word_dict_tm)
    print 'Done'

    f_next = build_sampler(tparams, model_options)

    # Create a cross dictionary from tm to lm
    tm2lm_idx = {}
    for idx, word in word_idict_tm.items():
        tm2lm_idx[idx] = word_dict_lm.get(word, 1)
    tm2lm_idx[0] = 0  # <eos>
    tm2lm_idx[1] = 1  # UNK

    # Iterate over the n-best list generated by TM
    print 'Rescoring..',
    new_trans = []
    nbest_idx = 0
    with open(source, 'r') as f:
        scores_in_nbest = []
        trans_in_nbest = []
        for idx, line in enumerate(f):
            line_idx, trans, score_tm = line.strip().split('|||')
            # tokenize the hypothesis itself, not the whole n-best line
            if chr_level:
                words = list(trans.decode('utf-8').strip())
            else:
                words = trans.strip().split()
            x = map(lambda w: word_dict_tm[w]
                    if w in word_dict_tm else 1, words)
            x = map(lambda ii: ii if ii < model_options['n_words'] else 1, x)
            x += [0]

            # Score the sequence with LM
            x_lm = [tm2lm_idx[xx] for xx in x]
            score_lm = score_seq(x_lm, f_next, model_options, normalize)

            # Take linear interpolation with beta
            new_score = shallow_fusion(score_lm, float(score_tm),
                                       beta, convex_comb)

            # a new line index marks the start of the next n-best group:
            # keep the best (lowest-scoring) hypothesis of the finished one
            if int(line_idx) > nbest_idx:
                new_trans.append(
                    trans_in_nbest[numpy.argmin(scores_in_nbest)])
                scores_in_nbest = []
                trans_in_nbest = []
                nbest_idx += 1

            # record the current hypothesis as well
            scores_in_nbest.append(new_score)
            trans_in_nbest.append(trans)

    # keep the best hypothesis of the final group
    if len(trans_in_nbest) > 0:
        new_trans.append(trans_in_nbest[numpy.argmin(scores_in_nbest)])

    print 'Done'
    print 'Saving to %s' % saveto
    with open(saveto, 'w') as f:
        print >>f, '\n'.join(new_trans)
    print 'Done'
    return


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-b', '--beta', type=float, default=1.,
                        help="Weight for language model score")
    parser.add_argument('-n', action="store_true", default=False,
                        help="Normalize wrt sequence length")
    parser.add_argument('-c', action="store_true", default=False,
                        help="Character level")
    parser.add_argument('-x', action="store_true", default=False,
                        help="Take convex combination using beta")
    parser.add_argument('model', type=str)
    parser.add_argument('model_options', type=str)
    parser.add_argument('dictionary_lm', type=str,
                        help='Dictionary of language model')
    parser.add_argument('dictionary_tm', type=str,
                        help='Target side dictionary of translation model')
    parser.add_argument('source', type=str)
    parser.add_argument('saveto', type=str)

    args = parser.parse_args()

    main(args.model, args.model_options,
         args.dictionary_lm, args.dictionary_tm,
         args.source, args.saveto,
         normalize=args.n, chr_level=args.c, beta=args.beta,
         convex_comb=args.x)
--------------------------------------------------------------------------------
/session3/score_nbest.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#PBS -l nodes=1:ppn=20
#PBS -l walltime=48:00:00
#PBS -N session3_default
#PBS -A course
#PBS -q GpuQ

export THEANO_FLAGS=device=cpu,floatX=float32

cd $PBS_O_WORKDIR
python ./rescore_with_lm.py -n -b 0.5 \
    ${HOME}/models/model_session0.npz \
    ${HOME}/models/model_session0.npz.pkl \
    ${HOME}/data/wiki.tok.txt.gz.pkl \
    ${HOME}/data/europarl-v7.fr-en.en.tok.pkl \
    ./newstest2011.trans.en.tok \
    ./newstest2011.trans.en.tok.rescored
--------------------------------------------------------------------------------
/session3/test.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#PBS -l nodes=1:ppn=24
#PBS -l walltime=4:00:00
#PBS -N session3_default
#PBS -A course
#PBS -q ShortQ

export THEANO_FLAGS=device=cpu,floatX=float32

cd $PBS_O_WORKDIR
python ./translate.py -n -p 1 -b 3 \
    ${HOME}/models/model_session3.npz \
    ${HOME}/data/europarl-v7.fr-en.fr.tok.pkl \
    ${HOME}/data/europarl-v7.fr-en.en.tok.pkl \
    ${HOME}/newstest2011.fr.tok \
    ./newstest2011.trans.en.tok
--------------------------------------------------------------------------------
/session3/train.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#PBS -l nodes=1:ppn=20
#PBS -l walltime=168:00:00
#PBS -N session3_default
#PBS -A course
#PBS -q GpuQ

export THEANO_FLAGS=device=gpu,floatX=float32

cd $PBS_O_WORKDIR
python ./train_nmt.py
--------------------------------------------------------------------------------
/session3/train_all.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#PBS -l nodes=1:ppn=20
#PBS -l walltime=168:00:00
#PBS -N session3_default
#PBS -A course
#PBS -q GpuQ

export THEANO_FLAGS=device=gpu,floatX=float32

cd $PBS_O_WORKDIR
python ./train_nmt_all.py
--------------------------------------------------------------------------------
/session3/train_nmt.py:
--------------------------------------------------------------------------------
import numpy
import os

from nmt import train

def main(job_id, params):
    print params
    basedir = '/data/lisatmp3/firatorh/nmt/europarlv7'
    validerr = train(saveto=params['model'][0],
                     reload_=params['reload'][0],
                     dim_word=params['dim_word'][0],
                     dim=params['dim'][0],
                     n_words=params['n-words'][0],
                     n_words_src=params['n-words'][0],
                     decay_c=params['decay-c'][0],
                     clip_c=params['clip-c'][0],
                     lrate=params['learning-rate'][0],
                     optimizer=params['optimizer'][0],
                     maxlen=15,
                     batch_size=32,
                     valid_batch_size=32,
                     datasets=['%s/europarl-v7.fr-en.fr.tok'%basedir,
                               '%s/europarl-v7.fr-en.en.tok'%basedir],
                     valid_datasets=['%s/newstest2011.fr.tok'%basedir,
                                     '%s/newstest2011.en.tok'%basedir],
                     dictionaries=['%s/europarl-v7.fr-en.fr.tok.pkl'%basedir,
                                   '%s/europarl-v7.fr-en.en.tok.pkl'%basedir],
                     validFreq=500000,
                     dispFreq=1,
                     saveFreq=100,
                     sampleFreq=50,
                     use_dropout=params['use-dropout'][0],
                     overwrite=False)
    return validerr

if __name__ == '__main__':
    basedir = '/data/lisatmp3/firatorh/nmt/europarlv7'
    main(0, {
        'model': ['%s/models/model_session3.npz'%basedir],
        'dim_word': [150],
        'dim': [124],
        'n-words': [3000],
        'optimizer': ['adadelta'],
        'decay-c': [0.],
        'clip-c': [1.],
        'use-dropout': [False],
        'learning-rate': [0.0001],
        'reload': [False]})
--------------------------------------------------------------------------------
/session3/train_nmt_all.py:
--------------------------------------------------------------------------------
import numpy
import os

from nmt import train

def main(job_id, params):
    print params
    validerr = train(saveto=params['model'][0],
                     reload_=params['reload'][0],
                     dim_word=params['dim_word'][0],
                     dim=params['dim'][0],
                     n_words=params['n-words'][0],
                     n_words_src=params['n-words'][0],
                     decay_c=params['decay-c'][0],
                     clip_c=params['clip-c'][0],
                     lrate=params['learning-rate'][0],
                     optimizer=params['optimizer'][0],
                     maxlen=50,
                     batch_size=32,
                     valid_batch_size=32,
                     datasets=['/ichec/home/users/%s/data/all.en.concat.shuf.gz'%os.environ['USER'],
                               '/ichec/home/users/%s/data/all.fr.concat.shuf.gz'%os.environ['USER']],
                     valid_datasets=['/ichec/home/users/%s/data/newstest2011.en.tok'%os.environ['USER'],
                                     '/ichec/home/users/%s/data/newstest2011.fr.tok'%os.environ['USER']],
                     dictionaries=['/ichec/home/users/%s/data/all.en.concat.gz.pkl'%os.environ['USER'],
                                   '/ichec/home/users/%s/data/all.fr.concat.gz.pkl'%os.environ['USER']],
                     validFreq=5000,
                     dispFreq=10,
                     saveFreq=5000,
                     sampleFreq=1000,
                     use_dropout=params['use-dropout'][0],
                     overwrite=False)
    return validerr

if __name__ == '__main__':
    main(0, {
        'model': ['/ichec/home/users/%s/models/model_session3_all.npz'%os.environ['USER']],
        'dim_word': [500],
        'dim': [1024],
        'n-words': [30000],
        'optimizer': ['adadelta'],
        'decay-c': [0.],
        'clip-c': [1.],
        'use-dropout': [False],
        'learning-rate': [0.0001],
        'reload': [False]})
--------------------------------------------------------------------------------
/session3/translate.py:
--------------------------------------------------------------------------------
'''
Translates a source file using a translation model.
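
Usage:
    python translate.py [-k beam_size] [-p n_processes] [-n] [-c] [-b n_best] \
        model dictionary dictionary_target source saveto

With -b greater than 1, an n-best list is written instead of single
translations, one `sentence_id|||translation|||score` entry per line, in
the format consumed by rescore_with_lm.py.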
'''
import argparse

import numpy
import cPickle as pkl

from nmt import (build_sampler, gen_sample, load_params,
                 init_params, init_tparams)

from multiprocessing import Process, Queue


def translate_model(queue, rqueue, pid, model, options, k, normalize, n_best):

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    # build the sampling graph (f_init encodes, f_next decodes one step)
    f_init, f_next = build_sampler(tparams, options, trng, use_noise)

    def _translate(seq):
        # sample given an input sequence and obtain scores
        sample, score = gen_sample(tparams, f_init, f_next,
                                   numpy.array(seq).reshape([len(seq), 1]),
                                   options, trng=trng, k=k, maxlen=200,
                                   stochastic=False, argmax=False)

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        if n_best > 1:
            sidx = numpy.argsort(score)[:n_best]
        else:
            sidx = numpy.argmin(score)
        return numpy.array(sample)[sidx], numpy.array(score)[sidx]

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        print pid, '-', idx
        seq, scores = _translate(x)

        rqueue.put((idx, seq, scores))

    return


def main(model, dictionary, dictionary_target, source_file, saveto, k=5,
         normalize=False, n_process=5, chr_level=False, n_best=1):

    # load model options
    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    # load source dictionary and invert
    with open(dictionary, 'rb') as f:
        word_dict = pkl.load(f)
    word_idict = dict()
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # load target dictionary and invert
    with open(dictionary_target, 'rb') as f:
        word_dict_trg = pkl.load(f)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # create input and output queues for processes
    queue = Queue()
    rqueue = Queue()
    processes = [None] * n_process
    for midx in xrange(n_process):
        processes[midx] = Process(
            target=translate_model,
            args=(queue, rqueue, midx, model, options, k, normalize, n_best))
        processes[midx].start()

    # utility function
    def _seqs2words(caps):
        capsw = []
        for cc in caps:
            ww = []
            for w in cc:
                if w == 0:
                    break
                ww.append(word_idict_trg[w])
            capsw.append(' '.join(ww))
        return capsw

    def _send_jobs(fname):
        with open(fname, 'r') as f:
            for idx, line in enumerate(f):
                if chr_level:
                    words = list(line.decode('utf-8').strip())
                else:
                    words = line.strip().split()
                x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
                x = map(lambda ii: ii if ii < options['n_words'] else 1, x)
                x += [0]
                queue.put((idx, x))
        return idx+1

    def _finish_processes():
        for midx in xrange(n_process):
            queue.put(None)

    def _retrieve_jobs(n_samples):
        trans = [None] * n_samples
        scores = [None] * n_samples
        for idx in xrange(n_samples):
            resp = rqueue.get()
            trans[resp[0]] = resp[1]
            scores[resp[0]] = resp[2]
            if numpy.mod(idx, 10) == 0:
                print 'Sample ', (idx+1), '/', n_samples, ' Done'
        return trans, scores

    print 'Translating ', source_file, '...'
    n_samples = _send_jobs(source_file)
    trans, scores = _retrieve_jobs(n_samples)
    _finish_processes()

    if n_best == 1:
        trans = _seqs2words(trans)
    else:
        n_best_trans = []
        for idx, (n_best_tr, score_) in enumerate(zip(trans, scores)):
            sentences = _seqs2words(n_best_tr)
            for ids, trans_ in enumerate(sentences):
                n_best_trans.append(
                    '|||'.join(
                        ['{}'.format(idx), trans_,
                         '{}'.format(score_[ids])]))
        trans = n_best_trans

    with open(saveto, 'w') as f:
        print >>f, '\n'.join(trans)
    print 'Done'


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-k', type=int, default=5, help="Beam size")
    parser.add_argument('-p', type=int, default=5, help="Number of processes")
    parser.add_argument('-n', action="store_true", default=False,
                        help="Normalize wrt sequence length")
    parser.add_argument('-c', action="store_true", default=False,
                        help="Character level")
    parser.add_argument('-b', type=int, default=1, help="Output n-best list")
    parser.add_argument('model', type=str)
    parser.add_argument('dictionary', type=str)
    parser.add_argument('dictionary_target', type=str)
    parser.add_argument('source', type=str)
    parser.add_argument('saveto', type=str)

    args = parser.parse_args()

    main(args.model, args.dictionary, args.dictionary_target, args.source,
         args.saveto, k=args.k, normalize=args.n, n_process=args.p,
         chr_level=args.c, n_best=args.b)
--------------------------------------------------------------------------------