├── LICENSE
├── README.md
├── data
│   ├── README.md
│   ├── build_dictionary.py
│   ├── extract_files.py
│   ├── length.py
│   ├── merge.sh
│   ├── multi-bleu.perl
│   ├── nonbreaking_prefixes
│   │   ├── README.txt
│   │   ├── nonbreaking_prefix.ca
│   │   ├── nonbreaking_prefix.cs
│   │   ├── nonbreaking_prefix.de
│   │   ├── nonbreaking_prefix.el
│   │   ├── nonbreaking_prefix.en
│   │   ├── nonbreaking_prefix.es
│   │   ├── nonbreaking_prefix.fi
│   │   ├── nonbreaking_prefix.fr
│   │   ├── nonbreaking_prefix.hu
│   │   ├── nonbreaking_prefix.is
│   │   ├── nonbreaking_prefix.it
│   │   ├── nonbreaking_prefix.lv
│   │   ├── nonbreaking_prefix.nl
│   │   ├── nonbreaking_prefix.pl
│   │   ├── nonbreaking_prefix.pt
│   │   ├── nonbreaking_prefix.ro
│   │   ├── nonbreaking_prefix.ru
│   │   ├── nonbreaking_prefix.sk
│   │   ├── nonbreaking_prefix.sl
│   │   ├── nonbreaking_prefix.sv
│   │   └── nonbreaking_prefix.ta
│   ├── preprocess.sh
│   ├── scan_example.py
│   ├── setup_cluster_env.sh
│   ├── setup_local_env.sh
│   ├── shuffle.py
│   ├── strip_sgml.py
│   ├── tokenize_all.sh
│   └── tokenizer.perl
├── docs
│   ├── cgru.pdf
│   └── cgru.tex
├── session0
│   ├── data_iterator.py
│   ├── lm.py
│   ├── train.sh
│   └── train_lm.py
├── session1
│   ├── README.md
│   ├── data_iterator.py
│   ├── encode.py
│   ├── nmt.py
│   ├── test.sh
│   ├── train.sh
│   ├── train_all.sh
│   ├── train_nmt.py
│   ├── train_nmt_all.py
│   └── translate.py
├── session2
│   ├── README.md
│   ├── data_iterator.py
│   ├── nmt.py
│   ├── test.sh
│   ├── train.sh
│   ├── train_all.sh
│   ├── train_nmt.py
│   ├── train_nmt_all.py
│   └── translate.py
└── session3
    ├── README.md
    ├── data_iterator.py
    ├── lm.py
    ├── nmt.py
    ├── rescore_with_lm.py
    ├── score_nbest.sh
    ├── test.sh
    ├── train.sh
    ├── train_all.sh
    ├── train_nmt.py
    ├── train_nmt_all.py
    └── translate.py
/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Orhan Firat and New York University (Kyunghyun Cho) 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of dl4mt-tutorial nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dl4mt-material 2 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | Scripts and utilities for data pre-processing. 2 | 3 | #### Setup 4 | The easiest way to set up your environment: 5 | 6 | ```bash 7 | $ cd ~; mkdir codes; cd codes 8 | $ git clone https://github.com/nyu-dl/dl4mt-tutorial 9 | $ cd dl4mt-tutorial/data 10 | $ ./setup_local_env.sh 11 | ``` 12 | 13 | This first clones the repository under `~/codes/dl4mt-tutorial` 14 | and then runs the `setup_local_env.sh` script, which retrieves the example data 15 | and preprocesses it. 16 | 17 | #### Pre-processing 18 | The following steps are executed by `setup_local_env.sh`: 19 | 1. Clone the `dl4mt-tutorial` repository (if not already cloned) 20 | 2. Download `europarl-v7.fr-en` (training) and `newstest2011` (development) 21 | 3. Preprocess the training and development sets 22 | * Tokenize using the Moses tokenizer 23 | * Shuffle the training set for SGD 24 | * Build the source and target dictionaries 25 | 26 | #### Pre-processing with subword-units 27 | If you want to use subword units (e.g. [Byte Pair Encoding](https://github.com/rsennrich/subword-nmt)) for source and target tokens, simply call: 28 | ```bash 29 | $ ./setup_local_env.sh -b 30 | ``` 31 | This replaces the third step above and executes the following steps instead: 32 | 1. Clone the `dl4mt-tutorial` repository (if not already cloned) 33 | 2. Download `europarl-v7.fr-en` (training) and `newstest2011` (development) 34 | 3. Preprocess the training and development sets (`preprocess.sh`) 35 | * Tokenize the source and target sides of all bitext 36 | * Learn BPE codes for the source and target sides from the training sets 37 | * Encode the source and target sides with the learned codes 38 | * Shuffle the training set for SGD 39 | * Build the source and target dictionaries 40 | 41 | If you want to preprocess your own data with BPE, you can use the `preprocess.sh` script directly. 42 | 43 | For usage and further details, please check the comments in the scripts; a quick way to inspect the generated dictionaries is shown below.
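The dictionaries written by `build_dictionary.py` are plain Python pickles, so the result can be sanity-checked from an interpreter. This is a minimal sketch; the `.pkl` filename is only an example, and the actual names depend on your corpus files:

```python
import pickle

# Load a vocabulary pickle written by build_dictionary.py
# (hypothetical filename; one .pkl is produced per input corpus).
with open('all_fr-en.fr.tok.pkl', 'rb') as f:
    worddict = pickle.load(f)

# Indices 0 and 1 are reserved for the end-of-sentence and
# unknown-word markers; corpus tokens start at index 2,
# ordered from most to least frequent.
assert worddict['eos'] == 0
assert worddict['UNK'] == 1
print('%d entries' % len(worddict))
```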
44 | -------------------------------------------------------------------------------- /data/build_dictionary.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy 4 | 5 | try: 6 | import cPickle as pkl 7 | except ImportError: 8 | import pickle as pkl 9 | 10 | import sys 11 | import fileinput 12 | 13 | from collections import OrderedDict 14 | 15 | def main(): 16 | for filename in sys.argv[1:]: 17 | print('Processing', filename) 18 | word_freqs = OrderedDict() 19 | with open(filename, 'r') as f: 20 | for line in f: 21 | words_in = line.strip().split(' ') 22 | for w in words_in: 23 | if w not in word_freqs: 24 | word_freqs[w] = 0 25 | word_freqs[w] += 1 26 | words = list(word_freqs.keys()) 27 | freqs = list(word_freqs.values()) 28 | 29 | sorted_idx = numpy.argsort(freqs) 30 | sorted_words = [words[ii] for ii in sorted_idx[::-1]] 31 | 32 | worddict = OrderedDict() 33 | worddict['eos'] = 0 34 | worddict['UNK'] = 1 35 | for ii, ww in enumerate(sorted_words): 36 | worddict[ww] = ii+2 37 | 38 | with open('%s.pkl'%filename, 'wb') as f: 39 | pkl.dump(worddict, f) 40 | 41 | print('Done') 42 | 43 | if __name__ == '__main__': 44 | main() 45 |
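# Usage sketch (filenames are examples only):
#   python build_dictionary.py all_fr-en.fr.tok all_fr-en.en.tok
# For each input corpus CORPUS this writes CORPUS.pkl, an OrderedDict that
# maps every token to an integer index: 'eos' -> 0, 'UNK' -> 1, and the
# corpus vocabulary from index 2 upward in decreasing order of frequency.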
-------------------------------------------------------------------------------- /data/extract_files.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import argparse 4 | import logging 5 | import os 6 | import tarfile 7 | 8 | TRAIN_DATA_URL = 'http://www.statmt.org/europarl/v7/fr-en.tgz' 9 | VALID_DATA_URL = 'http://matrix.statmt.org/test_sets/newstest2011.tgz' 10 | 11 | parser = argparse.ArgumentParser( 12 | description=""" 13 | This script extracts the downloaded parallel corpora, given source and target 14 | language indicators. Adapted from 15 | https://github.com/orhanf/blocks-examples/tree/master/machine_translation 16 | """, formatter_class=argparse.RawTextHelpFormatter) 17 | parser.add_argument("-s", "--source", type=str, help="Source language", 18 | default="fr") 19 | parser.add_argument("-t", "--target", type=str, help="Target language", 20 | default="en") 21 | parser.add_argument("--source-dev", type=str, default="newstest2011.fr", 22 | help="Source language dev filename") 23 | parser.add_argument("--target-dev", type=str, default="newstest2011.en", 24 | help="Target language dev filename") 25 | parser.add_argument("--outdir", type=str, default=".", 26 | help="Output directory") 27 | 28 | 29 | def extract_tar_file_to(file_to_extract, extract_into, names_to_look): 30 | extracted_filenames = [] 31 | try: 32 | logger.info("Extracting file [{}] into [{}]" 33 | .format(file_to_extract, extract_into)) 34 | tar = tarfile.open(file_to_extract, 'r') 35 | src_trg_files = [ff for ff in tar.getnames() 36 | if any([ff.find(nn) > -1 for nn in names_to_look])] 37 | if not len(src_trg_files): 38 | raise ValueError("[{}] pair does not exist in the archive!" 39 | .format(names_to_look)) 40 | for item in tar: 41 | # extract only the source-target pair 42 | if item.name in src_trg_files: 43 | file_path = os.path.join(extract_into, item.path) 44 | if not os.path.exists(file_path): 45 | logger.info("...extracting [{}] into [{}]" 46 | .format(item.name, file_path)) 47 | tar.extract(item, extract_into) 48 | else: 49 | logger.info("...file exists [{}]".format(file_path)) 50 | extracted_filenames.append( 51 | os.path.join(extract_into, item.path)) 52 | except Exception as e: 53 | logger.error("{}".format(str(e))) 54 | return extracted_filenames 55 | 56 | 57 | def main(): 58 | train_data_file = os.path.join(args.outdir, 'train_data.tgz') 59 | valid_data_file = os.path.join(args.outdir, 'valid_data.tgz') 60 | 61 | # Extract the europarl v7 training set 62 | extract_tar_file_to( 63 | train_data_file, os.path.dirname(train_data_file), 64 | ["{}-{}".format(args.source, args.target)]) 65 | 66 | # Extract the development set 67 | extract_tar_file_to( 68 | valid_data_file, os.path.dirname(valid_data_file), 69 | [args.source_dev, args.target_dev]) 70 | 71 | 72 | if __name__ == "__main__": 73 | 74 | logging.basicConfig(level=logging.INFO) 75 | logger = logging.getLogger('prepare_data') 76 | 77 | args = parser.parse_args() 78 | main() 79 | -------------------------------------------------------------------------------- /data/length.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy 4 | import sys 5 | 6 | for name in sys.argv[1:]: 7 | lens = [] 8 | with open(name, 'r') as f: 9 | for ll in f: 10 | lens.append(len(ll.strip().split(' '))) 11 | print(name, ' max ', numpy.max(lens), ' min ', numpy.min(lens), ' average ', numpy.mean(lens)) 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /data/merge.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script merges all the bitext files under a given data directory. 3 | # Source side files are concatenated into all_[src]-[trg].[src] 4 | # Target side files are concatenated into all_[src]-[trg].[trg] 5 | 6 | if [ "$#" -ne 3 ]; then 7 | echo "" 8 | echo "Usage: $0 src trg path_to_data" 9 | echo "" 10 | exit 1 11 | fi 12 | 13 | SRC=$1 14 | TRG=$2 15 | 16 | DATA_DIR=$3 17 | 18 | FSRC=${DATA_DIR}/all_${SRC}-${TRG}.${SRC} 19 | FTRG=${DATA_DIR}/all_${SRC}-${TRG}.${TRG} 20 | 21 | : > $FSRC 22 | for F in ${DATA_DIR}/*${SRC}-${TRG}.${SRC} 23 | do 24 | if [ "$F" = "$FSRC" ]; then 25 | echo "pass" 26 | else 27 | cat $F >> $FSRC 28 | fi 29 | done 30 | 31 | 32 | : > $FTRG 33 | for F in ${DATA_DIR}/*${SRC}-${TRG}.${TRG} 34 | do 35 | if [ "$F" = "$FTRG" ]; then 36 | echo "pass" 37 | else 38 | cat $F >> $FTRG 39 | fi 40 | done 41 | -------------------------------------------------------------------------------- /data/multi-bleu.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version.
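#
# For reference, the score printed by this script is the standard
# corpus-level BLEU-4:
#   BLEU = BP * exp((log p1 + log p2 + log p3 + log p4) / 4)
# where p_n is the modified n-gram precision accumulated over the whole
# test set, and the brevity penalty BP equals 1 when hyp_len >= ref_len
# and exp(1 - ref_len / hyp_len) otherwise (see the computation after
# the main loop below).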
5 | 6 | # $Id$ 7 | use warnings; 8 | use strict; 9 | 10 | my $lowercase = 0; 11 | if ($ARGV[0] eq "-lc") { 12 | $lowercase = 1; 13 | shift; 14 | } 15 | 16 | my $stem = $ARGV[0]; 17 | if (!defined $stem) { 18 | print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n"; 19 | print STDERR "Reads the references from reference or reference0, reference1, ...\n"; 20 | exit(1); 21 | } 22 | 23 | $stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0"; 24 | 25 | my @REF; 26 | my $ref=0; 27 | while(-e "$stem$ref") { 28 | &add_to_ref("$stem$ref",\@REF); 29 | $ref++; 30 | } 31 | &add_to_ref($stem,\@REF) if -e $stem; 32 | die("ERROR: could not find reference file $stem") unless scalar @REF; 33 | 34 | sub add_to_ref { 35 | my ($file,$REF) = @_; 36 | my $s=0; 37 | open(REF,$file) or die "Can't read $file"; 38 | while(<REF>) { 39 | chop; 40 | push @{$$REF[$s++]}, $_; 41 | } 42 | close(REF); 43 | } 44 | 45 | my(@CORRECT,@TOTAL,$length_translation,$length_reference); 46 | my $s=0; 47 | while(<STDIN>) { 48 | chop; 49 | $_ = lc if $lowercase; 50 | my @WORD = split; 51 | my %REF_NGRAM = (); 52 | my $length_translation_this_sentence = scalar(@WORD); 53 | my ($closest_diff,$closest_length) = (9999,9999); 54 | foreach my $reference (@{$REF[$s]}) { 55 | # print "$s $_ <=> $reference\n"; 56 | $reference = lc($reference) if $lowercase; 57 | my @WORD = split(' ',$reference); 58 | my $length = scalar(@WORD); 59 | my $diff = abs($length_translation_this_sentence-$length); 60 | if ($diff < $closest_diff) { 61 | $closest_diff = $diff; 62 | $closest_length = $length; 63 | # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n"; 64 | } elsif ($diff == $closest_diff) { 65 | $closest_length = $length if $length < $closest_length; 66 | # from two references with the same closeness to me 67 | # take the *shorter* into account, not the "first" one. 68 | } 69 | for(my $n=1;$n<=4;$n++) { 70 | my %REF_NGRAM_N = (); 71 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 72 | my $ngram = "$n"; 73 | for(my $w=0;$w<$n;$w++) { 74 | $ngram .= " ".$WORD[$start+$w]; 75 | } 76 | $REF_NGRAM_N{$ngram}++; 77 | } 78 | foreach my $ngram (keys %REF_NGRAM_N) { 79 | if (!defined($REF_NGRAM{$ngram}) || 80 | $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) { 81 | $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram}; 82 | # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}<BR>\n"; 83 | } 84 | } 85 | } 86 | } 87 | $length_translation += $length_translation_this_sentence; 88 | $length_reference += $closest_length; 89 | for(my $n=1;$n<=4;$n++) { 90 | my %T_NGRAM = (); 91 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 92 | my $ngram = "$n"; 93 | for(my $w=0;$w<$n;$w++) { 94 | $ngram .= " ".$WORD[$start+$w]; 95 | } 96 | $T_NGRAM{$ngram}++; 97 | } 98 | foreach my $ngram (keys %T_NGRAM) { 99 | $ngram =~ /^(\d+) /; 100 | my $n = $1; 101 | # my $corr = 0; 102 | # print "$i e $ngram $T_NGRAM{$ngram}<BR>\n"; 103 | $TOTAL[$n] += $T_NGRAM{$ngram}; 104 | if (defined($REF_NGRAM{$ngram})) { 105 | if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) { 106 | $CORRECT[$n] += $T_NGRAM{$ngram}; 107 | # $corr = $T_NGRAM{$ngram}; 108 | # print "$i e correct1 $T_NGRAM{$ngram}<BR>\n"; 109 | } 110 | else { 111 | $CORRECT[$n] += $REF_NGRAM{$ngram}; 112 | # $corr = $REF_NGRAM{$ngram}; 113 | # print "$i e correct2 $REF_NGRAM{$ngram}<BR>\n"; 114 | } 115 | } 116 | # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram}; 117 | # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n" 118 | } 119 | } 120 | $s++; 121 | } 122 | my $brevity_penalty = 1; 123 | my $bleu = 0; 124 | 125 | my @bleu=(); 126 | 127 | for(my $n=1;$n<=4;$n++) { 128 | if (defined ($TOTAL[$n])){ 129 | $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0; 130 | # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n"; 131 | }else{ 132 | $bleu[$n]=0; 133 | } 134 | } 135 | 136 | if ($length_reference==0){ 137 | printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n"; 138 | exit(1); 139 | } 140 | 141 | if ($length_translation<$length_reference) { 142 | $brevity_penalty = exp(1-$length_reference/$length_translation); 143 | } 144 | $bleu = $brevity_penalty * exp((my_log( $bleu[1] ) + 145 | my_log( $bleu[2] ) + 146 | my_log( $bleu[3] ) + 147 | my_log( $bleu[4] ) ) / 4) ; 148 | printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n", 149 | 100*$bleu, 150 | 100*$bleu[1], 151 | 100*$bleu[2], 152 | 100*$bleu[3], 153 | 100*$bleu[4], 154 | $brevity_penalty, 155 | $length_translation / $length_reference, 156 | $length_translation, 157 | $length_reference; 158 | 159 | sub my_log { 160 | return -9999999999 unless $_[0]; 161 | return log($_[0]); 162 | } 163 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/README.txt: -------------------------------------------------------------------------------- 1 | The language suffix can be found here: 2 | 3 | http://www.loc.gov/standards/iso639-2/php/code_list.php 4 | 5 | This code includes data from Daniel Naber's Language Tools (Czech abbreviations). 6 | This code includes data from the Czech Wiktionary (also Czech abbreviations). 7 | 8 | 9 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.ca: -------------------------------------------------------------------------------- 1 | Dr 2 | Dra 3 | pàg 4 | p 5 | c 6 | av 7 | Sr 8 | Sra 9 | adm 10 | esq 11 | Prof 12 | S.A 13 | S.L 14 | p.e 15 | ptes 16 | Sta 17 | St 18 | pl 19 | màx 20 | cast 21 | dir 22 | nre 23 | fra 24 | admdora 25 | Emm 26 | Excma 27 | espf 28 | dc 29 | admdor 30 | tel 31 | angl 32 | aprox 33 | ca 34 | dept 35 | dj 36 | dl 37 | dt 38 | ds 39 | dg 40 | dv 41 | ed 42 | entl 43 | al 44 | i.e 45 | maj 46 | smin 47 | n 48 | núm 49 | pta 50 | A 51 | B 52 | C 53 | D 54 | E 55 | F 56 | G 57 | H 58 | I 59 | J 60 | K 61 | L 62 | M 63 | N 64 | O 65 | P 66 | Q 67 | R 68 | S 69 | T 70 | U 71 | V 72 | W 73 | X 74 | Y 75 | Z 76 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.cs: -------------------------------------------------------------------------------- 1 | Bc 2 | BcA 3 | Ing 4 | Ing.arch 5 | MUDr 6 | MVDr 7 | MgA 8 | Mgr 9 | JUDr 10 | PhDr 11 | RNDr 12 | PharmDr 13 | ThLic 14 | ThDr 15 | Ph.D 16 | Th.D 17 | prof 18 | doc 19 | CSc 20 | DrSc 21 | dr. h. 
c 22 | PaedDr 23 | Dr 24 | PhMr 25 | DiS 26 | abt 27 | ad 28 | a.i 29 | aj 30 | angl 31 | anon 32 | apod 33 | atd 34 | atp 35 | aut 36 | bd 37 | biogr 38 | b.m 39 | b.p 40 | b.r 41 | cca 42 | cit 43 | cizojaz 44 | c.k 45 | col 46 | čes 47 | čín 48 | čj 49 | ed 50 | facs 51 | fasc 52 | fol 53 | fot 54 | franc 55 | h.c 56 | hist 57 | hl 58 | hrsg 59 | ibid 60 | il 61 | ind 62 | inv.č 63 | jap 64 | jhdt 65 | jv 66 | koed 67 | kol 68 | korej 69 | kl 70 | krit 71 | lat 72 | lit 73 | m.a 74 | maď 75 | mj 76 | mp 77 | násl 78 | např 79 | nepubl 80 | něm 81 | no 82 | nr 83 | n.s 84 | okr 85 | odd 86 | odp 87 | obr 88 | opr 89 | orig 90 | phil 91 | pl 92 | pokrač 93 | pol 94 | port 95 | pozn 96 | př.kr 97 | př.n.l 98 | přel 99 | přeprac 100 | příl 101 | pseud 102 | pt 103 | red 104 | repr 105 | resp 106 | revid 107 | rkp 108 | roč 109 | roz 110 | rozš 111 | samost 112 | sect 113 | sest 114 | seš 115 | sign 116 | sl 117 | srv 118 | stol 119 | sv 120 | šk 121 | šk.ro 122 | špan 123 | tab 124 | t.č 125 | tis 126 | tj 127 | tř 128 | tzv 129 | univ 130 | uspoř 131 | vol 132 | vl.jm 133 | vs 134 | vyd 135 | vyobr 136 | zal 137 | zejm 138 | zkr 139 | zprac 140 | zvl 141 | n.p 142 | např 143 | než 144 | MUDr 145 | abl 146 | absol 147 | adj 148 | adv 149 | ak 150 | ak. sl 151 | akt 152 | alch 153 | amer 154 | anat 155 | angl 156 | anglosas 157 | arab 158 | arch 159 | archit 160 | arg 161 | astr 162 | astrol 163 | att 164 | bás 165 | belg 166 | bibl 167 | biol 168 | boh 169 | bot 170 | bulh 171 | círk 172 | csl 173 | č 174 | čas 175 | čes 176 | dat 177 | děj 178 | dep 179 | dět 180 | dial 181 | dór 182 | dopr 183 | dosl 184 | ekon 185 | epic 186 | etnonym 187 | eufem 188 | f 189 | fam 190 | fem 191 | fil 192 | film 193 | form 194 | fot 195 | fr 196 | fut 197 | fyz 198 | gen 199 | geogr 200 | geol 201 | geom 202 | germ 203 | gram 204 | hebr 205 | herald 206 | hist 207 | hl 208 | hovor 209 | hud 210 | hut 211 | chcsl 212 | chem 213 | ie 214 | imp 215 | impf 216 | ind 217 | indoevr 218 | inf 219 | instr 220 | interj 221 | ión 222 | iron 223 | it 224 | kanad 225 | katalán 226 | klas 227 | kniž 228 | komp 229 | konj 230 | 231 | konkr 232 | kř 233 | kuch 234 | lat 235 | lék 236 | les 237 | lid 238 | lit 239 | liturg 240 | lok 241 | log 242 | m 243 | mat 244 | meteor 245 | metr 246 | mod 247 | ms 248 | mysl 249 | n 250 | náb 251 | námoř 252 | neklas 253 | něm 254 | nesklon 255 | nom 256 | ob 257 | obch 258 | obyč 259 | ojed 260 | opt 261 | part 262 | pas 263 | pejor 264 | pers 265 | pf 266 | pl 267 | plpf 268 | 269 | práv 270 | prep 271 | předl 272 | přivl 273 | r 274 | rcsl 275 | refl 276 | reg 277 | rkp 278 | ř 279 | řec 280 | s 281 | samohl 282 | sg 283 | sl 284 | souhl 285 | spec 286 | srov 287 | stfr 288 | střv 289 | stsl 290 | subj 291 | subst 292 | superl 293 | sv 294 | sz 295 | táz 296 | tech 297 | telev 298 | teol 299 | trans 300 | typogr 301 | var 302 | vedl 303 | verb 304 | vl. jm 305 | voj 306 | vok 307 | vůb 308 | vulg 309 | výtv 310 | vztaž 311 | zahr 312 | zájm 313 | zast 314 | zejm 315 | 316 | zeměd 317 | zkr 318 | zř 319 | mj 320 | dl 321 | atp 322 | sport 323 | Mgr 324 | horn 325 | MVDr 326 | JUDr 327 | RSDr 328 | Bc 329 | PhDr 330 | ThDr 331 | Ing 332 | aj 333 | apod 334 | PharmDr 335 | pomn 336 | ev 337 | slang 338 | nprap 339 | odp 340 | dop 341 | pol 342 | st 343 | stol 344 | p. n. l 345 | před n. l 346 | n. l 347 | př. Kr 348 | po Kr 349 | př. n. l 350 | odd 351 | RNDr 352 | tzv 353 | atd 354 | tzn 355 | resp 356 | tj 357 | p 358 | br 359 | č. j 360 | čj 361 | č. p 362 | čp 363 | a. 
s 364 | s. r. o 365 | spol. s r. o 366 | p. o 367 | s. p 368 | v. o. s 369 | k. s 370 | o. p. s 371 | o. s 372 | v. r 373 | v z 374 | ml 375 | vč 376 | kr 377 | mld 378 | hod 379 | popř 380 | ap 381 | event 382 | rus 383 | slov 384 | rum 385 | švýc 386 | P. T 387 | zvl 388 | hor 389 | dol 390 | S.O.S -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.de: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | #no german words end in single lower-case letters, so we throw those in too. 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | 61 | #Roman Numerals. A dot after one of these is not a sentence break in German. 62 | I 63 | II 64 | III 65 | IV 66 | V 67 | VI 68 | VII 69 | VIII 70 | IX 71 | X 72 | XI 73 | XII 74 | XIII 75 | XIV 76 | XV 77 | XVI 78 | XVII 79 | XVIII 80 | XIX 81 | XX 82 | i 83 | ii 84 | iii 85 | iv 86 | v 87 | vi 88 | vii 89 | viii 90 | ix 91 | x 92 | xi 93 | xii 94 | xiii 95 | xiv 96 | xv 97 | xvi 98 | xvii 99 | xviii 100 | xix 101 | xx 102 | 103 | #Titles and Honorifics 104 | Adj 105 | Adm 106 | Adv 107 | Asst 108 | Bart 109 | Bldg 110 | Brig 111 | Bros 112 | Capt 113 | Cmdr 114 | Col 115 | Comdr 116 | Con 117 | Corp 118 | Cpl 119 | DR 120 | Dr 121 | Ens 122 | Gen 123 | Gov 124 | Hon 125 | Hosp 126 | Insp 127 | Lt 128 | MM 129 | MR 130 | MRS 131 | MS 132 | Maj 133 | Messrs 134 | Mlle 135 | Mme 136 | Mr 137 | Mrs 138 | Ms 139 | Msgr 140 | Op 141 | Ord 142 | Pfc 143 | Ph 144 | Prof 145 | Pvt 146 | Rep 147 | Reps 148 | Res 149 | Rev 150 | Rt 151 | Sen 152 | Sens 153 | Sfc 154 | Sgt 155 | Sr 156 | St 157 | Supt 158 | Surg 159 | 160 | #Misc symbols 161 | Mio 162 | Mrd 163 | bzw 164 | v 165 | vs 166 | usw 167 | d.h 168 | z.B 169 | u.a 170 | etc 171 | Mrd 172 | MwSt 173 | ggf 174 | d.J 175 | D.h 176 | m.E 177 | vgl 178 | I.F 179 | z.T 180 | sogen 181 | ff 182 | u.E 183 | g.U 184 | g.g.A 185 | c.-à-d 186 | Buchst 187 | u.s.w 188 | sog 189 | u.ä 190 | Std 191 | evtl 192 | Zt 193 | Chr 194 | u.U 195 | o.ä 196 | Ltd 197 | b.A 198 | z.Zt 199 | spp 200 | sen 201 | SA 202 | k.o 203 | jun 204 | i.H.v 205 | dgl 206 | dergl 207 | Co 208 | zzt 209 | usf 210 | s.p.a 211 | Dkr 212 | Corp 213 | bzgl 214 | BSE 215 | 216 | #Number indicators 217 | # add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it 218 | No 219 | Nos 220 | Art 221 | Nr 222 | pp 223 | ca 224 | Ca 225 | 226 | #Ordinals are done with . in German - "1." 
= "1st" in English 227 | 1 228 | 2 229 | 3 230 | 4 231 | 5 232 | 6 233 | 7 234 | 8 235 | 9 236 | 10 237 | 11 238 | 12 239 | 13 240 | 14 241 | 15 242 | 16 243 | 17 244 | 18 245 | 19 246 | 20 247 | 21 248 | 22 249 | 23 250 | 24 251 | 25 252 | 26 253 | 27 254 | 28 255 | 29 256 | 30 257 | 31 258 | 32 259 | 33 260 | 34 261 | 35 262 | 36 263 | 37 264 | 38 265 | 39 266 | 40 267 | 41 268 | 42 269 | 43 270 | 44 271 | 45 272 | 46 273 | 47 274 | 48 275 | 49 276 | 50 277 | 51 278 | 52 279 | 53 280 | 54 281 | 55 282 | 56 283 | 57 284 | 58 285 | 59 286 | 60 287 | 61 288 | 62 289 | 63 290 | 64 291 | 65 292 | 66 293 | 67 294 | 68 295 | 69 296 | 70 297 | 71 298 | 72 299 | 73 300 | 74 301 | 75 302 | 76 303 | 77 304 | 78 305 | 79 306 | 80 307 | 81 308 | 82 309 | 83 310 | 84 311 | 85 312 | 86 313 | 87 314 | 88 315 | 89 316 | 90 317 | 91 318 | 92 319 | 93 320 | 94 321 | 95 322 | 96 323 | 97 324 | 98 325 | 99 326 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.el: -------------------------------------------------------------------------------- 1 | # Sigle letters in upper-case are usually abbreviations of names 2 | Α 3 | Β 4 | Γ 5 | Δ 6 | Ε 7 | Ζ 8 | Η 9 | Θ 10 | Ι 11 | Κ 12 | Λ 13 | Μ 14 | Ν 15 | Ξ 16 | Ο 17 | Π 18 | Ρ 19 | Σ 20 | Τ 21 | Υ 22 | Φ 23 | Χ 24 | Ψ 25 | Ω 26 | 27 | # Includes abbreviations for the Greek language compiled from various sources (Greek grammar books, Greek language related web content). 28 | Άθαν 29 | Έγχρ 30 | Έκθ 31 | Έσδ 32 | Έφ 33 | Όμ 34 | Α΄Έσδρ 35 | Α΄Έσδ 36 | Α΄Βασ 37 | Α΄Θεσ 38 | Α΄Ιω 39 | Α΄Κορινθ 40 | Α΄Κορ 41 | Α΄Μακκ 42 | Α΄Μακ 43 | Α΄Πέτρ 44 | Α΄Πέτ 45 | Α΄Παραλ 46 | Α΄Πε 47 | Α΄Σαμ 48 | Α΄Τιμ 49 | Α΄Χρον 50 | Α΄Χρ 51 | Α.Β.Α 52 | Α.Β 53 | Α.Ε 54 | Α.Κ.Τ.Ο 55 | Αέθλ 56 | Αέτ 57 | Αίλ.Δ 58 | Αίλ.Τακτ 59 | Αίσ 60 | Αββακ 61 | Αβυδ 62 | Αβ 63 | Αγάκλ 64 | Αγάπ 65 | Αγάπ.Αμαρτ.Σ 66 | Αγάπ.Γεωπ 67 | Αγαθάγγ 68 | Αγαθήμ 69 | Αγαθιν 70 | Αγαθοκλ 71 | Αγαθρχ 72 | Αγαθ 73 | Αγαθ.Ιστ 74 | Αγαλλ 75 | Αγαπητ 76 | Αγγ 77 | Αγησ 78 | Αγλ 79 | Αγορ.Κ 80 | Αγρο.Κωδ 81 | Αγρ.Εξ 82 | Αγρ.Κ 83 | Αγ.Γρ 84 | Αδριαν 85 | Αδρ 86 | Αετ 87 | Αθάν 88 | Αθήν 89 | Αθήν.Επιγρ 90 | Αθήν.Επιτ 91 | Αθήν.Ιατρ 92 | Αθήν.Μηχ 93 | Αθανάσ 94 | Αθαν 95 | Αθηνί 96 | Αθηναγ 97 | Αθηνόδ 98 | Αθ 99 | Αθ.Αρχ 100 | Αιλ 101 | Αιλ.Επιστ 102 | Αιλ.ΖΙ 103 | Αιλ.ΠΙ 104 | Αιλ.απ 105 | Αιμιλ 106 | Αιν.Γαζ 107 | Αιν.Τακτ 108 | Αισχίν 109 | Αισχίν.Επιστ 110 | Αισχ 111 | Αισχ.Αγαμ 112 | Αισχ.Αγ 113 | Αισχ.Αλ 114 | Αισχ.Ελεγ 115 | Αισχ.Επτ.Θ 116 | Αισχ.Ευμ 117 | Αισχ.Ικέτ 118 | Αισχ.Ικ 119 | Αισχ.Περσ 120 | Αισχ.Προμ.Δεσμ 121 | Αισχ.Πρ 122 | Αισχ.Χοηφ 123 | Αισχ.Χο 124 | Αισχ.απ 125 | ΑιτΕ 126 | Αιτ 127 | Αλκ 128 | Αλχιας 129 | Αμ.Π.Ο 130 | Αμβ 131 | Αμμών 132 | Αμ. 
133 | Αν.Πειθ.Συμβ.Δικ 134 | Ανακρ 135 | Ανακ 136 | Αναμν.Τόμ 137 | Αναπλ 138 | Ανδ 139 | Ανθλγος 140 | Ανθστης 141 | Αντισθ 142 | Ανχης 143 | Αν 144 | Αποκ 145 | Απρ 146 | Απόδ 147 | Απόφ 148 | Απόφ.Νομ 149 | Απ 150 | Απ.Δαπ 151 | Απ.Διατ 152 | Απ.Επιστ 153 | Αριθ 154 | Αριστοτ 155 | Αριστοφ 156 | Αριστοφ.Όρν 157 | Αριστοφ.Αχ 158 | Αριστοφ.Βάτρ 159 | Αριστοφ.Ειρ 160 | Αριστοφ.Εκκλ 161 | Αριστοφ.Θεσμ 162 | Αριστοφ.Ιππ 163 | Αριστοφ.Λυσ 164 | Αριστοφ.Νεφ 165 | Αριστοφ.Πλ 166 | Αριστοφ.Σφ 167 | Αριστ 168 | Αριστ.Αθ.Πολ 169 | Αριστ.Αισθ 170 | Αριστ.Αν.Πρ 171 | Αριστ.Ζ.Ι 172 | Αριστ.Ηθ.Ευδ 173 | Αριστ.Ηθ.Νικ 174 | Αριστ.Κατ 175 | Αριστ.Μετ 176 | Αριστ.Πολ 177 | Αριστ.Φυσιογν 178 | Αριστ.Φυσ 179 | Αριστ.Ψυχ 180 | Αριστ.Ρητ 181 | Αρμεν 182 | Αρμ 183 | Αρχ.Εκ.Καν.Δ 184 | Αρχ.Ευβ.Μελ 185 | Αρχ.Ιδ.Δ 186 | Αρχ.Νομ 187 | Αρχ.Ν 188 | Αρχ.Π.Ε 189 | Αρ 190 | Αρ.Φορ.Μητρ 191 | Ασμ 192 | Ασμ.ασμ 193 | Αστ.Δ 194 | Αστ.Χρον 195 | Ασ 196 | Ατομ.Γνωμ 197 | Αυγ 198 | Αφρ 199 | Αχ.Νομ 200 | Α 201 | Α.Εγχ.Π 202 | Α.Κ.΄Υδρας 203 | Β΄Έσδρ 204 | Β΄Έσδ 205 | Β΄Βασ 206 | Β΄Θεσ 207 | Β΄Ιω 208 | Β΄Κορινθ 209 | Β΄Κορ 210 | Β΄Μακκ 211 | Β΄Μακ 212 | Β΄Πέτρ 213 | Β΄Πέτ 214 | Β΄Πέ 215 | Β΄Παραλ 216 | Β΄Σαμ 217 | Β΄Τιμ 218 | Β΄Χρον 219 | Β΄Χρ 220 | Β.Ι.Π.Ε 221 | Β.Κ.Τ 222 | Β.Κ.Ψ.Β 223 | Β.Μ 224 | Β.Ο.Α.Κ 225 | Β.Ο.Α 226 | Β.Ο.Δ 227 | Βίβλ 228 | Βαρ 229 | ΒεΘ 230 | Βι.Περ 231 | Βιπερ 232 | Βιργ 233 | Βλγ 234 | Βούλ 235 | Βρ 236 | Γ΄Βασ 237 | Γ΄Μακκ 238 | ΓΕΝμλ 239 | Γέν 240 | Γαλ 241 | Γεν 242 | Γλ 243 | Γν.Ν.Σ.Κρ 244 | Γνωμ 245 | Γν 246 | Γράμμ 247 | Γρηγ.Ναζ 248 | Γρηγ.Νύσ 249 | Γ Νοσ 250 | Γ' Ογκολ 251 | Γ.Ν 252 | Δ΄Βασ 253 | Δ.Β 254 | Δ.Δίκη 255 | Δ.Δίκ 256 | Δ.Ε.Σ 257 | Δ.Ε.Φ.Α 258 | Δ.Ε.Φ 259 | Δ.Εργ.Ν 260 | Δαμ 261 | Δαμ.μνημ.έργ 262 | Δαν 263 | Δασ.Κ 264 | Δεκ 265 | Δελτ.Δικ.Ε.Τ.Ε 266 | Δελτ.Νομ 267 | Δελτ.Συνδ.Α.Ε 268 | Δερμ 269 | Δευτ 270 | Δεύτ 271 | Δημοσθ 272 | Δημόκρ 273 | Δι.Δικ 274 | Διάτ 275 | Διαιτ.Απ 276 | Διαιτ 277 | Διαρκ.Στρατ 278 | Δικ 279 | Διοίκ.Πρωτ 280 | ΔιοικΔνη 281 | Διοικ.Εφ 282 | Διον.Αρ 283 | Διόρθ.Λαθ 284 | Δ.κ.Π 285 | Δνη 286 | Δν 287 | Δογμ.Όρος 288 | Δρ 289 | Δ.τ.Α 290 | Δτ 291 | ΔωδΝομ 292 | Δ.Περ 293 | Δ.Στρ 294 | ΕΔΠολ 295 | ΕΕυρΚ 296 | ΕΙΣ 297 | ΕΝαυτΔ 298 | ΕΣΑμΕΑ 299 | ΕΣΘ 300 | ΕΣυγκΔ 301 | ΕΤρΑξΧρΔ 302 | Ε.Φ.Ε.Τ 303 | Ε.Φ.Ι 304 | Ε.Φ.Ο.Επ.Α 305 | Εβδ 306 | Εβρ 307 | Εγκύκλ.Επιστ 308 | Εγκ 309 | Εε.Αιγ 310 | Εθν.Κ.Τ 311 | Εθν 312 | Ειδ.Δικ.Αγ.Κακ 313 | Εικ 314 | Ειρ.Αθ 315 | Ειρην.Αθ 316 | Ειρην 317 | Έλεγχ 318 | Ειρ 319 | Εισ.Α.Π 320 | Εισ.Ε 321 | Εισ.Ν.Α.Κ 322 | Εισ.Ν.Κ.Πολ.Δ 323 | Εισ.Πρωτ 324 | Εισηγ.Έκθ 325 | Εισ 326 | Εκκλ 327 | Εκκ 328 | Εκ 329 | Ελλ.Δνη 330 | Εν.Ε 331 | Εξ 332 | Επ.Αν 333 | Επ.Εργ.Δ 334 | Επ.Εφ 335 | Επ.Κυπ.Δ 336 | Επ.Μεσ.Αρχ 337 | Επ.Νομ 338 | Επίκτ 339 | Επίκ 340 | Επι.Δ.Ε 341 | Επιθ.Ναυτ.Δικ 342 | Επικ 343 | Επισκ.Ε.Δ 344 | Επισκ.Εμπ.Δικ 345 | Επιστ.Επετ.Αρμ 346 | Επιστ.Επετ 347 | Επιστ.Ιερ 348 | Επιτρ.Προστ.Συνδ.Στελ 349 | Επιφάν 350 | Επτ.Εφ 351 | Επ.Ιρ 352 | Επ.Ι 353 | Εργ.Ασφ.Νομ 354 | Ερμ.Α.Κ 355 | Ερμη.Σ 356 | Εσθ 357 | Εσπερ 358 | Ετρ.Δ 359 | Ευκλ 360 | Ευρ.Δ.Δ.Α 361 | Ευρ.Σ.Δ.Α 362 | Ευρ.ΣτΕ 363 | Ευρατόμ 364 | Ευρ.Άλκ 365 | Ευρ.Ανδρομ 366 | Ευρ.Βάκχ 367 | Ευρ.Εκ 368 | Ευρ.Ελ 369 | Ευρ.Ηλ 370 | Ευρ.Ηρακ 371 | Ευρ.Ηρ 372 | Ευρ.Ηρ.Μαιν 373 | Ευρ.Ικέτ 374 | Ευρ.Ιππόλ 375 | Ευρ.Ιφ.Α 376 | Ευρ.Ιφ.Τ 377 | Ευρ.Ι.Τ 378 | Ευρ.Κύκλ 379 | Ευρ.Μήδ 380 | Ευρ.Ορ 381 | Ευρ.Ρήσ 382 | Ευρ.Τρωάδ 383 | Ευρ.Φοίν 384 | Εφ.Αθ 385 | Εφ.Εν 386 | Εφ.Επ 387 | Εφ.Θρ 388 | Εφ.Θ 389 | Εφ.Ι 390 | Εφ.Κερ 391 | Εφ.Κρ 392 | Εφ.Λ 393 | Εφ.Ν 394 | Εφ.Πατ 395 | Εφ.Πειρ 396 | 
Εφαρμ.Δ.Δ 397 | Εφαρμ 398 | Εφεσ 399 | Εφημ 400 | Εφ 401 | Ζαχ 402 | Ζιγ 403 | Ζυ 404 | Ζχ 405 | ΗΕ.Δ 406 | Ημερ 407 | Ηράκλ 408 | Ηροδ 409 | Ησίοδ 410 | Ησ 411 | Η.Ε.Γ 412 | ΘΗΣ 413 | ΘΡ 414 | Θαλ 415 | Θεοδ 416 | Θεοφ 417 | Θεσ 418 | Θεόδ.Μοψ 419 | Θεόκρ 420 | Θεόφιλ 421 | Θουκ 422 | Θρ 423 | Θρ.Ε 424 | Θρ.Ιερ 425 | Θρ.Ιρ 426 | Ιακ 427 | Ιαν 428 | Ιβ 429 | Ιδθ 430 | Ιδ 431 | Ιεζ 432 | Ιερ 433 | Ιζ 434 | Ιησ 435 | Ιησ.Ν 436 | Ικ 437 | Ιλ 438 | Ιν 439 | Ιουδ 440 | Ιουστ 441 | Ιούδα 442 | Ιούλ 443 | Ιούν 444 | Ιπποκρ 445 | Ιππόλ 446 | Ιρ 447 | Ισίδ.Πηλ 448 | Ισοκρ 449 | Ισ.Ν 450 | Ιωβ 451 | Ιωλ 452 | Ιων 453 | Ιω 454 | ΚΟΣ 455 | ΚΟ.ΜΕ.ΚΟΝ 456 | ΚΠοινΔ 457 | ΚΠολΔ 458 | ΚαΒ 459 | Καλ 460 | Καλ.Τέχν 461 | ΚανΒ 462 | Καν.Διαδ 463 | Κατάργ 464 | Κλ 465 | ΚοινΔ 466 | Κολσ 467 | Κολ 468 | Κον 469 | Κορ 470 | Κος 471 | ΚριτΕπιθ 472 | ΚριτΕ 473 | Κριτ 474 | Κρ 475 | ΚτΒ 476 | ΚτΕ 477 | ΚτΠ 478 | Κυβ 479 | Κυπρ 480 | Κύριλ.Αλεξ 481 | Κύριλ.Ιερ 482 | Λεβ 483 | Λεξ.Σουίδα 484 | Λευϊτ 485 | Λευ 486 | Λκ 487 | Λογ 488 | ΛουκΑμ 489 | Λουκιαν 490 | Λουκ.Έρωτ 491 | Λουκ.Ενάλ.Διάλ 492 | Λουκ.Ερμ 493 | Λουκ.Εταιρ.Διάλ 494 | Λουκ.Ε.Δ 495 | Λουκ.Θε.Δ 496 | Λουκ.Ικ. 497 | Λουκ.Ιππ 498 | Λουκ.Λεξιφ 499 | Λουκ.Μεν 500 | Λουκ.Μισθ.Συν 501 | Λουκ.Ορχ 502 | Λουκ.Περ 503 | Λουκ.Συρ 504 | Λουκ.Τοξ 505 | Λουκ.Τυρ 506 | Λουκ.Φιλοψ 507 | Λουκ.Φιλ 508 | Λουκ.Χάρ 509 | Λουκ. 510 | Λουκ.Αλ 511 | Λοχ 512 | Λυδ 513 | Λυκ 514 | Λυσ 515 | Λωζ 516 | Λ1 517 | Λ2 518 | ΜΟΕφ 519 | Μάρκ 520 | Μέν 521 | Μαλ 522 | Ματθ 523 | Μα 524 | Μιχ 525 | Μκ 526 | Μλ 527 | Μμ 528 | Μον.Δ.Π 529 | Μον.Πρωτ 530 | Μον 531 | Μρ 532 | Μτ 533 | Μχ 534 | Μ.Βασ 535 | Μ.Πλ 536 | ΝΑ 537 | Ναυτ.Χρον 538 | Να 539 | Νδικ 540 | Νεεμ 541 | Νε 542 | Νικ 543 | ΝκΦ 544 | Νμ 545 | ΝοΒ 546 | Νομ.Δελτ.Τρ.Ελ 547 | Νομ.Δελτ 548 | Νομ.Σ.Κ 549 | Νομ.Χρ 550 | Νομ 551 | Νομ.Διεύθ 552 | Νοσ 553 | Ντ 554 | Νόσων 555 | Ν1 556 | Ν2 557 | Ν3 558 | Ν4 559 | Νtot 560 | Ξενοφ 561 | Ξεν 562 | Ξεν.Ανάβ 563 | Ξεν.Απολ 564 | Ξεν.Απομν 565 | Ξεν.Απομ 566 | Ξεν.Ελλ 567 | Ξεν.Ιέρ 568 | Ξεν.Ιππαρχ 569 | Ξεν.Ιππ 570 | Ξεν.Κυρ.Αν 571 | Ξεν.Κύρ.Παιδ 572 | Ξεν.Κ.Π 573 | Ξεν.Λακ.Πολ 574 | Ξεν.Οικ 575 | Ξεν.Προσ 576 | Ξεν.Συμπόσ 577 | Ξεν.Συμπ 578 | Ο΄ 579 | Οβδ 580 | Οβ 581 | ΟικΕ 582 | Οικ 583 | Οικ.Πατρ 584 | Οικ.Σύν.Βατ 585 | Ολομ 586 | Ολ 587 | Ολ.Α.Π 588 | Ομ.Ιλ 589 | Ομ.Οδ 590 | ΟπΤοιχ 591 | Οράτ 592 | Ορθ 593 | ΠΡΟ.ΠΟ 594 | Πίνδ 595 | Πίνδ.Ι 596 | Πίνδ.Νεμ 597 | Πίνδ.Ν 598 | Πίνδ.Ολ 599 | Πίνδ.Παθ 600 | Πίνδ.Πυθ 601 | Πίνδ.Π 602 | ΠαγΝμλγ 603 | Παν 604 | Παρμ 605 | Παροιμ 606 | Παρ 607 | Παυσ 608 | Πειθ.Συμβ 609 | ΠειρΝ 610 | Πελ 611 | ΠεντΣτρ 612 | Πεντ 613 | Πεντ.Εφ 614 | ΠερΔικ 615 | Περ.Γεν.Νοσ 616 | Πετ 617 | Πλάτ 618 | Πλάτ.Αλκ 619 | Πλάτ.Αντ 620 | Πλάτ.Αξίοχ 621 | Πλάτ.Απόλ 622 | Πλάτ.Γοργ 623 | Πλάτ.Ευθ 624 | Πλάτ.Θεαίτ 625 | Πλάτ.Κρατ 626 | Πλάτ.Κριτ 627 | Πλάτ.Λύσ 628 | Πλάτ.Μεν 629 | Πλάτ.Νόμ 630 | Πλάτ.Πολιτ 631 | Πλάτ.Πολ 632 | Πλάτ.Πρωτ 633 | Πλάτ.Σοφ. 
634 | Πλάτ.Συμπ 635 | Πλάτ.Τίμ 636 | Πλάτ.Φαίδρ 637 | Πλάτ.Φιλ 638 | Πλημ 639 | Πλούτ 640 | Πλούτ.Άρατ 641 | Πλούτ.Αιμ 642 | Πλούτ.Αλέξ 643 | Πλούτ.Αλκ 644 | Πλούτ.Αντ 645 | Πλούτ.Αρτ 646 | Πλούτ.Ηθ 647 | Πλούτ.Θεμ 648 | Πλούτ.Κάμ 649 | Πλούτ.Καίσ 650 | Πλούτ.Κικ 651 | Πλούτ.Κράσ 652 | Πλούτ.Κ 653 | Πλούτ.Λυκ 654 | Πλούτ.Μάρκ 655 | Πλούτ.Μάρ 656 | Πλούτ.Περ 657 | Πλούτ.Ρωμ 658 | Πλούτ.Σύλλ 659 | Πλούτ.Φλαμ 660 | Πλ 661 | Ποιν.Δικ 662 | Ποιν.Δ 663 | Ποιν.Ν 664 | Ποιν.Χρον 665 | Ποιν.Χρ 666 | Πολ.Δ 667 | Πολ.Πρωτ 668 | Πολ 669 | Πολ.Μηχ 670 | Πολ.Μ 671 | Πρακτ.Αναθ 672 | Πρακτ.Ολ 673 | Πραξ 674 | Πρμ 675 | Πρξ 676 | Πρωτ 677 | Πρ 678 | Πρ.Αν 679 | Πρ.Λογ 680 | Πταισμ 681 | Πυρ.Καλ 682 | Πόλη 683 | Π.Δ 684 | Π.Δ.Άσμ 685 | ΡΜ.Ε 686 | Ρθ 687 | Ρμ 688 | Ρωμ 689 | ΣΠλημ 690 | Σαπφ 691 | Σειρ 692 | Σολ 693 | Σοφ 694 | Σοφ.Αντιγ 695 | Σοφ.Αντ 696 | Σοφ.Αποσ 697 | Σοφ.Απ 698 | Σοφ.Ηλέκ 699 | Σοφ.Ηλ 700 | Σοφ.Οιδ.Κολ 701 | Σοφ.Οιδ.Τύρ 702 | Σοφ.Ο.Τ 703 | Σοφ.Σειρ 704 | Σοφ.Σολ 705 | Σοφ.Τραχ 706 | Σοφ.Φιλοκτ 707 | Σρ 708 | Σ.τ.Ε 709 | Σ.τ.Π 710 | Στρ.Π.Κ 711 | Στ.Ευρ 712 | Συζήτ 713 | Συλλ.Νομολ 714 | Συλ.Νομ 715 | ΣυμβΕπιθ 716 | Συμπ.Ν 717 | Συνθ.Αμ 718 | Συνθ.Ε.Ε 719 | Συνθ.Ε.Κ 720 | Συνθ.Ν 721 | Σφν 722 | Σφ 723 | Σφ.Σλ 724 | Σχ.Πολ.Δ 725 | Σχ.Συντ.Ε 726 | Σωσ 727 | Σύντ 728 | Σ.Πληρ 729 | ΤΘ 730 | ΤΣ.Δ 731 | Τίτ 732 | Τβ 733 | Τελ.Ενημ 734 | Τελ.Κ 735 | Τερτυλ 736 | Τιμ 737 | Τοπ.Α 738 | Τρ.Ο 739 | Τριμ 740 | Τριμ.Πλ 741 | Τρ.Πλημ 742 | Τρ.Π.Δ 743 | Τ.τ.Ε 744 | Ττ 745 | Τωβ 746 | Υγ 747 | Υπερ 748 | Υπ 749 | Υ.Γ 750 | Φιλήμ 751 | Φιλιπ 752 | Φιλ 753 | Φλμ 754 | Φλ 755 | Φορ.Β 756 | Φορ.Δ.Ε 757 | Φορ.Δνη 758 | Φορ.Δ 759 | Φορ.Επ 760 | Φώτ 761 | Χρ.Ι.Δ 762 | Χρ.Ιδ.Δ 763 | Χρ.Ο 764 | Χρυσ 765 | Ψήφ 766 | Ψαλμ 767 | Ψαλ 768 | Ψλ 769 | Ωριγ 770 | Ωσ 771 | Ω.Ρ.Λ 772 | άγν 773 | άγν.ετυμολ 774 | άγ 775 | άκλ 776 | άνθρ 777 | άπ 778 | άρθρ 779 | άρν 780 | άρ 781 | άτ 782 | άψ 783 | ά 784 | έκδ 785 | έκφρ 786 | έμψ 787 | ένθ.αν 788 | έτ 789 | έ.α 790 | ίδ 791 | αβεστ 792 | αβησσ 793 | αγγλ 794 | αγγ 795 | αδημ 796 | αεροναυτ 797 | αερον 798 | αεροπ 799 | αθλητ 800 | αθλ 801 | αθροιστ 802 | αιγυπτ 803 | αιγ 804 | αιτιολ 805 | αιτ 806 | αι 807 | ακαδ 808 | ακκαδ 809 | αλβ 810 | αλλ 811 | αλφαβητ 812 | αμα 813 | αμερικ 814 | αμερ 815 | αμετάβ 816 | αμτβ 817 | αμφιβ 818 | αμφισβ 819 | αμφ 820 | αμ 821 | ανάλ 822 | ανάπτ 823 | ανάτ 824 | αναβ 825 | αναδαν 826 | αναδιπλασ 827 | αναδιπλ 828 | αναδρ 829 | αναλ 830 | αναν 831 | ανασυλλ 832 | ανατολ 833 | ανατομ 834 | ανατυπ 835 | ανατ 836 | αναφορ 837 | αναφ 838 | ανα.ε 839 | ανδρων 840 | ανθρωπολ 841 | ανθρωπ 842 | ανθ 843 | ανομ 844 | αντίτ 845 | αντδ 846 | αντιγρ 847 | αντιθ 848 | αντικ 849 | αντιμετάθ 850 | αντων 851 | αντ 852 | ανωτ 853 | ανόργ 854 | ανών 855 | αορ 856 | απαρέμφ 857 | απαρφ 858 | απαρχ 859 | απαρ 860 | απλολ 861 | απλοπ 862 | αποβ 863 | αποηχηροπ 864 | αποθ 865 | αποκρυφ 866 | αποφ 867 | απρμφ 868 | απρφ 869 | απρόσ 870 | απόδ 871 | απόλ 872 | απόσπ 873 | απόφ 874 | αραβοτουρκ 875 | αραβ 876 | αραμ 877 | αρβαν 878 | αργκ 879 | αριθμτ 880 | αριθμ 881 | αριθ 882 | αρκτικόλ 883 | αρκ 884 | αρμεν 885 | αρμ 886 | αρνητ 887 | αρσ 888 | αρχαιολ 889 | αρχιτεκτ 890 | αρχιτ 891 | αρχκ 892 | αρχ 893 | αρωμουν 894 | αρωμ 895 | αρ 896 | αρ.μετρ 897 | αρ.φ 898 | ασσυρ 899 | αστρολ 900 | αστροναυτ 901 | αστρον 902 | αττ 903 | αυστραλ 904 | αυτοπ 905 | αυτ 906 | αφγαν 907 | αφηρ 908 | αφομ 909 | αφρικ 910 | αχώρ 911 | αόρ 912 | α.α 913 | α/α 914 | α0 915 | βαθμ 916 | βαθ 917 | βαπτ 918 | βασκ 919 | βεβαιωτ 920 | βεβ 921 | βεδ 922 | βενετ 923 | βεν 924 | 
βερβερ 925 | βιβλγρ 926 | βιολ 927 | βιομ 928 | βιοχημ 929 | βιοχ 930 | βλάχ 931 | βλ 932 | βλ.λ 933 | βοταν 934 | βοτ 935 | βουλγαρ 936 | βουλγ 937 | βούλ 938 | βραζιλ 939 | βρετον 940 | βόρ 941 | γαλλ 942 | γενικότ 943 | γενοβ 944 | γεν 945 | γερμαν 946 | γερμ 947 | γεωγρ 948 | γεωλ 949 | γεωμετρ 950 | γεωμ 951 | γεωπ 952 | γεωργ 953 | γλυπτ 954 | γλωσσολ 955 | γλωσσ 956 | γλ 957 | γνμδ 958 | γνμ 959 | γνωμ 960 | γοτθ 961 | γραμμ 962 | γραμ 963 | γρμ 964 | γρ 965 | γυμν 966 | δίδες 967 | δίκ 968 | δίφθ 969 | δαν 970 | δεικτ 971 | δεκατ 972 | δηλ 973 | δημογρ 974 | δημοτ 975 | δημώδ 976 | δημ 977 | διάγρ 978 | διάκρ 979 | διάλεξ 980 | διάλ 981 | διάσπ 982 | διαλεκτ 983 | διατρ 984 | διαφ 985 | διαχ 986 | διδα 987 | διεθν 988 | διεθ 989 | δικον 990 | διστ 991 | δισύλλ 992 | δισ 993 | διφθογγοπ 994 | δογμ 995 | δολ 996 | δοτ 997 | δρμ 998 | δρχ 999 | δρ(α) 1000 | δωρ 1001 | δ 1002 | εβρ 1003 | εγκλπ 1004 | εδ 1005 | εθνολ 1006 | εθν 1007 | ειδικότ 1008 | ειδ 1009 | ειδ.β 1010 | εικ 1011 | ειρ 1012 | εισ 1013 | εκατοστμ 1014 | εκατοστ 1015 | εκατστ.2 1016 | εκατστ.3 1017 | εκατ 1018 | εκδ 1019 | εκκλησ 1020 | εκκλ 1021 | εκ 1022 | ελλην 1023 | ελλ 1024 | ελνστ 1025 | ελπ 1026 | εμβ 1027 | εμφ 1028 | εναλλ 1029 | ενδ 1030 | ενεργ 1031 | ενεστ 1032 | ενικ 1033 | ενν 1034 | εν 1035 | εξέλ 1036 | εξακολ 1037 | εξομάλ 1038 | εξ 1039 | εο 1040 | επέκτ 1041 | επίδρ 1042 | επίθ 1043 | επίρρ 1044 | επίσ 1045 | επαγγελμ 1046 | επανάλ 1047 | επανέκδ 1048 | επιθ 1049 | επικ 1050 | επιμ 1051 | επιρρ 1052 | επιστ 1053 | επιτατ 1054 | επιφ 1055 | επών 1056 | επ 1057 | εργ 1058 | ερμ 1059 | ερρινοπ 1060 | ερωτ 1061 | ετρουσκ 1062 | ετυμ 1063 | ετ 1064 | ευφ 1065 | ευχετ 1066 | εφ 1067 | εύχρ 1068 | ε.α 1069 | ε/υ 1070 | ε0 1071 | ζωγρ 1072 | ζωολ 1073 | ηθικ 1074 | ηθ 1075 | ηλεκτρολ 1076 | ηλεκτρον 1077 | ηλεκτρ 1078 | ημίτ 1079 | ημίφ 1080 | ημιφ 1081 | ηχηροπ 1082 | ηχηρ 1083 | ηχομιμ 1084 | ηχ 1085 | η 1086 | θέατρ 1087 | θεολ 1088 | θετ 1089 | θηλ 1090 | θρακ 1091 | θρησκειολ 1092 | θρησκ 1093 | θ 1094 | ιαπων 1095 | ιατρ 1096 | ιδιωμ 1097 | ιδ 1098 | ινδ 1099 | ιραν 1100 | ισπαν 1101 | ιστορ 1102 | ιστ 1103 | ισχυροπ 1104 | ιταλ 1105 | ιχθυολ 1106 | ιων 1107 | κάτ 1108 | καθ 1109 | κακοσ 1110 | καν 1111 | καρ 1112 | κατάλ 1113 | κατατ 1114 | κατωτ 1115 | κατ 1116 | κα 1117 | κελτ 1118 | κεφ 1119 | κινεζ 1120 | κινημ 1121 | κλητ 1122 | κλιτ 1123 | κλπ 1124 | κλ 1125 | κν 1126 | κοινωνιολ 1127 | κοινων 1128 | κοπτ 1129 | κουτσοβλαχ 1130 | κουτσοβλ 1131 | κπ 1132 | κρ.γν 1133 | κτγ 1134 | κτην 1135 | κτητ 1136 | κτλ 1137 | κτ 1138 | κυριολ 1139 | κυρ 1140 | κύρ 1141 | κ 1142 | κ.ά 1143 | κ.ά.π 1144 | κ.α 1145 | κ.εξ 1146 | κ.επ 1147 | κ.ε 1148 | κ.λπ 1149 | κ.λ.π 1150 | κ.ού.κ 1151 | κ.ο.κ 1152 | κ.τ.λ 1153 | κ.τ.τ 1154 | κ.τ.ό 1155 | λέξ 1156 | λαογρ 1157 | λαπ 1158 | λατιν 1159 | λατ 1160 | λαϊκότρ 1161 | λαϊκ 1162 | λετ 1163 | λιθ 1164 | λογιστ 1165 | λογοτ 1166 | λογ 1167 | λουβ 1168 | λυδ 1169 | λόγ 1170 | λ 1171 | λ.χ 1172 | μέλλ 1173 | μέσ 1174 | μαθημ 1175 | μαθ 1176 | μαιευτ 1177 | μαλαισ 1178 | μαλτ 1179 | μαμμων 1180 | μεγεθ 1181 | μεε 1182 | μειωτ 1183 | μελ 1184 | μεξ 1185 | μεσν 1186 | μεσογ 1187 | μεσοπαθ 1188 | μεσοφ 1189 | μετάθ 1190 | μεταβτ 1191 | μεταβ 1192 | μετακ 1193 | μεταπλ 1194 | μεταπτωτ 1195 | μεταρ 1196 | μεταφορ 1197 | μετβ 1198 | μετεπιθ 1199 | μετεπιρρ 1200 | μετεωρολ 1201 | μετεωρ 1202 | μετον 1203 | μετουσ 1204 | μετοχ 1205 | μετρ 1206 | μετ 1207 | μητρων 1208 | μηχανολ 1209 | μηχ 1210 | μικροβιολ 1211 | μογγολ 1212 | μορφολ 1213 | μουσ 1214 | μπενελούξ 1215 | μσνλατ 
1216 | μσν 1217 | μτβ 1218 | μτγν 1219 | μτγ 1220 | μτφρδ 1221 | μτφρ 1222 | μτφ 1223 | μτχ 1224 | μυθ 1225 | μυκην 1226 | μυκ 1227 | μφ 1228 | μ 1229 | μ.ε 1230 | μ.μ 1231 | μ.π.ε 1232 | μ.π.π 1233 | μ0 1234 | ναυτ 1235 | νεοελλ 1236 | νεολατιν 1237 | νεολατ 1238 | νεολ 1239 | νεότ 1240 | νλατ 1241 | νομ 1242 | νορβ 1243 | νοσ 1244 | νότ 1245 | ν 1246 | ξ.λ 1247 | οικοδ 1248 | οικολ 1249 | οικον 1250 | οικ 1251 | ολλανδ 1252 | ολλ 1253 | ομηρ 1254 | ομόρρ 1255 | ονομ 1256 | ον 1257 | οπτ 1258 | ορθογρ 1259 | ορθ 1260 | οριστ 1261 | ορυκτολ 1262 | ορυκτ 1263 | ορ 1264 | οσετ 1265 | οσκ 1266 | ουαλ 1267 | ουγγρ 1268 | ουδ 1269 | ουσιαστικοπ 1270 | ουσιαστ 1271 | ουσ 1272 | πίν 1273 | παθητ 1274 | παθολ 1275 | παθ 1276 | παιδ 1277 | παλαιοντ 1278 | παλαιότ 1279 | παλ 1280 | παππων 1281 | παράγρ 1282 | παράγ 1283 | παράλλ 1284 | παράλ 1285 | παραγ 1286 | παρακ 1287 | παραλ 1288 | παραπ 1289 | παρατ 1290 | παρβ 1291 | παρετυμ 1292 | παροξ 1293 | παρων 1294 | παρωχ 1295 | παρ 1296 | παρ.φρ 1297 | πατριδων 1298 | πατρων 1299 | πβ 1300 | περιθ 1301 | περιλ 1302 | περιφρ 1303 | περσ 1304 | περ 1305 | πιθ 1306 | πληθ 1307 | πληροφ 1308 | ποδ 1309 | ποιητ 1310 | πολιτ 1311 | πολλαπλ 1312 | πολ 1313 | πορτογαλ 1314 | πορτ 1315 | ποσ 1316 | πρακριτ 1317 | πρβλ 1318 | πρβ 1319 | πργ 1320 | πρκμ 1321 | πρκ 1322 | πρλ 1323 | προέλ 1324 | προβηγκ 1325 | προελλ 1326 | προηγ 1327 | προθεμ 1328 | προπαραλ 1329 | προπαροξ 1330 | προπερισπ 1331 | προσαρμ 1332 | προσηγορ 1333 | προσταχτ 1334 | προστ 1335 | προσφών 1336 | προσ 1337 | προτακτ 1338 | προτ.Εισ 1339 | προφ 1340 | προχωρ 1341 | πρτ 1342 | πρόθ 1343 | πρόσθ 1344 | πρόσ 1345 | πρότ 1346 | πρ 1347 | πρ.Εφ 1348 | πτ 1349 | πυ 1350 | π 1351 | π.Χ 1352 | π.μ 1353 | π.χ 1354 | ρήμ 1355 | ρίζ 1356 | ρηματ 1357 | ρητορ 1358 | ριν 1359 | ρουμ 1360 | ρωμ 1361 | ρωσ 1362 | ρ 1363 | σανσκρ 1364 | σαξ 1365 | σελ 1366 | σερβοκρ 1367 | σερβ 1368 | σημασιολ 1369 | σημδ 1370 | σημειολ 1371 | σημερ 1372 | σημιτ 1373 | σημ 1374 | σκανδ 1375 | σκυθ 1376 | σκωπτ 1377 | σλαβ 1378 | σλοβ 1379 | σουηδ 1380 | σουμερ 1381 | σουπ 1382 | σπάν 1383 | σπανιότ 1384 | σπ 1385 | σσ 1386 | στατ 1387 | στερ 1388 | στιγμ 1389 | στιχ 1390 | στρέμ 1391 | στρατιωτ 1392 | στρατ 1393 | στ 1394 | συγγ 1395 | συγκρ 1396 | συγκ 1397 | συμπερ 1398 | συμπλεκτ 1399 | συμπλ 1400 | συμπροφ 1401 | συμφυρ 1402 | συμφ 1403 | συνήθ 1404 | συνίζ 1405 | συναίρ 1406 | συναισθ 1407 | συνδετ 1408 | συνδ 1409 | συνεκδ 1410 | συνηρ 1411 | συνθετ 1412 | συνθ 1413 | συνοπτ 1414 | συντελ 1415 | συντομογρ 1416 | συντ 1417 | συν 1418 | συρ 1419 | σχημ 1420 | σχ 1421 | σύγκρ 1422 | σύμπλ 1423 | σύμφ 1424 | σύνδ 1425 | σύνθ 1426 | σύντμ 1427 | σύντ 1428 | σ 1429 | σ.π 1430 | σ/β 1431 | τακτ 1432 | τελ 1433 | τετρ 1434 | τετρ.μ 1435 | τεχνλ 1436 | τεχνολ 1437 | τεχν 1438 | τεύχ 1439 | τηλεπικ 1440 | τηλεόρ 1441 | τιμ 1442 | τιμ.τομ 1443 | τοΣ 1444 | τον 1445 | τοπογρ 1446 | τοπων 1447 | τοπ 1448 | τοσκ 1449 | τουρκ 1450 | τοχ 1451 | τριτοπρόσ 1452 | τροποπ 1453 | τροπ 1454 | τσεχ 1455 | τσιγγ 1456 | ττ 1457 | τυπ 1458 | τόμ 1459 | τόνν 1460 | τ 1461 | τ.μ 1462 | τ.χλμ 1463 | υβρ 1464 | υπερθ 1465 | υπερσ 1466 | υπερ 1467 | υπεύθ 1468 | υποθ 1469 | υποκορ 1470 | υποκ 1471 | υποσημ 1472 | υποτ 1473 | υποφ 1474 | υποχωρ 1475 | υπόλ 1476 | υπόχρ 1477 | υπ 1478 | υστλατ 1479 | υψόμ 1480 | υψ 1481 | φάκ 1482 | φαρμακολ 1483 | φαρμ 1484 | φιλολ 1485 | φιλοσ 1486 | φιλοτ 1487 | φινλ 1488 | φοινικ 1489 | φράγκ 1490 | φρανκον 1491 | φριζ 1492 | φρ 1493 | φυλλ 1494 | φυσιολ 1495 | φυσ 1496 | φωνηεντ 1497 | φωνητ 1498 | φωνολ 
1499 | φων 1500 | φωτογρ 1501 | φ 1502 | φ.τ.μ 1503 | χαμιτ 1504 | χαρτόσ 1505 | χαρτ 1506 | χασμ 1507 | χαϊδ 1508 | χγφ 1509 | χειλ 1510 | χεττ 1511 | χημ 1512 | χιλ 1513 | χλγρ 1514 | χλγ 1515 | χλμ 1516 | χλμ.2 1517 | χλμ.3 1518 | χλσγρ 1519 | χλστγρ 1520 | χλστμ 1521 | χλστμ.2 1522 | χλστμ.3 1523 | χλ 1524 | χργρ 1525 | χρημ 1526 | χρον 1527 | χρ 1528 | χφ 1529 | χ.ε 1530 | χ.κ 1531 | χ.ο 1532 | χ.σ 1533 | χ.τ 1534 | χ.χ 1535 | ψευδ 1536 | ψυχαν 1537 | ψυχιατρ 1538 | ψυχολ 1539 | ψυχ 1540 | ωκεαν 1541 | όμ 1542 | όν 1543 | όπ.παρ 1544 | όπ.π 1545 | ό.π 1546 | ύψ 1547 | 1Βσ 1548 | 1Εσ 1549 | 1Θσ 1550 | 1Ιν 1551 | 1Κρ 1552 | 1Μκ 1553 | 1Πρ 1554 | 1Πτ 1555 | 1Τμ 1556 | 2Βσ 1557 | 2Εσ 1558 | 2Θσ 1559 | 2Ιν 1560 | 2Κρ 1561 | 2Μκ 1562 | 2Πρ 1563 | 2Πτ 1564 | 2Τμ 1565 | 3Βσ 1566 | 3Ιν 1567 | 3Μκ 1568 | 4Βσ 1569 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.en: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 34 | Adj 35 | Adm 36 | Adv 37 | Asst 38 | Bart 39 | Bldg 40 | Brig 41 | Bros 42 | Capt 43 | Cmdr 44 | Col 45 | Comdr 46 | Con 47 | Corp 48 | Cpl 49 | DR 50 | Dr 51 | Drs 52 | Ens 53 | Gen 54 | Gov 55 | Hon 56 | Hr 57 | Hosp 58 | Insp 59 | Lt 60 | MM 61 | MR 62 | MRS 63 | MS 64 | Maj 65 | Messrs 66 | Mlle 67 | Mme 68 | Mr 69 | Mrs 70 | Ms 71 | Msgr 72 | Op 73 | Ord 74 | Pfc 75 | Ph 76 | Prof 77 | Pvt 78 | Rep 79 | Reps 80 | Res 81 | Rev 82 | Rt 83 | Sen 84 | Sens 85 | Sfc 86 | Sgt 87 | Sr 88 | St 89 | Supt 90 | Surg 91 | 92 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 93 | v 94 | vs 95 | i.e 96 | rev 97 | e.g 98 | 99 | #Numbers only. These should only induce breaks when followed by a numeric sequence 100 | # add NUMERIC_ONLY after the word for this function 101 | #This case is mostly for the english "No." which can either be a sentence of its own, or 102 | #if followed by a number, a non-breaking prefix 103 | No #NUMERIC_ONLY# 104 | Nos 105 | Art #NUMERIC_ONLY# 106 | Nr 107 | pp #NUMERIC_ONLY# 108 | 109 | #month abbreviations 110 | Jan 111 | Feb 112 | Mar 113 | Apr 114 | #May is a full word 115 | Jun 116 | Jul 117 | Aug 118 | Sep 119 | Oct 120 | Nov 121 | Dec 122 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.es: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | # Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm 34 | 35 | A.C 36 | Apdo 37 | Av 38 | Bco 39 | CC.AA 40 | Da 41 | Dep 42 | Dn 43 | Dr 44 | Dra 45 | EE.UU 46 | Excmo 47 | FF.CC 48 | Fil 49 | Gral 50 | J.C 51 | Let 52 | Lic 53 | N.B 54 | P.D 55 | P.V.P 56 | Prof 57 | Pts 58 | Rte 59 | S.A 60 | S.A.R 61 | S.E 62 | S.L 63 | S.R.C 64 | Sr 65 | Sra 66 | Srta 67 | Sta 68 | Sto 69 | T.V.E 70 | Tel 71 | Ud 72 | Uds 73 | V.B 74 | V.E 75 | Vd 76 | Vds 77 | a/c 78 | adj 79 | admón 80 | afmo 81 | apdo 82 | av 83 | c 84 | c.f 85 | c.g 86 | cap 87 | cm 88 | cta 89 | dcha 90 | doc 91 | ej 92 | entlo 93 | esq 94 | etc 95 | f.c 96 | gr 97 | grs 98 | izq 99 | kg 100 | km 101 | mg 102 | mm 103 | núm 104 | núm 105 | p 106 | p.a 107 | p.ej 108 | ptas 109 | pág 110 | págs 111 | pág 112 | págs 113 | q.e.g.e 114 | q.e.s.m 115 | s 116 | s.s.s 117 | vid 118 | vol 119 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.fi: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT 2 | #indicate an end-of-sentence marker. Special cases are included for prefixes 3 | #that ONLY appear before 0-9 numbers. 4 | 5 | #This list is compiled from omorfi database 6 | #by Tommi A Pirinen. 7 | 8 | 9 | #any single upper case letter followed by a period is not a sentence ender 10 | A 11 | B 12 | C 13 | D 14 | E 15 | F 16 | G 17 | H 18 | I 19 | J 20 | K 21 | L 22 | M 23 | N 24 | O 25 | P 26 | Q 27 | R 28 | S 29 | T 30 | U 31 | V 32 | W 33 | X 34 | Y 35 | Z 36 | Å 37 | Ä 38 | Ö 39 | 40 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 41 | alik 42 | alil 43 | amir 44 | apul 45 | apul.prof 46 | arkkit 47 | ass 48 | assist 49 | dipl 50 | dipl.arkkit 51 | dipl.ekon 52 | dipl.ins 53 | dipl.kielenk 54 | dipl.kirjeenv 55 | dipl.kosm 56 | dipl.urk 57 | dos 58 | erikoiseläinl 59 | erikoishammasl 60 | erikoisl 61 | erikoist 62 | ev.luutn 63 | evp 64 | fil 65 | ft 66 | hallinton 67 | hallintot 68 | hammaslääket 69 | jatk 70 | jääk 71 | kansaned 72 | kapt 73 | kapt.luutn 74 | kenr 75 | kenr.luutn 76 | kenr.maj 77 | kers 78 | kirjeenv 79 | kom 80 | kom.kapt 81 | komm 82 | konst 83 | korpr 84 | luutn 85 | maist 86 | maj 87 | Mr 88 | Mrs 89 | Ms 90 | M.Sc 91 | neuv 92 | nimim 93 | Ph.D 94 | prof 95 | puh.joht 96 | pääll 97 | res 98 | san 99 | siht 100 | suom 101 | sähköp 102 | säv 103 | toht 104 | toim 105 | toim.apul 106 | toim.joht 107 | toim.siht 108 | tuom 109 | ups 110 | vänr 111 | vääp 112 | ye.ups 113 | ylik 114 | ylil 115 | ylim 116 | ylimatr 117 | yliop 118 | yliopp 119 | ylip 120 | yliv 121 | 122 | #misc - odd period-ending items that NEVER indicate breaks (p.m. 
does NOT fall 123 | #into this category - it sometimes ends a sentence) 124 | e.g 125 | ent 126 | esim 127 | huom 128 | i.e 129 | ilm 130 | l 131 | mm 132 | myöh 133 | nk 134 | nyk 135 | par 136 | po 137 | t 138 | v 139 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.fr: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | # 4 | #any single upper case letter followed by a period is not a sentence ender 5 | #usually upper case letters are initials in a name 6 | #no French words end in single lower-case letters, so we throw those in too? 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | # Period-final abbreviation list for French 61 | A.C.N 62 | A.M 63 | art 64 | ann 65 | apr 66 | av 67 | auj 68 | lib 69 | B.P 70 | boul 71 | ca 72 | c.-à-d 73 | cf 74 | ch.-l 75 | chap 76 | contr 77 | C.P.I 78 | C.Q.F.D 79 | C.N 80 | C.N.S 81 | C.S 82 | dir 83 | éd 84 | e.g 85 | env 86 | al 87 | etc 88 | E.V 89 | ex 90 | fasc 91 | fém 92 | fig 93 | fr 94 | hab 95 | ibid 96 | id 97 | i.e 98 | inf 99 | LL.AA 100 | LL.AA.II 101 | LL.AA.RR 102 | LL.AA.SS 103 | L.D 104 | LL.EE 105 | LL.MM 106 | LL.MM.II.RR 107 | loc.cit 108 | masc 109 | MM 110 | ms 111 | N.B 112 | N.D.A 113 | N.D.L.R 114 | N.D.T 115 | n/réf 116 | NN.SS 117 | N.S 118 | N.D 119 | N.P.A.I 120 | p.c.c 121 | pl 122 | pp 123 | p.ex 124 | p.j 125 | P.S 126 | R.A.S 127 | R.-V 128 | R.P 129 | R.I.P 130 | SS 131 | S.S 132 | S.A 133 | S.A.I 134 | S.A.R 135 | S.A.S 136 | S.E 137 | sec 138 | sect 139 | sing 140 | S.M 141 | S.M.I.R 142 | sq 143 | sqq 144 | suiv 145 | sup 146 | suppl 147 | tél 148 | T.S.V.P 149 | vb 150 | vol 151 | vs 152 | X.O 153 | Z.I 154 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.hu: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | Á 33 | É 34 | Í 35 | Ó 36 | Ö 37 | Ő 38 | Ú 39 | Ü 40 | Ű 41 | 42 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 43 | Dr 44 | dr 45 | kb 46 | Kb 47 | vö 48 | Vö 49 | pl 50 | Pl 51 | ca 52 | Ca 53 | min 54 | Min 55 | max 56 | Max 57 | ún 58 | Ún 59 | prof 60 | Prof 61 | de 62 | De 63 | du 64 | Du 65 | Szt 66 | St 67 | 68 | #Numbers only. 
These should only induce breaks when followed by a numeric sequence 69 | # add NUMERIC_ONLY after the word for this function 70 | #This case is mostly for the english "No." which can either be a sentence of its own, or 71 | #if followed by a number, a non-breaking prefix 72 | 73 | # Month name abbreviations 74 | jan #NUMERIC_ONLY# 75 | Jan #NUMERIC_ONLY# 76 | Feb #NUMERIC_ONLY# 77 | feb #NUMERIC_ONLY# 78 | márc #NUMERIC_ONLY# 79 | Márc #NUMERIC_ONLY# 80 | ápr #NUMERIC_ONLY# 81 | Ápr #NUMERIC_ONLY# 82 | máj #NUMERIC_ONLY# 83 | Máj #NUMERIC_ONLY# 84 | jún #NUMERIC_ONLY# 85 | Jún #NUMERIC_ONLY# 86 | Júl #NUMERIC_ONLY# 87 | júl #NUMERIC_ONLY# 88 | aug #NUMERIC_ONLY# 89 | Aug #NUMERIC_ONLY# 90 | Szept #NUMERIC_ONLY# 91 | szept #NUMERIC_ONLY# 92 | okt #NUMERIC_ONLY# 93 | Okt #NUMERIC_ONLY# 94 | nov #NUMERIC_ONLY# 95 | Nov #NUMERIC_ONLY# 96 | dec #NUMERIC_ONLY# 97 | Dec #NUMERIC_ONLY# 98 | 99 | # Other abbreviations 100 | tel #NUMERIC_ONLY# 101 | Tel #NUMERIC_ONLY# 102 | Fax #NUMERIC_ONLY# 103 | fax #NUMERIC_ONLY# 104 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.is: -------------------------------------------------------------------------------- 1 | no #NUMERIC_ONLY# 2 | No #NUMERIC_ONLY# 3 | nr #NUMERIC_ONLY# 4 | Nr #NUMERIC_ONLY# 5 | nR #NUMERIC_ONLY# 6 | NR #NUMERIC_ONLY# 7 | a 8 | b 9 | c 10 | d 11 | e 12 | f 13 | g 14 | h 15 | i 16 | j 17 | k 18 | l 19 | m 20 | n 21 | o 22 | p 23 | q 24 | r 25 | s 26 | t 27 | u 28 | v 29 | w 30 | x 31 | y 32 | z 33 | ^ 34 | í 35 | á 36 | ó 37 | æ 38 | A 39 | B 40 | C 41 | D 42 | E 43 | F 44 | G 45 | H 46 | I 47 | J 48 | K 49 | L 50 | M 51 | N 52 | O 53 | P 54 | Q 55 | R 56 | S 57 | T 58 | U 59 | V 60 | W 61 | X 62 | Y 63 | Z 64 | ab.fn 65 | a.fn 66 | afs 67 | al 68 | alm 69 | alg 70 | andh 71 | ath 72 | aths 73 | atr 74 | ao 75 | au 76 | aukaf 77 | áfn 78 | áhrl.s 79 | áhrs 80 | ákv.gr 81 | ákv 82 | bh 83 | bls 84 | dr 85 | e.Kr 86 | et 87 | ef 88 | efn 89 | ennfr 90 | eink 91 | end 92 | e.st 93 | erl 94 | fél 95 | fskj 96 | fh 97 | f.hl 98 | físl 99 | fl 100 | fn 101 | fo 102 | forl 103 | frb 104 | frl 105 | frh 106 | frt 107 | fsl 108 | fsh 109 | fs 110 | fsk 111 | fst 112 | f.Kr 113 | ft 114 | fv 115 | fyrrn 116 | fyrrv 117 | germ 118 | gm 119 | gr 120 | hdl 121 | hdr 122 | hf 123 | hl 124 | hlsk 125 | hljsk 126 | hljv 127 | hljóðv 128 | hr 129 | hv 130 | hvk 131 | holl 132 | Hos 133 | höf 134 | hk 135 | hrl 136 | ísl 137 | kaf 138 | kap 139 | Khöfn 140 | kk 141 | kg 142 | kk 143 | km 144 | kl 145 | klst 146 | kr 147 | kt 148 | kgúrsk 149 | kvk 150 | leturbr 151 | lh 152 | lh.nt 153 | lh.þt 154 | lo 155 | ltr 156 | mlja 157 | mljó 158 | millj 159 | mm 160 | mms 161 | m.fl 162 | miðm 163 | mgr 164 | mst 165 | mín 166 | nf 167 | nh 168 | nhm 169 | nl 170 | nk 171 | nmgr 172 | no 173 | núv 174 | nt 175 | o.áfr 176 | o.m.fl 177 | ohf 178 | o.fl 179 | o.s.frv 180 | ófn 181 | ób 182 | óákv.gr 183 | óákv 184 | pfn 185 | PR 186 | pr 187 | Ritstj 188 | Rvík 189 | Rvk 190 | samb 191 | samhlj 192 | samn 193 | samn 194 | sbr 195 | sek 196 | sérn 197 | sf 198 | sfn 199 | sh 200 | sfn 201 | sh 202 | s.hl 203 | sk 204 | skv 205 | sl 206 | sn 207 | so 208 | ss.us 209 | s.st 210 | samþ 211 | sbr 212 | shlj 213 | sign 214 | skál 215 | st 216 | st.s 217 | stk 218 | sþ 219 | teg 220 | tbl 221 | tfn 222 | tl 223 | tvíhlj 224 | tvt 225 | till 226 | to 227 | umr 228 | uh 229 | us 230 | uppl 231 | útg 232 | vb 233 | Vf 234 | vh 235 | vkf 236 | Vl 237 | vl 238 | vlf 239 | vmf 240 | 8vo 241 | vsk 
242 | vth 243 | þt 244 | þf 245 | þjs 246 | þgf 247 | þlt 248 | þolm 249 | þm 250 | þml 251 | þýð 252 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.it: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 34 | Adj 35 | Adm 36 | Adv 37 | Amn 38 | Arch 39 | Asst 40 | Avv 41 | Bart 42 | Bcc 43 | Bldg 44 | Brig 45 | Bros 46 | C.A.P 47 | C.P 48 | Capt 49 | Cc 50 | Cmdr 51 | Co 52 | Col 53 | Comdr 54 | Con 55 | Corp 56 | Cpl 57 | DR 58 | Dott 59 | Dr 60 | Drs 61 | Egr 62 | Ens 63 | Gen 64 | Geom 65 | Gov 66 | Hon 67 | Hosp 68 | Hr 69 | Id 70 | Ing 71 | Insp 72 | Lt 73 | MM 74 | MR 75 | MRS 76 | MS 77 | Maj 78 | Messrs 79 | Mlle 80 | Mme 81 | Mo 82 | Mons 83 | Mr 84 | Mrs 85 | Ms 86 | Msgr 87 | N.B 88 | Op 89 | Ord 90 | P.S 91 | P.T 92 | Pfc 93 | Ph 94 | Prof 95 | Pvt 96 | RP 97 | RSVP 98 | Rag 99 | Rep 100 | Reps 101 | Res 102 | Rev 103 | Rif 104 | Rt 105 | S.A 106 | S.B.F 107 | S.P.M 108 | S.p.A 109 | S.r.l 110 | Sen 111 | Sens 112 | Sfc 113 | Sgt 114 | Sig 115 | Sigg 116 | Soc 117 | Spett 118 | Sr 119 | St 120 | Supt 121 | Surg 122 | V.P 123 | 124 | # other 125 | a.c 126 | acc 127 | all 128 | banc 129 | c.a 130 | c.c.p 131 | c.m 132 | c.p 133 | c.s 134 | c.v 135 | corr 136 | dott 137 | e.p.c 138 | ecc 139 | es 140 | fatt 141 | gg 142 | int 143 | lett 144 | ogg 145 | on 146 | p.c 147 | p.c.c 148 | p.es 149 | p.f 150 | p.r 151 | p.v 152 | post 153 | pp 154 | racc 155 | ric 156 | s.n.c 157 | seg 158 | sgg 159 | ss 160 | tel 161 | u.s 162 | v.r 163 | v.s 164 | 165 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 166 | v 167 | vs 168 | i.e 169 | rev 170 | e.g 171 | 172 | #Numbers only. These should only induce breaks when followed by a numeric sequence 173 | # add NUMERIC_ONLY after the word for this function 174 | #This case is mostly for the english "No." which can either be a sentence of its own, or 175 | #if followed by a number, a non-breaking prefix 176 | No #NUMERIC_ONLY# 177 | Nos 178 | Art #NUMERIC_ONLY# 179 | Nr 180 | pp #NUMERIC_ONLY# 181 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.lv: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | Ā 8 | B 9 | C 10 | Č 11 | D 12 | E 13 | Ē 14 | F 15 | G 16 | Ģ 17 | H 18 | I 19 | Ī 20 | J 21 | K 22 | Ķ 23 | L 24 | Ļ 25 | M 26 | N 27 | Ņ 28 | O 29 | P 30 | Q 31 | R 32 | S 33 | Š 34 | T 35 | U 36 | Ū 37 | V 38 | W 39 | X 40 | Y 41 | Z 42 | Ž 43 | 44 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 45 | dr 46 | Dr 47 | med 48 | prof 49 | Prof 50 | inž 51 | Inž 52 | ist.loc 53 | Ist.loc 54 | kor.loc 55 | Kor.loc 56 | v.i 57 | vietn 58 | Vietn 59 | 60 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 61 | a.l 62 | t.p 63 | pārb 64 | Pārb 65 | vec 66 | Vec 67 | inv 68 | Inv 69 | sk 70 | Sk 71 | spec 72 | Spec 73 | vienk 74 | Vienk 75 | virz 76 | Virz 77 | māksl 78 | Māksl 79 | mūz 80 | Mūz 81 | akad 82 | Akad 83 | soc 84 | Soc 85 | galv 86 | Galv 87 | vad 88 | Vad 89 | sertif 90 | Sertif 91 | folkl 92 | Folkl 93 | hum 94 | Hum 95 | 96 | #Numbers only. These should only induce breaks when followed by a numeric sequence 97 | # add NUMERIC_ONLY after the word for this function 98 | #This case is mostly for the english "No." which can either be a sentence of its own, or 99 | #if followed by a number, a non-breaking prefix 100 | Nr #NUMERIC_ONLY# 101 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.nl: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | #Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen 4 | # http://nl.wikipedia.org/wiki/Aanspreekvorm 5 | # http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs 6 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 7 | #usually upper case letters are initials in a name 8 | A 9 | B 10 | C 11 | D 12 | E 13 | F 14 | G 15 | H 16 | I 17 | J 18 | K 19 | L 20 | M 21 | N 22 | O 23 | P 24 | Q 25 | R 26 | S 27 | T 28 | U 29 | V 30 | W 31 | X 32 | Y 33 | Z 34 | 35 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 36 | bacc 37 | bc 38 | bgen 39 | c.i 40 | dhr 41 | dr 42 | dr.h.c 43 | drs 44 | drs 45 | ds 46 | eint 47 | fa 48 | Fa 49 | fam 50 | gen 51 | genm 52 | ing 53 | ir 54 | jhr 55 | jkvr 56 | jr 57 | kand 58 | kol 59 | lgen 60 | lkol 61 | Lt 62 | maj 63 | Mej 64 | mevr 65 | Mme 66 | mr 67 | mr 68 | Mw 69 | o.b.s 70 | plv 71 | prof 72 | ritm 73 | tint 74 | Vz 75 | Z.D 76 | Z.D.H 77 | Z.E 78 | Z.Em 79 | Z.H 80 | Z.K.H 81 | Z.K.M 82 | Z.M 83 | z.v 84 | 85 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 86 | #we seem to have a lot of these in dutch i.e.: i.p.v - in plaats van (in stead of) never ends a sentence 87 | a.g.v 88 | bijv 89 | bijz 90 | bv 91 | d.w.z 92 | e.c 93 | e.g 94 | e.k 95 | ev 96 | i.p.v 97 | i.s.m 98 | i.t.t 99 | i.v.m 100 | m.a.w 101 | m.b.t 102 | m.b.v 103 | m.h.o 104 | m.i 105 | m.i.v 106 | v.w.t 107 | 108 | #Numbers only. 
These should only induce breaks when followed by a numeric sequence 109 | # add NUMERIC_ONLY after the word for this function 110 | #This case is mostly for the english "No." which can either be a sentence of its own, or 111 | #if followed by a number, a non-breaking prefix 112 | Nr #NUMERIC_ONLY# 113 | Nrs 114 | nrs 115 | nr #NUMERIC_ONLY# 116 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.pl: -------------------------------------------------------------------------------- 1 | adw 2 | afr 3 | akad 4 | al 5 | Al 6 | am 7 | amer 8 | arch 9 | art 10 | Art 11 | artyst 12 | astr 13 | austr 14 | bałt 15 | bdb 16 | bł 17 | bm 18 | br 19 | bryg 20 | bryt 21 | centr 22 | ces 23 | chem 24 | chiń 25 | chir 26 | c.k 27 | c.o 28 | cyg 29 | cyw 30 | cyt 31 | czes 32 | czw 33 | cd 34 | Cd 35 | czyt 36 | ćw 37 | ćwicz 38 | daw 39 | dcn 40 | dekl 41 | demokr 42 | det 43 | diec 44 | dł 45 | dn 46 | dot 47 | dol 48 | dop 49 | dost 50 | dosł 51 | h.c 52 | ds 53 | dst 54 | duszp 55 | dypl 56 | egz 57 | ekol 58 | ekon 59 | elektr 60 | em 61 | ew 62 | fab 63 | farm 64 | fot 65 | fr 66 | gat 67 | gastr 68 | geogr 69 | geol 70 | gimn 71 | głęb 72 | gm 73 | godz 74 | górn 75 | gosp 76 | gr 77 | gram 78 | hist 79 | hiszp 80 | hr 81 | Hr 82 | hot 83 | id 84 | in 85 | im 86 | iron 87 | jn 88 | kard 89 | kat 90 | katol 91 | k.k 92 | kk 93 | kol 94 | kl 95 | k.p.a 96 | kpc 97 | k.p.c 98 | kpt 99 | kr 100 | k.r 101 | krak 102 | k.r.o 103 | kryt 104 | kult 105 | laic 106 | łac 107 | niem 108 | woj 109 | nb 110 | np 111 | Nb 112 | Np 113 | pol 114 | pow 115 | m.in 116 | pt 117 | ps 118 | Pt 119 | Ps 120 | cdn 121 | jw 122 | ryc 123 | rys 124 | Ryc 125 | Rys 126 | tj 127 | tzw 128 | Tzw 129 | tzn 130 | zob 131 | ang 132 | ub 133 | ul 134 | pw 135 | pn 136 | pl 137 | al 138 | k 139 | n 140 | nr #NUMERIC_ONLY# 141 | Nr #NUMERIC_ONLY# 142 | ww 143 | wł 144 | ur 145 | zm 146 | żyd 147 | żarg 148 | żyw 149 | wył 150 | bp 151 | bp 152 | wyst 153 | tow 154 | Tow 155 | o 156 | sp 157 | Sp 158 | st 159 | spółdz 160 | Spółdz 161 | społ 162 | spółgł 163 | stoł 164 | stow 165 | Stoł 166 | Stow 167 | zn 168 | zew 169 | zewn 170 | zdr 171 | zazw 172 | zast 173 | zaw 174 | zał 175 | zal 176 | zam 177 | zak 178 | zakł 179 | zagr 180 | zach 181 | adw 182 | Adw 183 | lek 184 | Lek 185 | med 186 | mec 187 | Mec 188 | doc 189 | Doc 190 | dyw 191 | dyr 192 | Dyw 193 | Dyr 194 | inż 195 | Inż 196 | mgr 197 | Mgr 198 | dh 199 | dr 200 | Dh 201 | Dr 202 | p 203 | P 204 | red 205 | Red 206 | prof 207 | prok 208 | Prof 209 | Prok 210 | hab 211 | płk 212 | Płk 213 | nadkom 214 | Nadkom 215 | podkom 216 | Podkom 217 | ks 218 | Ks 219 | gen 220 | Gen 221 | por 222 | Por 223 | reż 224 | Reż 225 | przyp 226 | Przyp 227 | śp 228 | św 229 | śW 230 | Śp 231 | Św 232 | ŚW 233 | szer 234 | Szer 235 | pkt #NUMERIC_ONLY# 236 | str #NUMERIC_ONLY# 237 | tab #NUMERIC_ONLY# 238 | Tab #NUMERIC_ONLY# 239 | tel 240 | ust #NUMERIC_ONLY# 241 | par #NUMERIC_ONLY# 242 | poz 243 | pok 244 | oo 245 | oO 246 | Oo 247 | OO 248 | r #NUMERIC_ONLY# 249 | l #NUMERIC_ONLY# 250 | s #NUMERIC_ONLY# 251 | najśw 252 | Najśw 253 | A 254 | B 255 | C 256 | D 257 | E 258 | F 259 | G 260 | H 261 | I 262 | J 263 | K 264 | L 265 | M 266 | N 267 | O 268 | P 269 | Q 270 | R 271 | S 272 | T 273 | U 274 | V 275 | W 276 | X 277 | Y 278 | Z 279 | Ś 280 | Ć 281 | Ż 282 | Ź 283 | Dz 284 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.pt: 
-------------------------------------------------------------------------------- 1 | #File adapted for PT by H. Leal Fontes from the EN & DE versions published with moses-2009-04-13. Last update: 10.11.2009. 2 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 3 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 4 | 5 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 6 | #usually upper case letters are initials in a name 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | 61 | #Roman Numerals. A dot after one of these is not a sentence break in Portuguese. 62 | I 63 | II 64 | III 65 | IV 66 | V 67 | VI 68 | VII 69 | VIII 70 | IX 71 | X 72 | XI 73 | XII 74 | XIII 75 | XIV 76 | XV 77 | XVI 78 | XVII 79 | XVIII 80 | XIX 81 | XX 82 | i 83 | ii 84 | iii 85 | iv 86 | v 87 | vi 88 | vii 89 | viii 90 | ix 91 | x 92 | xi 93 | xii 94 | xiii 95 | xiv 96 | xv 97 | xvi 98 | xvii 99 | xviii 100 | xix 101 | xx 102 | 103 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 104 | Adj 105 | Adm 106 | Adv 107 | Art 108 | Ca 109 | Capt 110 | Cmdr 111 | Col 112 | Comdr 113 | Con 114 | Corp 115 | Cpl 116 | DR 117 | DRA 118 | Dr 119 | Dra 120 | Dras 121 | Drs 122 | Eng 123 | Enga 124 | Engas 125 | Engos 126 | Ex 127 | Exo 128 | Exmo 129 | Fig 130 | Gen 131 | Hosp 132 | Insp 133 | Lda 134 | MM 135 | MR 136 | MRS 137 | MS 138 | Maj 139 | Mrs 140 | Ms 141 | Msgr 142 | Op 143 | Ord 144 | Pfc 145 | Ph 146 | Prof 147 | Pvt 148 | Rep 149 | Reps 150 | Res 151 | Rev 152 | Rt 153 | Sen 154 | Sens 155 | Sfc 156 | Sgt 157 | Sr 158 | Sra 159 | Sras 160 | Srs 161 | Sto 162 | Supt 163 | Surg 164 | adj 165 | adm 166 | adv 167 | art 168 | cit 169 | col 170 | con 171 | corp 172 | cpl 173 | dr 174 | dra 175 | dras 176 | drs 177 | eng 178 | enga 179 | engas 180 | engos 181 | ex 182 | exo 183 | exmo 184 | fig 185 | op 186 | prof 187 | sr 188 | sra 189 | sras 190 | srs 191 | sto 192 | 193 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 194 | v 195 | vs 196 | i.e 197 | rev 198 | e.g 199 | 200 | #Numbers only. These should only induce breaks when followed by a numeric sequence 201 | # add NUMERIC_ONLY after the word for this function 202 | #This case is mostly for the english "No." 
which can either be a sentence of its own, or 203 | #if followed by a number, a non-breaking prefix 204 | No #NUMERIC_ONLY# 205 | Nos 206 | Art #NUMERIC_ONLY# 207 | Nr 208 | p #NUMERIC_ONLY# 209 | pp #NUMERIC_ONLY# 210 | 211 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.ro: -------------------------------------------------------------------------------- 1 | A 2 | B 3 | C 4 | D 5 | E 6 | F 7 | G 8 | H 9 | I 10 | J 11 | K 12 | L 13 | M 14 | N 15 | O 16 | P 17 | Q 18 | R 19 | S 20 | T 21 | U 22 | V 23 | W 24 | X 25 | Y 26 | Z 27 | dpdv 28 | etc 29 | șamd 30 | M.Ap.N 31 | dl 32 | Dl 33 | d-na 34 | D-na 35 | dvs 36 | Dvs 37 | pt 38 | Pt 39 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.ru: -------------------------------------------------------------------------------- 1 | # added Cyrillic uppercase letters [А-Я] 2 | # removed 000D carriage return (this is not removed by chomp in tokenizer.perl, and prevents recognition of the prefixes) 3 | # edited by Kate Young (nspaceanalysis@earthlink.net) 21 May 2013 4 | А 5 | Б 6 | В 7 | Г 8 | Д 9 | Е 10 | Ж 11 | З 12 | И 13 | Й 14 | К 15 | Л 16 | М 17 | Н 18 | О 19 | П 20 | Р 21 | С 22 | Т 23 | У 24 | Ф 25 | Х 26 | Ц 27 | Ч 28 | Ш 29 | Щ 30 | Ъ 31 | Ы 32 | Ь 33 | Э 34 | Ю 35 | Я 36 | A 37 | B 38 | C 39 | D 40 | E 41 | F 42 | G 43 | H 44 | I 45 | J 46 | K 47 | L 48 | M 49 | N 50 | O 51 | P 52 | Q 53 | R 54 | S 55 | T 56 | U 57 | V 58 | W 59 | X 60 | Y 61 | Z 62 | 0гг 63 | 1гг 64 | 2гг 65 | 3гг 66 | 4гг 67 | 5гг 68 | 6гг 69 | 7гг 70 | 8гг 71 | 9гг 72 | 0г 73 | 1г 74 | 2г 75 | 3г 76 | 4г 77 | 5г 78 | 6г 79 | 7г 80 | 8г 81 | 9г 82 | Xвв 83 | Vвв 84 | Iвв 85 | Lвв 86 | Mвв 87 | Cвв 88 | Xв 89 | Vв 90 | Iв 91 | Lв 92 | Mв 93 | Cв 94 | 0м 95 | 1м 96 | 2м 97 | 3м 98 | 4м 99 | 5м 100 | 6м 101 | 7м 102 | 8м 103 | 9м 104 | 0мм 105 | 1мм 106 | 2мм 107 | 3мм 108 | 4мм 109 | 5мм 110 | 6мм 111 | 7мм 112 | 8мм 113 | 9мм 114 | 0см 115 | 1см 116 | 2см 117 | 3см 118 | 4см 119 | 5см 120 | 6см 121 | 7см 122 | 8см 123 | 9см 124 | 0дм 125 | 1дм 126 | 2дм 127 | 3дм 128 | 4дм 129 | 5дм 130 | 6дм 131 | 7дм 132 | 8дм 133 | 9дм 134 | 0л 135 | 1л 136 | 2л 137 | 3л 138 | 4л 139 | 5л 140 | 6л 141 | 7л 142 | 8л 143 | 9л 144 | 0км 145 | 1км 146 | 2км 147 | 3км 148 | 4км 149 | 5км 150 | 6км 151 | 7км 152 | 8км 153 | 9км 154 | 0га 155 | 1га 156 | 2га 157 | 3га 158 | 4га 159 | 5га 160 | 6га 161 | 7га 162 | 8га 163 | 9га 164 | 0кг 165 | 1кг 166 | 2кг 167 | 3кг 168 | 4кг 169 | 5кг 170 | 6кг 171 | 7кг 172 | 8кг 173 | 9кг 174 | 0т 175 | 1т 176 | 2т 177 | 3т 178 | 4т 179 | 5т 180 | 6т 181 | 7т 182 | 8т 183 | 9т 184 | 0г 185 | 1г 186 | 2г 187 | 3г 188 | 4г 189 | 5г 190 | 6г 191 | 7г 192 | 8г 193 | 9г 194 | 0мг 195 | 1мг 196 | 2мг 197 | 3мг 198 | 4мг 199 | 5мг 200 | 6мг 201 | 7мг 202 | 8мг 203 | 9мг 204 | бульв 205 | в 206 | вв 207 | г 208 | га 209 | гг 210 | гл 211 | гос 212 | д 213 | дм 214 | доп 215 | др 216 | е 217 | ед 218 | ед 219 | зам 220 | и 221 | инд 222 | исп 223 | Исп 224 | к 225 | кап 226 | кг 227 | кв 228 | кл 229 | км 230 | кол 231 | комн 232 | коп 233 | куб 234 | л 235 | лиц 236 | лл 237 | м 238 | макс 239 | мг 240 | мин 241 | мл 242 | млн 243 | млрд 244 | мм 245 | н 246 | наб 247 | нач 248 | неуд 249 | ном 250 | о 251 | обл 252 | обр 253 | общ 254 | ок 255 | ост 256 | отл 257 | п 258 | пер 259 | перераб 260 | пл 261 | пос 262 | пр 263 | просп 264 | проф 265 | р 266 | ред 267 | руб 268 | с 269 | сб 270 | св 271 | см 
272 | соч 273 | ср 274 | ст 275 | стр 276 | т 277 | тел 278 | Тел 279 | тех 280 | тт 281 | туп 282 | тыс 283 | уд 284 | ул 285 | уч 286 | физ 287 | х 288 | хор 289 | ч 290 | чел 291 | шт 292 | экз 293 | э 294 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.sk: -------------------------------------------------------------------------------- 1 | Bc 2 | Mgr 3 | RNDr 4 | PharmDr 5 | PhDr 6 | JUDr 7 | PaedDr 8 | ThDr 9 | Ing 10 | MUDr 11 | MDDr 12 | MVDr 13 | Dr 14 | ThLic 15 | PhD 16 | ArtD 17 | ThDr 18 | Dr 19 | DrSc 20 | CSs 21 | prof 22 | obr 23 | Obr 24 | Č 25 | č 26 | absol 27 | adj 28 | admin 29 | adr 30 | Adr 31 | adv 32 | advok 33 | afr 34 | ak 35 | akad 36 | akc 37 | akuz 38 | et 39 | al 40 | alch 41 | amer 42 | anat 43 | angl 44 | Angl 45 | anglosas 46 | anorg 47 | ap 48 | apod 49 | arch 50 | archeol 51 | archit 52 | arg 53 | art 54 | astr 55 | astrol 56 | astron 57 | atp 58 | atď 59 | austr 60 | Austr 61 | aut 62 | belg 63 | Belg 64 | bibl 65 | Bibl 66 | biol 67 | bot 68 | bud 69 | bás 70 | býv 71 | cest 72 | chem 73 | cirk 74 | csl 75 | čs 76 | Čs 77 | dat 78 | dep 79 | det 80 | dial 81 | diaľ 82 | dipl 83 | distrib 84 | dokl 85 | dosl 86 | dopr 87 | dram 88 | duš 89 | dv 90 | dvojčl 91 | dór 92 | ekol 93 | ekon 94 | el 95 | elektr 96 | elektrotech 97 | energet 98 | epic 99 | est 100 | etc 101 | etonym 102 | eufem 103 | európ 104 | Európ 105 | ev 106 | evid 107 | expr 108 | fa 109 | fam 110 | farm 111 | fem 112 | feud 113 | fil 114 | filat 115 | filoz 116 | fi 117 | fon 118 | form 119 | fot 120 | fr 121 | Fr 122 | franc 123 | Franc 124 | fraz 125 | fut 126 | fyz 127 | fyziol 128 | garb 129 | gen 130 | genet 131 | genpor 132 | geod 133 | geogr 134 | geol 135 | geom 136 | germ 137 | gr 138 | Gr 139 | gréc 140 | Gréc 141 | gréckokat 142 | hebr 143 | herald 144 | hist 145 | hlav 146 | hosp 147 | hromad 148 | hud 149 | hypok 150 | ident 151 | i.e 152 | ident 153 | imp 154 | impf 155 | indoeur 156 | inf 157 | inform 158 | instr 159 | int 160 | interj 161 | inšt 162 | inštr 163 | iron 164 | jap 165 | Jap 166 | jaz 167 | jedn 168 | juhoamer 169 | juhových 170 | juhozáp 171 | juž 172 | kanad 173 | Kanad 174 | kanc 175 | kapit 176 | kpt 177 | kart 178 | katastr 179 | knih 180 | kniž 181 | komp 182 | konj 183 | konkr 184 | kozmet 185 | krajč 186 | kresť 187 | kt 188 | kuch 189 | lat 190 | latinskoamer 191 | lek 192 | lex 193 | lingv 194 | lit 195 | litur 196 | log 197 | lok 198 | max 199 | Max 200 | maď 201 | Maď 202 | medzinár 203 | mest 204 | metr 205 | mil 206 | Mil 207 | min 208 | Min 209 | miner 210 | ml 211 | mld 212 | mn 213 | mod 214 | mytol 215 | napr 216 | nar 217 | Nar 218 | nasl 219 | nedok 220 | neg 221 | negat 222 | neklas 223 | nem 224 | Nem 225 | neodb 226 | neos 227 | neskl 228 | nesklon 229 | nespis 230 | nespráv 231 | neved 232 | než 233 | niekt 234 | niž 235 | nom 236 | náb 237 | nákl 238 | námor 239 | nár 240 | obch 241 | obj 242 | obv 243 | obyč 244 | obč 245 | občian 246 | odb 247 | odd 248 | ods 249 | ojed 250 | okr 251 | Okr 252 | opt 253 | opyt 254 | org 255 | os 256 | osob 257 | ot 258 | ovoc 259 | par 260 | part 261 | pejor 262 | pers 263 | pf 264 | Pf 265 | P.f 266 | p.f 267 | pl 268 | Plk 269 | pod 270 | podst 271 | pokl 272 | polit 273 | politol 274 | polygr 275 | pomn 276 | popl 277 | por 278 | porad 279 | porov 280 | posch 281 | potrav 282 | použ 283 | poz 284 | pozit 285 | poľ 286 | poľno 287 | poľnohosp 288 | poľov 289 | pošt 290 | pož 291 | prac 292 | predl 293 | pren 294 | 
prep 295 | preuk 296 | priezv 297 | Priezv 298 | privl 299 | prof 300 | práv 301 | príd 302 | príj 303 | prík 304 | príp 305 | prír 306 | prísl 307 | príslov 308 | príč 309 | psych 310 | publ 311 | pís 312 | písm 313 | pôv 314 | refl 315 | reg 316 | rep 317 | resp 318 | rozk 319 | rozlič 320 | rozpráv 321 | roč 322 | Roč 323 | ryb 324 | rádiotech 325 | rím 326 | samohl 327 | semest 328 | sev 329 | severoamer 330 | severových 331 | severozáp 332 | sg 333 | skr 334 | skup 335 | sl 336 | Sloven 337 | soc 338 | soch 339 | sociol 340 | sp 341 | spol 342 | Spol 343 | spoloč 344 | spoluhl 345 | správ 346 | spôs 347 | st 348 | star 349 | starogréc 350 | starorím 351 | s.r.o 352 | stol 353 | stor 354 | str 355 | stredoamer 356 | stredoškol 357 | subj 358 | subst 359 | superl 360 | sv 361 | sz 362 | súkr 363 | súp 364 | súvzť 365 | tal 366 | Tal 367 | tech 368 | tel 369 | Tel 370 | telef 371 | teles 372 | telev 373 | teol 374 | trans 375 | turist 376 | tuzem 377 | typogr 378 | tzn 379 | tzv 380 | ukaz 381 | ul 382 | Ul 383 | umel 384 | univ 385 | ust 386 | ved 387 | vedľ 388 | verb 389 | veter 390 | vin 391 | viď 392 | vl 393 | vod 394 | vodohosp 395 | pnl 396 | vulg 397 | vyj 398 | vys 399 | vysokoškol 400 | vzťaž 401 | vôb 402 | vých 403 | výd 404 | výrob 405 | výsk 406 | výsl 407 | výtv 408 | výtvar 409 | význ 410 | včel 411 | vš 412 | všeob 413 | zahr 414 | zar 415 | zariad 416 | zast 417 | zastar 418 | zastaráv 419 | zb 420 | zdravot 421 | združ 422 | zjemn 423 | zlat 424 | zn 425 | Zn 426 | zool 427 | zr 428 | zried 429 | zv 430 | záhr 431 | zák 432 | zákl 433 | zám 434 | záp 435 | západoeur 436 | zázn 437 | územ 438 | účt 439 | čast 440 | čes 441 | Čes 442 | čl 443 | čísl 444 | živ 445 | pr 446 | fak 447 | Kr 448 | p.n.l 449 | A 450 | B 451 | C 452 | D 453 | E 454 | F 455 | G 456 | H 457 | I 458 | J 459 | K 460 | L 461 | M 462 | N 463 | O 464 | P 465 | Q 466 | R 467 | S 468 | T 469 | U 470 | V 471 | W 472 | X 473 | Y 474 | Z 475 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.sl: -------------------------------------------------------------------------------- 1 | dr 2 | Dr 3 | itd 4 | itn 5 | št #NUMERIC_ONLY# 6 | Št #NUMERIC_ONLY# 7 | d 8 | jan 9 | Jan 10 | feb 11 | Feb 12 | mar 13 | Mar 14 | apr 15 | Apr 16 | jun 17 | Jun 18 | jul 19 | Jul 20 | avg 21 | Avg 22 | sept 23 | Sept 24 | sep 25 | Sep 26 | okt 27 | Okt 28 | nov 29 | Nov 30 | dec 31 | Dec 32 | tj 33 | Tj 34 | npr 35 | Npr 36 | sl 37 | Sl 38 | op 39 | Op 40 | gl 41 | Gl 42 | oz 43 | Oz 44 | prev 45 | dipl 46 | ing 47 | prim 48 | Prim 49 | cf 50 | Cf 51 | gl 52 | Gl 53 | A 54 | B 55 | C 56 | D 57 | E 58 | F 59 | G 60 | H 61 | I 62 | J 63 | K 64 | L 65 | M 66 | N 67 | O 68 | P 69 | Q 70 | R 71 | S 72 | T 73 | U 74 | V 75 | W 76 | X 77 | Y 78 | Z 79 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.sv: -------------------------------------------------------------------------------- 1 | #single upper case letter are usually initials 2 | A 3 | B 4 | C 5 | D 6 | E 7 | F 8 | G 9 | H 10 | I 11 | J 12 | K 13 | L 14 | M 15 | N 16 | O 17 | P 18 | Q 19 | R 20 | S 21 | T 22 | U 23 | V 24 | W 25 | X 26 | Y 27 | Z 28 | #misc abbreviations 29 | AB 30 | G 31 | VG 32 | dvs 33 | etc 34 | from 35 | iaf 36 | jfr 37 | kl 38 | kr 39 | mao 40 | mfl 41 | mm 42 | osv 43 | pga 44 | tex 45 | tom 46 | vs 47 | -------------------------------------------------------------------------------- 
/data/nonbreaking_prefixes/nonbreaking_prefix.ta: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | அ 7 | ஆ 8 | இ 9 | ஈ 10 | உ 11 | ஊ 12 | எ 13 | ஏ 14 | ஐ 15 | ஒ 16 | ஓ 17 | ஔ 18 | ஃ 19 | க 20 | கா 21 | கி 22 | கீ 23 | கு 24 | கூ 25 | கெ 26 | கே 27 | கை 28 | கொ 29 | கோ 30 | கௌ 31 | க் 32 | ச 33 | சா 34 | சி 35 | சீ 36 | சு 37 | சூ 38 | செ 39 | சே 40 | சை 41 | சொ 42 | சோ 43 | சௌ 44 | ச் 45 | ட 46 | டா 47 | டி 48 | டீ 49 | டு 50 | டூ 51 | டெ 52 | டே 53 | டை 54 | டொ 55 | டோ 56 | டௌ 57 | ட் 58 | த 59 | தா 60 | தி 61 | தீ 62 | து 63 | தூ 64 | தெ 65 | தே 66 | தை 67 | தொ 68 | தோ 69 | தௌ 70 | த் 71 | ப 72 | பா 73 | பி 74 | பீ 75 | பு 76 | பூ 77 | பெ 78 | பே 79 | பை 80 | பொ 81 | போ 82 | பௌ 83 | ப் 84 | ற 85 | றா 86 | றி 87 | றீ 88 | று 89 | றூ 90 | றெ 91 | றே 92 | றை 93 | றொ 94 | றோ 95 | றௌ 96 | ற் 97 | ய 98 | யா 99 | யி 100 | யீ 101 | யு 102 | யூ 103 | யெ 104 | யே 105 | யை 106 | யொ 107 | யோ 108 | யௌ 109 | ய் 110 | ர 111 | ரா 112 | ரி 113 | ரீ 114 | ரு 115 | ரூ 116 | ரெ 117 | ரே 118 | ரை 119 | ரொ 120 | ரோ 121 | ரௌ 122 | ர் 123 | ல 124 | லா 125 | லி 126 | லீ 127 | லு 128 | லூ 129 | லெ 130 | லே 131 | லை 132 | லொ 133 | லோ 134 | லௌ 135 | ல் 136 | வ 137 | வா 138 | வி 139 | வீ 140 | வு 141 | வூ 142 | வெ 143 | வே 144 | வை 145 | வொ 146 | வோ 147 | வௌ 148 | வ் 149 | ள 150 | ளா 151 | ளி 152 | ளீ 153 | ளு 154 | ளூ 155 | ளெ 156 | ளே 157 | ளை 158 | ளொ 159 | ளோ 160 | ளௌ 161 | ள் 162 | ழ 163 | ழா 164 | ழி 165 | ழீ 166 | ழு 167 | ழூ 168 | ழெ 169 | ழே 170 | ழை 171 | ழொ 172 | ழோ 173 | ழௌ 174 | ழ் 175 | ங 176 | ஙா 177 | ஙி 178 | ஙீ 179 | ஙு 180 | ஙூ 181 | ஙெ 182 | ஙே 183 | ஙை 184 | ஙொ 185 | ஙோ 186 | ஙௌ 187 | ங் 188 | ஞ 189 | ஞா 190 | ஞி 191 | ஞீ 192 | ஞு 193 | ஞூ 194 | ஞெ 195 | ஞே 196 | ஞை 197 | ஞொ 198 | ஞோ 199 | ஞௌ 200 | ஞ் 201 | ண 202 | ணா 203 | ணி 204 | ணீ 205 | ணு 206 | ணூ 207 | ணெ 208 | ணே 209 | ணை 210 | ணொ 211 | ணோ 212 | ணௌ 213 | ண் 214 | ந 215 | நா 216 | நி 217 | நீ 218 | நு 219 | நூ 220 | நெ 221 | நே 222 | நை 223 | நொ 224 | நோ 225 | நௌ 226 | ந் 227 | ம 228 | மா 229 | மி 230 | மீ 231 | மு 232 | மூ 233 | மெ 234 | மே 235 | மை 236 | மொ 237 | மோ 238 | மௌ 239 | ம் 240 | ன 241 | னா 242 | னி 243 | னீ 244 | னு 245 | னூ 246 | னெ 247 | னே 248 | னை 249 | னொ 250 | னோ 251 | னௌ 252 | ன் 253 | 254 | 255 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 256 | திரு 257 | திருமதி 258 | வண 259 | கௌரவ 260 | 261 | 262 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 263 | உ.ம் 264 | #கா.ம் 265 | #எ.ம் 266 | 267 | 268 | #Numbers only. These should only induce breaks when followed by a numeric sequence 269 | # add NUMERIC_ONLY after the word for this function 270 | #This case is mostly for the english "No." 
which can either be a sentence of its own, or 271 | #if followed by a number, a non-breaking prefix 272 | No #NUMERIC_ONLY# 273 | Nos 274 | Art #NUMERIC_ONLY# 275 | Nr 276 | pp #NUMERIC_ONLY# 277 | -------------------------------------------------------------------------------- /data/preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script preprocesses bitext with Byte Pair Encoding for NMT. 3 | # Executes the following steps: 4 | # 1. Tokenize source and target side of bitext 5 | # 2. Learn BPE-codes for both source and target side 6 | # 3. Encode source and target side using the codes learned 7 | # 4. Shuffle bitext for SGD 8 | # 5. Build source and target dictionaries 9 | 10 | if [ "$#" -ne 4 ]; then 11 | echo "" 12 | echo "Usage: $0 src trg path_to_data path_to_subword" 13 | echo "" 14 | exit 1 15 | fi 16 | 17 | if [ -z $PYTHON ]; then 18 | if [ -n `which python3` ]; then 19 | export PYTHON=python3 20 | else 21 | if [ -n `which python2` ]; then 22 | export PYTHON=python2 23 | else 24 | if [ -n `which python` ]; then 25 | export PYTHON=python 26 | fi 27 | fi 28 | fi 29 | fi 30 | 31 | if [ -z $PYTHON ]; then 32 | echo "Please set PYTHON to a Python interpreter" 33 | exit 1 34 | fi 35 | 36 | echo "Using $PYTHON" 37 | 38 | # number of merge ops (codes) for bpe 39 | SRC_CODE_SIZE=20000 40 | TRG_CODE_SIZE=20000 41 | 42 | # source language (example: fr) 43 | S=$1 44 | # target language (example: en) 45 | T=$2 46 | 47 | # path to dl4mt/data 48 | P1=$3 49 | 50 | # path to subword NMT scripts (can be downloaded from https://github.com/rsennrich/subword-nmt) 51 | P2=$4 52 | 53 | 54 | # merge all parallel corpora 55 | ./merge.sh $1 $2 $3 56 | 57 | # tokenize training and validation data 58 | perl $P1/tokenizer.perl -threads 5 -l $S < ${P1}/all_${S}-${T}.${S} > ${P1}/all_${S}-${T}.${S}.tok 59 | perl $P1/tokenizer.perl -threads 5 -l $T < ${P1}/all_${S}-${T}.${T} > ${P1}/all_${S}-${T}.${T}.tok 60 | perl $P1/tokenizer.perl -threads 5 -l $S < ${P1}/test2011/newstest2011.${S} > ${P1}/newstest2011.${S}.tok 61 | perl $P1/tokenizer.perl -threads 5 -l $T < ${P1}/test2011/newstest2011.${T} > ${P1}/newstest2011.${T}.tok 62 | 63 | # BPE 64 | if [ ! -f "${S}.bpe" ]; then 65 | $PYTHON $P2/learn_bpe.py -s ${SRC_CODE_SIZE} < all_${S}-${T}.${S}.tok > ${S}.bpe 66 | fi 67 | if [ ! -f "${T}.bpe" ]; then 68 | $PYTHON $P2/learn_bpe.py -s ${TRG_CODE_SIZE} < all_${S}-${T}.${T}.tok > ${T}.bpe 69 | fi 70 | 71 | # utility function to encode a file with bpe 72 | encode () { 73 | if [ ! 
-f "$3" ]; then 74 | $PYTHON $P2/apply_bpe.py -c $1 < $2 > $3 75 | else 76 | echo "$3 exists, pass" 77 | fi 78 | } 79 | 80 | # apply bpe to training data 81 | encode ${S}.bpe ${P1}/all_${S}-${T}.${S}.tok ${P1}/all_${S}-${T}.${S}.tok.bpe 82 | encode ${T}.bpe ${P1}/all_${S}-${T}.${T}.tok ${P1}/all_${S}-${T}.${T}.tok.bpe 83 | encode ${S}.bpe ${P1}/newstest2011.${S}.tok ${P1}/newstest2011.${S}.tok.bpe 84 | encode ${T}.bpe ${P1}/newstest2011.${T}.tok ${P1}/newstest2011.${T}.tok.bpe 85 | 86 | # shuffle 87 | $PYTHON $P1/shuffle.py all_${S}-${T}.${S}.tok.bpe all_${S}-${T}.${T}.tok.bpe 88 | 89 | # build dictionary 90 | $PYTHON $P1/build_dictionary.py all_${S}-${T}.${S}.tok.bpe 91 | $PYTHON $P1/build_dictionary.py all_${S}-${T}.${T}.tok.bpe 92 | 93 | -------------------------------------------------------------------------------- /data/scan_example.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy 4 | import theano 5 | 6 | from theano import tensor 7 | 8 | 9 | # some numbers 10 | n_steps = 10 11 | n_samples = 5 12 | dim = 10 13 | input_dim = 20 14 | output_dim = 2 15 | 16 | 17 | # one step function that will be used by scan 18 | def oneStep(x_t, h_tm1, W_x, W_h, W_o): 19 | 20 | h_t = tensor.tanh(tensor.dot(x_t, W_x) + 21 | tensor.dot(h_tm1, W_h)) 22 | o_t = tensor.dot(h_t, W_o) 23 | 24 | return h_t, o_t 25 | 26 | # spawn theano tensor variable, our symbolic input 27 | # a 3D tensor (n_steps, n_samples, dim) 28 | x = tensor.tensor3(dtype='float32') 29 | 30 | # initial state of our rnn 31 | init_state = tensor.alloc(0., n_samples, dim) 32 | 33 | # create parameters that we will use, 34 | # note that, parameters are theano shared variables 35 | 36 | # parameters for input to hidden states 37 | W_x_ = numpy.random.randn(input_dim, dim).astype('float32') 38 | W_x = theano.shared(W_x_) 39 | 40 | # parameters for hidden state transition 41 | W_h_ = numpy.random.randn(dim, dim).astype('float32') 42 | W_h = theano.shared(W_h_) 43 | 44 | # parameters from hidden state to output 45 | W_o_ = numpy.random.randn(dim, output_dim).astype('float32') 46 | W_o = theano.shared(W_o_) 47 | 48 | # scan function 49 | ([h_vals, o_vals], updates) = theano.scan( 50 | fn=oneStep, 51 | sequences=[x], 52 | outputs_info=[init_state, None], 53 | non_sequences=[W_x, W_h, W_o], 54 | n_steps=n_steps, 55 | strict=True) 56 | 57 | # let us now compile a function to get the output 58 | f = theano.function([x], [h_vals, o_vals]) 59 | 60 | # now we will call the compiled function with actual input 61 | actual_input = numpy.random.randn( 62 | n_steps, n_samples, input_dim).astype('float32') 63 | h_vals_, o_vals_ = f(actual_input) 64 | 65 | # print the shapes 66 | print('shape of input :', actual_input.shape) 67 | print('shape of h_vals:', h_vals_.shape) 68 | print('shape of o_vals:', o_vals_.shape) 69 | -------------------------------------------------------------------------------- /data/setup_cluster_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script sets up development and data environments for 3 | # fionn cluster, copy under your home directory and run. 
4 | 5 | if [ -z $PYTHON ]; then 6 | if [ -n `which python3` ]; then 7 | export PYTHON=python3 8 | else 9 | if [ -n `which python2` ]; then 10 | export PYTHON=python2 11 | else 12 | if [ -n `which python` ]; then 13 | export PYTHON=python 14 | fi 15 | fi 16 | fi 17 | fi 18 | 19 | if [ -z $PYTHON ]; then 20 | echo "Please set PYTHON to a Python interpreter" 21 | exit 1 22 | fi 23 | 24 | echo "Using $PYTHON" 25 | 26 | # this file is for the dependencies 27 | LOCAL_INSTALL_FILE=/ichec/work/dl4mt_data/local_install.tgz 28 | 29 | # code directory for cloned repositories 30 | CODE_DIR=${HOME}/codes/dl4mt-material 31 | 32 | # code repository 33 | CODE_CENTRAL=https://github.com/nyu-dl/dl4mt-tutorial 34 | 35 | # reference files directory 36 | REF_DATA_DIR=/ichec/work/dl4mt_data/nec_files 37 | 38 | # our input files will reside here 39 | DATA_DIR=${HOME}/data 40 | 41 | # our trained models will be saved here 42 | MODELS_DIR=${HOME}/models 43 | 44 | # theano repository 45 | THEANO_GIT=https://github.com/Theano/Theano.git 46 | 47 | # theano install dir 48 | THEANO_DIR=${HOME}/repo/Theano 49 | 50 | # move to home directory 51 | cd 52 | 53 | # copy dependency file to your local and extract 54 | echo "Copying and extracting dependency file" 55 | rsync --bwlimit=20000 -Pavz ${LOCAL_INSTALL_FILE} ${HOME} 56 | tar zxvf ${HOME}/local_install.tgz 57 | 58 | # clone the repository from github into code directory 59 | echo "Cloning lab repository" 60 | if [ ! -d "${CODE_DIR}" ]; then 61 | mkdir -p ${CODE_DIR} 62 | fi 63 | git clone ${CODE_CENTRAL} ${CODE_DIR} 64 | 65 | # copy corpora, dictionaries, etc. for training and dev 66 | echo "Copying data" 67 | if [ ! -d "${DATA_DIR}" ]; then 68 | mkdir -p ${DATA_DIR} 69 | fi 70 | rsync --bwlimit=20000 -Pavz ${REF_DATA_DIR}/all.* ${DATA_DIR} 71 | rsync --bwlimit=20000 -Pavz ${REF_DATA_DIR}/news* ${DATA_DIR} 72 | 73 | # create model output directory if it does not exist 74 | if [ ! -d "${MODELS_DIR}" ]; then 75 | mkdir -p ${MODELS_DIR} 76 | fi 77 | 78 | # clone and install Theano 79 | echo "Cloning/installing Theano" 80 | mkdir -p ${THEANO_DIR} 81 | git clone ${THEANO_GIT} ${THEANO_DIR} 82 | cd ${THEANO_DIR} 83 | $PYTHON setup.py install --user 84 | 85 | # check if theano is working 86 | $PYTHON -c "from __future__ import print_function; import theano; print('theano available!')" 87 | 88 | -------------------------------------------------------------------------------- /data/setup_local_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script sets up development and data environments for 3 | # a local machine, copy under your home directory and run. 4 | # Note that Theano is NOT installed by this script. 5 | # To use Byte Pair Encoding, simply pass -b argument. 
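# Usage: ./setup_local_env.sh [-b]
#   -b  preprocess with Byte Pair Encoding: clones rsennrich/subword-nmt and
#       runs preprocess.sh instead of the plain tokenization pipeline below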
6 | 7 | BPE=false 8 | 9 | while getopts ':b' flag; do 10 | case "${flag}" in 11 | b) BPE=true 12 | echo "Using Byte Pair Encoding" ;; 13 | *) 14 | echo "" 15 | echo "Usage: $0 [-b]" 16 | echo "" 17 | exit 1 ;; 18 | esac 19 | done 20 | 21 | if [ -z $PYTHON ]; then 22 | if [ -n `which python3` ]; then 23 | export PYTHON=python3 24 | else 25 | if [ -n `which python2` ]; then 26 | export PYTHON=python2 27 | else 28 | if [ -n `which python` ]; then 29 | export PYTHON=python 30 | fi 31 | fi 32 | fi 33 | fi 34 | 35 | if [ -z $PYTHON ]; then 36 | echo "Please set PYTHON to a Python interpreter" 37 | exit 1 38 | fi 39 | 40 | echo "Using $PYTHON" 41 | 42 | # code directory for cloned repositories 43 | SCRIPT_DIR=$( dirname "${BASH_SOURCE[0]}" ) 44 | CODE_DIR=${SCRIPT_DIR}/.. 45 | 46 | # code repository 47 | CODE_CENTRAL=https://github.com/nyu-dl/dl4mt-tutorial 48 | 49 | # our input files will reside here 50 | DATA_DIR=${CODE_DIR}/data 51 | 52 | # our trained models will be saved here 53 | MODELS_DIR=${HOME}/models 54 | 55 | 56 | # clone the repository from github into code directory 57 | if [ ! -d "${CODE_DIR}" ]; then 58 | echo "Cloning central ..." 59 | mkdir -p ${CODE_DIR} 60 | git clone ${CODE_CENTRAL} ${CODE_DIR} 61 | fi 62 | 63 | # download the europarl v7 and validation sets and extract 64 | if [ ! -f ${DATA_DIR}/train_data.tgz ]; then 65 | curl -o ${DATA_DIR}/train_data.tgz http://www.statmt.org/europarl/v7/fr-en.tgz 66 | else 67 | echo "${DATA_DIR}/train_data.tgz exists" 68 | fi 69 | if [ ! -f ${DATA_DIR}/valid_data.tgz ]; then 70 | curl -o ${DATA_DIR}/valid_data.tgz http://matrix.statmt.org/test_sets/newstest2011.tgz 71 | else 72 | echo "${DATA_DIR}/valid_data.tgz exists" 73 | fi 74 | $PYTHON ${CODE_DIR}/data/extract_files.py \ 75 | -s='fr' -t='en' \ 76 | --source-dev=newstest2011.fr \ 77 | --target-dev=newstest2011.en \ 78 | --outdir=${DATA_DIR} 79 | 80 | if [ "$BPE" = true ] ; then 81 | 82 | BPE_DIR=${HOME}/codes/subword-nmt 83 | BPE_CENTRAL=https://github.com/rsennrich/subword-nmt 84 | 85 | # clone subword-nmt repository 86 | if [ ! -d "${BPE_DIR}" ]; then 87 | echo "Cloning BPE central ..." 88 | mkdir -p ${BPE_DIR} 89 | git clone ${BPE_CENTRAL} ${BPE_DIR} 90 | fi 91 | 92 | # follow the preprocessing pipeline for BPE 93 | ./preprocess.sh 'fr' 'en' ${DATA_DIR} ${BPE_DIR} 94 | 95 | else 96 | 97 | # tokenize corresponding files 98 | perl ${CODE_DIR}/data/tokenizer.perl -l 'fr' < ${DATA_DIR}/test2011/newstest2011.fr > ${DATA_DIR}/newstest2011.fr.tok 99 | perl ${CODE_DIR}/data/tokenizer.perl -l 'en' < ${DATA_DIR}/test2011/newstest2011.en > ${DATA_DIR}/newstest2011.en.tok 100 | perl ${CODE_DIR}/data/tokenizer.perl -l 'fr' < ${DATA_DIR}/europarl-v7.fr-en.fr > ${DATA_DIR}/europarl-v7.fr-en.fr.tok 101 | perl ${CODE_DIR}/data/tokenizer.perl -l 'en' < ${DATA_DIR}/europarl-v7.fr-en.en > ${DATA_DIR}/europarl-v7.fr-en.en.tok 102 | 103 | # extract dictionaries 104 | $PYTHON ${CODE_DIR}/data/build_dictionary.py ${DATA_DIR}/europarl-v7.fr-en.fr.tok 105 | $PYTHON ${CODE_DIR}/data/build_dictionary.py ${DATA_DIR}/europarl-v7.fr-en.en.tok 106 | 107 | # shuffle training data 108 | $PYTHON ${CODE_DIR}/data/shuffle.py ${DATA_DIR}/europarl-v7.fr-en.en.tok ${DATA_DIR}/europarl-v7.fr-en.fr.tok 109 | fi 110 | 111 | # create model output directory if it does not exist 112 | if [ ! 
-d "${MODELS_DIR}" ]; then 113 | mkdir -p ${MODELS_DIR} 114 | fi 115 | 116 | # check if theano is working 117 | $PYTHON -c "from __future__ import print_function; import theano; print('theano available!')" 118 | -------------------------------------------------------------------------------- /data/shuffle.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | import sys 5 | import random 6 | 7 | from tempfile import mkstemp 8 | from subprocess import call 9 | 10 | 11 | 12 | def main(files): 13 | 14 | tf_os, tpath = mkstemp() 15 | tf = open(tpath, 'w') 16 | 17 | fds = [open(ff) for ff in files] 18 | 19 | for l in fds[0]: 20 | lines = [l.strip()] + [ff.readline().strip() for ff in fds[1:]] 21 | print("|||".join(lines), file=tf) 22 | 23 | [ff.close() for ff in fds] 24 | tf.close() 25 | 26 | tf = open(tpath, 'r') 27 | lines = tf.readlines() 28 | random.shuffle(lines) 29 | 30 | fds = [open(ff+'.shuf','w') for ff in files] 31 | 32 | for l in lines: 33 | s = l.strip().split('|||') 34 | for ii, fd in enumerate(fds): 35 | print(s[ii], file=fd) 36 | 37 | [ff.close() for ff in fds] 38 | 39 | os.remove(tpath) 40 | 41 | if __name__ == '__main__': 42 | main(sys.argv[1:]) 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /data/strip_sgml.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import sys 4 | import re 5 | 6 | 7 | def main(): 8 | fin = sys.stdin 9 | fout = sys.stdout 10 | for l in fin: 11 | line = l.strip() 12 | text = re.sub('<[^<]+>', "", line).strip() 13 | if len(text) == 0: 14 | continue 15 | print(text, file=fout) 16 | 17 | 18 | if __name__ == "__main__": 19 | main() 20 | 21 | -------------------------------------------------------------------------------- /data/tokenize_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for F in `ls ./training/* | grep -v pkl | grep -v tok` 4 | do 5 | echo "perl ./tokenizer.perl -l ${F:(-2)} < $F > $F.tok" 6 | perl ./tokenizer.perl -l ${F:(-2)} < $F > $F.tok 7 | done 8 | 9 | for F in `ls ./dev/*.?? | grep -v tok` 10 | do 11 | echo "perl ./tokenizer.perl -l ${F:(-2)} < $F > $F.tok" 12 | perl ./tokenizer.perl -l ${F:(-2)} < $F > $F.tok 13 | done 14 | -------------------------------------------------------------------------------- /data/tokenizer.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 
5 | 6 | use warnings; 7 | 8 | # Sample Tokenizer 9 | ### Version 1.1 10 | # written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn 11 | # Version 1.1 updates: 12 | # (1) add multithreading option "-threads NUM_THREADS" (default is 1); 13 | # (2) add a timing option "-time" to calculate the average speed of this tokenizer; 14 | # (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed); 15 | ### Version 1.0 16 | # $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $ 17 | # written by Josh Schroeder, based on code by Philipp Koehn 18 | 19 | binmode(STDIN, ":utf8"); 20 | binmode(STDOUT, ":utf8"); 21 | 22 | use warnings; 23 | use FindBin qw($RealBin); 24 | use strict; 25 | use Time::HiRes; 26 | 27 | if (eval {require Thread;1;}) { 28 | #module loaded 29 | Thread->import(); 30 | } 31 | 32 | my $mydir = "$RealBin/nonbreaking_prefixes"; 33 | 34 | my %NONBREAKING_PREFIX = (); 35 | my @protected_patterns = (); 36 | my $protected_patterns_file = ""; 37 | my $language = "en"; 38 | my $QUIET = 0; 39 | my $HELP = 0; 40 | my $AGGRESSIVE = 0; 41 | my $SKIP_XML = 0; 42 | my $TIMING = 0; 43 | my $NUM_THREADS = 1; 44 | my $NUM_SENTENCES_PER_THREAD = 2000; 45 | my $PENN = 0; 46 | my $NO_ESCAPING = 0; 47 | while (@ARGV) 48 | { 49 | $_ = shift; 50 | /^-b$/ && ($| = 1, next); 51 | /^-l$/ && ($language = shift, next); 52 | /^-q$/ && ($QUIET = 1, next); 53 | /^-h$/ && ($HELP = 1, next); 54 | /^-x$/ && ($SKIP_XML = 1, next); 55 | /^-a$/ && ($AGGRESSIVE = 1, next); 56 | /^-time$/ && ($TIMING = 1, next); 57 | # Option to add list of regexps to be protected 58 | /^-protected/ && ($protected_patterns_file = shift, next); 59 | /^-threads$/ && ($NUM_THREADS = int(shift), next); 60 | /^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next); 61 | /^-penn$/ && ($PENN = 1, next); 62 | /^-no-escape/ && ($NO_ESCAPING = 1, next); 63 | } 64 | 65 | # for time calculation 66 | my $start_time; 67 | if ($TIMING) 68 | { 69 | $start_time = [ Time::HiRes::gettimeofday( ) ]; 70 | } 71 | 72 | # print help message 73 | if ($HELP) 74 | { 75 | print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n"; 76 | print "Options:\n"; 77 | print " -q ... quiet.\n"; 78 | print " -a ... aggressive hyphen splitting.\n"; 79 | print " -b ... disable Perl buffering.\n"; 80 | print " -time ... enable processing time calculation.\n"; 81 | print " -penn ... use Penn treebank-like tokenization.\n"; 82 | print " -protected FILE ... specify file with patters to be protected in tokenisation.\n"; 83 | print " -no-escape ... 
don't perform HTML escaping on apostrophy, quotes, etc.\n"; 84 | exit; 85 | } 86 | 87 | if (!$QUIET) 88 | { 89 | print STDERR "Tokenizer Version 1.1\n"; 90 | print STDERR "Language: $language\n"; 91 | print STDERR "Number of threads: $NUM_THREADS\n"; 92 | } 93 | 94 | # load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes 95 | load_prefixes($language,\%NONBREAKING_PREFIX); 96 | 97 | if (scalar(%NONBREAKING_PREFIX) eq 0) 98 | { 99 | print STDERR "Warning: No known abbreviations for language '$language'\n"; 100 | } 101 | 102 | # Load protected patterns 103 | if ($protected_patterns_file) 104 | { 105 | open(PP,$protected_patterns_file) || die "Unable to open $protected_patterns_file"; 106 | while() { 107 | chomp; 108 | push @protected_patterns, $_; 109 | } 110 | } 111 | 112 | my @batch_sentences = (); 113 | my @thread_list = (); 114 | my $count_sentences = 0; 115 | 116 | if ($NUM_THREADS > 1) 117 | {# multi-threading tokenization 118 | while() 119 | { 120 | $count_sentences = $count_sentences + 1; 121 | push(@batch_sentences, $_); 122 | if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS)) 123 | { 124 | # assign each thread work 125 | for (my $i=0; $i<$NUM_THREADS; $i++) 126 | { 127 | my $start_index = $i*$NUM_SENTENCES_PER_THREAD; 128 | my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1; 129 | my @subbatch_sentences = @batch_sentences[$start_index..$end_index]; 130 | my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences; 131 | push(@thread_list, $new_thread); 132 | } 133 | foreach (@thread_list) 134 | { 135 | my $tokenized_list = $_->join; 136 | foreach (@$tokenized_list) 137 | { 138 | print $_; 139 | } 140 | } 141 | # reset for the new run 142 | @thread_list = (); 143 | @batch_sentences = (); 144 | } 145 | } 146 | # the last batch 147 | if (scalar(@batch_sentences)>0) 148 | { 149 | # assign each thread work 150 | for (my $i=0; $i<$NUM_THREADS; $i++) 151 | { 152 | my $start_index = $i*$NUM_SENTENCES_PER_THREAD; 153 | if ($start_index >= scalar(@batch_sentences)) 154 | { 155 | last; 156 | } 157 | my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1; 158 | if ($end_index >= scalar(@batch_sentences)) 159 | { 160 | $end_index = scalar(@batch_sentences)-1; 161 | } 162 | my @subbatch_sentences = @batch_sentences[$start_index..$end_index]; 163 | my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences; 164 | push(@thread_list, $new_thread); 165 | } 166 | foreach (@thread_list) 167 | { 168 | my $tokenized_list = $_->join; 169 | foreach (@$tokenized_list) 170 | { 171 | print $_; 172 | } 173 | } 174 | } 175 | } 176 | else 177 | {# single thread only 178 | while() 179 | { 180 | if (($SKIP_XML && /^<.+>$/) || /^\s*$/) 181 | { 182 | #don't try to tokenize XML/HTML tag lines 183 | print $_; 184 | } 185 | else 186 | { 187 | print &tokenize($_); 188 | } 189 | } 190 | } 191 | 192 | if ($TIMING) 193 | { 194 | my $duration = Time::HiRes::tv_interval( $start_time ); 195 | print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n"); 196 | print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." 
milliseconds/line\n"); 197 | } 198 | 199 | ##################################################################################### 200 | # subroutines afterward 201 | 202 | # tokenize a batch of texts saved in an array 203 | # input: an array containing a batch of texts 204 | # return: another array containing a batch of tokenized texts for the input array 205 | sub tokenize_batch 206 | { 207 | my(@text_list) = @_; 208 | my(@tokenized_list) = (); 209 | foreach (@text_list) 210 | { 211 | if (($SKIP_XML && /^<.+>$/) || /^\s*$/) 212 | { 213 | #don't try to tokenize XML/HTML tag lines 214 | push(@tokenized_list, $_); 215 | } 216 | else 217 | { 218 | push(@tokenized_list, &tokenize($_)); 219 | } 220 | } 221 | return \@tokenized_list; 222 | } 223 | 224 | # the actual tokenize function which tokenizes one input string 225 | # input: one string 226 | # return: the tokenized string for the input string 227 | sub tokenize 228 | { 229 | my($text) = @_; 230 | 231 | if ($PENN) { 232 | return tokenize_penn($text); 233 | } 234 | 235 | chomp($text); 236 | $text = " $text "; 237 | 238 | # remove ASCII junk 239 | $text =~ s/\s+/ /g; 240 | $text =~ s/[\000-\037]//g; 241 | 242 | # Find protected patterns 243 | my @protected = (); 244 | foreach my $protected_pattern (@protected_patterns) { 245 | my $t = $text; 246 | while ($t =~ /($protected_pattern)(.*)$/) { 247 | push @protected, $1; 248 | $t = $2; 249 | } 250 | } 251 | 252 | for (my $i = 0; $i < scalar(@protected); ++$i) { 253 | my $subst = sprintf("THISISPROTECTED%.3d", $i); 254 | $text =~ s,\Q$protected[$i], $subst ,g; 255 | } 256 | $text =~ s/ +/ /g; 257 | $text =~ s/^ //g; 258 | $text =~ s/ $//g; 259 | 260 | # seperate out all "other" special characters 261 | $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g; 262 | 263 | # aggressive hyphen splitting 264 | if ($AGGRESSIVE) 265 | { 266 | $text =~ s/([\p{IsAlnum}])\-(?=[\p{IsAlnum}])/$1 \@-\@ /g; 267 | } 268 | 269 | #multi-dots stay together 270 | $text =~ s/\.([\.]+)/ DOTMULTI$1/g; 271 | while($text =~ /DOTMULTI\./) 272 | { 273 | $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g; 274 | $text =~ s/DOTMULTI\./DOTDOTMULTI/g; 275 | } 276 | 277 | # seperate out "," except if within numbers (5,300) 278 | #$text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; 279 | 280 | # separate out "," except if within numbers (5,300) 281 | # previous "global" application skips some: A,B,C,D,E > A , B,C , D,E 282 | # first application uses up B so rule can't see B,C 283 | # two-step version here may create extra spaces but these are removed later 284 | # will also space digit,letter or letter,digit forms (redundant with next section) 285 | $text =~ s/([^\p{IsN}])[,]/$1 , /g; 286 | $text =~ s/[,]([^\p{IsN}])/ , $1/g; 287 | 288 | # separate , pre and post number 289 | #$text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; 290 | #$text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g; 291 | 292 | # turn `into ' 293 | #$text =~ s/\`/\'/g; 294 | 295 | #turn '' into " 296 | #$text =~ s/\'\'/ \" /g; 297 | 298 | if ($language eq "en") 299 | { 300 | #split contractions right 301 | $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 302 | $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g; 303 | $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 304 | $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g; 305 | #special case for "1990's" 306 | $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g; 307 | } 308 | elsif (($language eq "fr") or ($language eq "it")) 309 | { 310 | #split contractions left 311 | $text =~ 
s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 312 | $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g; 313 | $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 314 | $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g; 315 | } 316 | else 317 | { 318 | $text =~ s/\'/ \' /g; 319 | } 320 | 321 | #word token method 322 | my @words = split(/\s/,$text); 323 | $text = ""; 324 | for (my $i=0;$i<(scalar(@words));$i++) 325 | { 326 | my $word = $words[$i]; 327 | if ( $word =~ /^(\S+)\.$/) 328 | { 329 | my $pre = $1; 330 | if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i/\>/g; # xml 371 | $text =~ s/\'/\'/g; # xml 372 | $text =~ s/\"/\"/g; # xml 373 | $text =~ s/\[/\[/g; # syntax non-terminal 374 | $text =~ s/\]/\]/g; # syntax non-terminal 375 | } 376 | 377 | #ensure final line break 378 | $text .= "\n" unless $text =~ /\n$/; 379 | 380 | return $text; 381 | } 382 | 383 | sub tokenize_penn 384 | { 385 | # Improved compatibility with Penn Treebank tokenization. Useful if 386 | # the text is to later be parsed with a PTB-trained parser. 387 | # 388 | # Adapted from Robert MacIntyre's sed script: 389 | # http://www.cis.upenn.edu/~treebank/tokenizer.sed 390 | 391 | my($text) = @_; 392 | chomp($text); 393 | 394 | # remove ASCII junk 395 | $text =~ s/\s+/ /g; 396 | $text =~ s/[\000-\037]//g; 397 | 398 | # attempt to get correct directional quotes 399 | $text =~ s/^``/`` /g; 400 | $text =~ s/^"/`` /g; 401 | $text =~ s/^`([^`])/` $1/g; 402 | $text =~ s/^'/` /g; 403 | $text =~ s/([ ([{<])"/$1 `` /g; 404 | $text =~ s/([ ([{<])``/$1 `` /g; 405 | $text =~ s/([ ([{<])`([^`])/$1 ` $2/g; 406 | $text =~ s/([ ([{<])'/$1 ` /g; 407 | # close quotes handled at end 408 | 409 | $text =~ s=\.\.\.= _ELLIPSIS_ =g; 410 | 411 | # separate out "," except if within numbers (5,300) 412 | $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; 413 | # separate , pre and post number 414 | $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; 415 | $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g; 416 | 417 | #$text =~ s=([;:@#\$%&\p{IsSc}])= $1 =g; 418 | $text =~ s=([;:@#\$%&\p{IsSc}\p{IsSo}])= $1 =g; 419 | 420 | # Separate out intra-token slashes. PTB tokenization doesn't do this, so 421 | # the tokens should be merged prior to parsing with a PTB-trained parser 422 | # (see syntax-hyphen-splitting.perl). 423 | $text =~ s/([\p{IsAlnum}])\/([\p{IsAlnum}])/$1 \@\/\@ $2/g; 424 | 425 | # Assume sentence tokenization has been done first, so split FINAL periods 426 | # only. 427 | $text =~ s=([^.])([.])([\]\)}>"']*) ?$=$1 $2$3 =g; 428 | # however, we may as well split ALL question marks and exclamation points, 429 | # since they shouldn't have the abbrev.-marker ambiguity problem 430 | $text =~ s=([?!])= $1 =g; 431 | 432 | # parentheses, brackets, etc. 433 | $text =~ s=([\]\[\(\){}<>])= $1 =g; 434 | $text =~ s/\(/-LRB-/g; 435 | $text =~ s/\)/-RRB-/g; 436 | $text =~ s/\[/-LSB-/g; 437 | $text =~ s/\]/-RSB-/g; 438 | $text =~ s/{/-LCB-/g; 439 | $text =~ s/}/-RCB-/g; 440 | 441 | $text =~ s=--= -- =g; 442 | 443 | # First off, add a space to the beginning and end of each line, to reduce 444 | # necessary number of regexps. 
445 | $text =~ s=$= =; 446 | $text =~ s=^= =; 447 | 448 | $text =~ s="= '' =g; 449 | # possessive or close-single-quote 450 | $text =~ s=([^'])' =$1 ' =g; 451 | # as in it's, I'm, we'd 452 | $text =~ s='([sSmMdD]) = '$1 =g; 453 | $text =~ s='ll = 'll =g; 454 | $text =~ s='re = 're =g; 455 | $text =~ s='ve = 've =g; 456 | $text =~ s=n't = n't =g; 457 | $text =~ s='LL = 'LL =g; 458 | $text =~ s='RE = 'RE =g; 459 | $text =~ s='VE = 'VE =g; 460 | $text =~ s=N'T = N'T =g; 461 | 462 | $text =~ s= ([Cc])annot = $1an not =g; 463 | $text =~ s= ([Dd])'ye = $1' ye =g; 464 | $text =~ s= ([Gg])imme = $1im me =g; 465 | $text =~ s= ([Gg])onna = $1on na =g; 466 | $text =~ s= ([Gg])otta = $1ot ta =g; 467 | $text =~ s= ([Ll])emme = $1em me =g; 468 | $text =~ s= ([Mm])ore'n = $1ore 'n =g; 469 | $text =~ s= '([Tt])is = '$1 is =g; 470 | $text =~ s= '([Tt])was = '$1 was =g; 471 | $text =~ s= ([Ww])anna = $1an na =g; 472 | 473 | #word token method 474 | my @words = split(/\s/,$text); 475 | $text = ""; 476 | for (my $i=0;$i<(scalar(@words));$i++) 477 | { 478 | my $word = $words[$i]; 479 | if ( $word =~ /^(\S+)\.$/) 480 | { 481 | my $pre = $1; 482 | if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i/\>/g; # xml 511 | $text =~ s/\'/\'/g; # xml 512 | $text =~ s/\"/\"/g; # xml 513 | $text =~ s/\[/\[/g; # syntax non-terminal 514 | $text =~ s/\]/\]/g; # syntax non-terminal 515 | 516 | #ensure final line break 517 | $text .= "\n" unless $text =~ /\n$/; 518 | 519 | return $text; 520 | } 521 | 522 | sub load_prefixes 523 | { 524 | my ($language, $PREFIX_REF) = @_; 525 | 526 | my $prefixfile = "$mydir/nonbreaking_prefix.$language"; 527 | 528 | #default back to English if we don't have a language-specific prefix file 529 | if (!(-e $prefixfile)) 530 | { 531 | $prefixfile = "$mydir/nonbreaking_prefix.en"; 532 | print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n"; 533 | die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile); 534 | } 535 | 536 | if (-e "$prefixfile") 537 | { 538 | open(PREFIX, "<:utf8", "$prefixfile"); 539 | while () 540 | { 541 | my $item = $_; 542 | chomp($item); 543 | if (($item) && (substr($item,0,1) ne "#")) 544 | { 545 | if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) 546 | { 547 | $PREFIX_REF->{$1} = 2; 548 | } 549 | else 550 | { 551 | $PREFIX_REF->{$item} = 1; 552 | } 553 | } 554 | } 555 | close(PREFIX); 556 | } 557 | } 558 | -------------------------------------------------------------------------------- /docs/cgru.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nyu-dl/dl4mt-tutorial/c476acbe325b8f2709d64649a052573219410fae/docs/cgru.pdf -------------------------------------------------------------------------------- /docs/cgru.tex: -------------------------------------------------------------------------------- 1 | \documentclass[11pt, oneside]{article} 2 | \usepackage{geometry} 3 | %\geometry{letterpaper} 4 | \usepackage[parfill]{parskip} 5 | \usepackage{graphicx} 6 | \usepackage{amssymb} 7 | \usepackage{amsthm} 8 | \usepackage{mathtools} 9 | \usepackage{enumerate} 10 | 11 | \theoremstyle{definition} 12 | \newtheorem*{ans*}{Answer} 13 | 14 | \newcommand{\obs}{\text{obs}} 15 | \newcommand{\mis}{\text{mis}} 16 | 17 | \newcommand{\qt}[1]{\left<#1\right>} 18 | \newcommand{\ql}[1]{\left[#1\right]} 19 | \newcommand{\hess}{\mathbf{H}} 20 | \newcommand{\jacob}{\mathbf{J}} 21 | 
%\newcommand{\hl}{HL} 22 | \newcommand{\cost}{\mathcal{L}} 23 | \newcommand{\lout}{\mathbf{r}} 24 | \newcommand{\louti}{r} 25 | \newcommand{\outi}{y} 26 | \newcommand{\out}{\mathbf{y}} 27 | \newcommand{\gauss}{\mathbf{G_N}} 28 | \newcommand{\eye}{\mathbf{I}} 29 | \newcommand{\softmax}{\phi} 30 | \newcommand{\targ}{\mathbf{t}} 31 | \newcommand{\metric}{\mathbf{G}} 32 | \newcommand{\sample}{\mathbf{z}} 33 | \newcommand{\f}{\text{f}} 34 | %\newcommand{\log}{\text{log}} 35 | 36 | \newcommand{\bmx}[0]{\begin{bmatrix}} 37 | \newcommand{\emx}[0]{\end{bmatrix}} 38 | \newcommand{\qexp}[1]{\left<#1\right>} 39 | \newcommand{\vect}[1]{\mathbf{#1}} 40 | \newcommand{\vects}[1]{\boldsymbol{#1}} 41 | \newcommand{\matr}[1]{\mathbf{#1}} 42 | \newcommand{\var}[0]{\operatorname{Var}} 43 | \newcommand{\std}[0]{\operatorname{std}} 44 | \newcommand{\cov}[0]{\operatorname{Cov}} 45 | \newcommand{\diag}[0]{\operatorname{diag}} 46 | \newcommand{\matrs}[1]{\boldsymbol{#1}} 47 | \newcommand{\va}[0]{\vect{a}} 48 | \newcommand{\vb}[0]{\vect{b}} 49 | \newcommand{\vc}[0]{\vect{c}} 50 | \newcommand{\ve}[0]{\vect{e}} 51 | 52 | \newcommand{\vh}[0]{\vect{h}} 53 | \newcommand{\vv}[0]{\vect{v}} 54 | \newcommand{\vx}[0]{\vect{x}} 55 | \newcommand{\vz}[0]{\vect{z}} 56 | \newcommand{\vw}[0]{\vect{w}} 57 | \newcommand{\vs}[0]{\vect{s}} 58 | \newcommand{\vf}[0]{\vect{f}} 59 | \newcommand{\vi}[0]{\vect{i}} 60 | \newcommand{\vo}[0]{\vect{o}} 61 | \newcommand{\vy}[0]{\vect{y}} 62 | \newcommand{\vg}[0]{\vect{g}} 63 | \newcommand{\vm}[0]{\vect{m}} 64 | \newcommand{\vu}[0]{\vect{u}} 65 | \newcommand{\vL}[0]{\vect{L}} 66 | \newcommand{\vr}[0]{\vect{r}} 67 | \newcommand{\vp}[0]{\vect{p}} 68 | \newcommand{\mW}[0]{\matr{W}} 69 | \newcommand{\mP}[0]{\matr{P}} 70 | 71 | \newcommand{\mE}[0]{\matr{E}} 72 | \newcommand{\mG}[0]{\matr{G}} 73 | \newcommand{\mX}[0]{\matr{X}} 74 | \newcommand{\mQ}[0]{\matr{Q}} 75 | \newcommand{\mU}[0]{\matr{U}} 76 | \newcommand{\mF}[0]{\matr{F}} 77 | \newcommand{\mV}[0]{\matr{V}} 78 | \newcommand{\mA}{\matr{A}} 79 | \newcommand{\mC}{\matr{C}} 80 | \newcommand{\mD}{\matr{D}} 81 | \newcommand{\mS}{\matr{S}} 82 | \newcommand{\mI}{\matr{I}} 83 | \newcommand{\td}[0]{\text{d}} 84 | \newcommand{\TT}[0]{\vects{\theta}} 85 | \newcommand{\vsig}[0]{\vects{\sigma}} 86 | \newcommand{\valpha}[0]{\vects{\alpha}} 87 | \newcommand{\vmu}[0]{\vects{\mu}} 88 | \newcommand{\vzero}[0]{\vect{0}} 89 | \newcommand{\tf}[0]{\text{m}} 90 | \newcommand{\tdf}[0]{\text{dm}} 91 | \newcommand{\grad}[0]{\nabla} 92 | \newcommand{\alert}[1]{\textcolor{red}{#1}} 93 | \newcommand{\N}[0]{\mathcal{N}} 94 | \newcommand{\LL}[0]{\mathcal{L}} 95 | \newcommand{\HH}[0]{\mathcal{H}} 96 | \newcommand{\RR}[0]{\mathbb{R}} 97 | \newcommand{\II}[0]{\mathbb{I}} 98 | \newcommand{\Scal}[0]{\mathcal{S}} 99 | \newcommand{\sigmoid}{\sigma} 100 | \newcommand{\E}[0]{\mathbb{E}} 101 | \newcommand{\enabla}[0]{\ensuremath{% 102 | \overset{\raisebox{-0.3ex}[0.5ex][0ex]{% 103 | \ensuremath{\scriptscriptstyle e}}}{\nabla}}} 104 | \newcommand{\enhnabla}[0]{\nabla_{\hspace{-0.5mm}e}\,} 105 | 106 | 107 | \newcommand{\todo}[1]{{\Large\textcolor{red}{#1}}} 108 | \newcommand{\done}[1]{{\Large\textcolor{green}{#1}}} 109 | \newcommand{\dd}[1]{\ensuremath{\mbox{d}#1}} 110 | 111 | \DeclareMathOperator*{\argmax}{\arg \max} 112 | \DeclareMathOperator*{\argmin}{\arg \min} 113 | \newcommand{\newln}{\\&\quad\quad{}} 114 | 115 | \newcommand{\Ax}{\mathcal{A}_x} 116 | \newcommand{\Ay}{\mathcal{A}_y} 117 | \newcommand{\ola}{\overleftarrow} 118 | \newcommand{\ora}{\overrightarrow} 119 | 
\newcommand{\ov}{\overline}
120 | \newcommand{\ts}{\rule{0pt}{2.6ex}} % Top strut
121 | \newcommand{\ms}{\rule{0pt}{0ex}} % Middle strut
122 | \newcommand{\bs}{\rule[-1.2ex]{0pt}{0pt}} % Bottom strut
123 | \newcommand{\specialcell}[2][c]{%
124 | \begin{tabular}[#1]{@{}c@{}}#2\end{tabular}}
125 | 
126 | \newcommand\codeHighlight[1]{\textcolor[rgb]{1,0,0}{#1}}
127 | 
128 | \title{DL4MT-Tutorial: \\Conditional Gated Recurrent Unit with Attention Mechanism}
129 | \author{Orhan Firat \and Kyunghyun Cho}
130 | \date{May 15, 2016}
131 | 
132 | \begin{document}
133 | 
134 | \maketitle
135 | 
136 | This document describes the $gru\_cond\_layer$ used in Session 2 and Session 3.
137 | 
138 | Given a source sequence $(x_1, \dots,x_{T_x})$ of length $T_x$ and a target
139 | sequence $(y_1,\dots,y_{T_y})$, let $\vh_i$ be the annotation of the source symbol
140 | at position $i$, obtained by concatenating the forward and backward encoder RNN
141 | hidden states, $\vh_i = [ \ora{\vh}_i; \ola{\vh}_i ]$. A conditional GRU with attention
142 | mechanism, cGRU$_{\text{att}}$, uses its previous hidden state $\vs_{j-1}$, the
143 | whole set of source annotations $\text{C}=\lbrace\vh_1, \dots, \vh_{T_x}\rbrace$, and
144 | the previously decoded symbol $y_{j-1}$ in order to update its hidden state $\vs_j$,
145 | which is further used to decode the symbol $y_j$ at position $j$,
146 | 
147 | \begin{equation}
148 | \vs_j = \text{cGRU}_{\text{att}}\left( \vs_{j-1}, y_{j-1}, \text{C} \right).
149 | \end{equation}
150 | 
151 | \paragraph{Internals} The conditional GRU layer with attention mechanism,
152 | cGRU$_{\text{att}}$, consists of three components: two
153 | recurrent cells and an attention mechanism ATT in between.
154 | The first recurrent cell, $\text{REC}_1$, combines the previously decoded symbol $y_{j-1}$
155 | and the previous hidden state $\vs_{j-1}$ in order to generate an intermediate
156 | representation $\vs^{\prime}_j$ with the following formulation:
157 | 
158 | \vspace{-10px}
159 | \begin{align}
160 | \vs_j^{\prime} = \text{REC}_1 & \left( y_{j-1}, \vs_{j-1} \right) = (1 - \vz_j^{\prime}) \odot \underline{\vs}_j^{\prime} + \vz_j^{\prime} \odot \vs_{j-1}, \\
161 | \underline{\vs}_j^{\prime} =& ~\text{tanh} \left( \mW^{\prime} \mE[y_{j-1}] + \vr_j^{\prime} \odot (\mU^{\prime}\vs_{j-1}) \right), \\
162 | \vr_j^{\prime} =& ~ \sigma \left( \mW_r^{\prime} \mE[y_{j-1}] + \mU_r^{\prime} \vs_{j-1} \right), \\
163 | \vz_j^{\prime} =& ~ \sigma \left( \mW_z^{\prime} \mE[y_{j-1}] + \mU_z^{\prime} \vs_{j-1} \right),
164 | \end{align}
165 | 
166 | \noindent where $\mE$ is the target word embedding matrix,
167 | $\underline{\vs}_j^{\prime}$ is the proposal intermediate representation, and $\vr_j^{\prime}$
168 | and $\vz_j^{\prime}$ are the reset and update gate activations. In this formulation,
169 | $\mW^{\prime}$, $\mU^{\prime}$, $\mW_r^{\prime}$, $\mU_r^{\prime}$,
170 | $\mW_z^{\prime}$, $\mU_z^{\prime}$ are trained model parameters,\footnote{All
171 | the biases are omitted for simplicity.} and tanh and $\sigma$ are the hyperbolic tangent
172 | and logistic sigmoid activation functions, respectively.
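% --- Illustrative aside (editorial, not part of the original source) ------
% The REC_1 equations above translate almost line-for-line into numpy. A
% minimal sketch under assumed toy sizes: parameter names mirror the symbols
% above, weights are random placeholders, and biases are omitted as in the
% footnote. The actual Theano implementation is the gru_cond_layer in
% session2/nmt.py and session3/nmt.py.
%
%   import numpy as np
%
%   def sigmoid(a):
%       return 1.0 / (1.0 + np.exp(-a))
%
%   n_words, dim_word, dim = 10, 4, 5                # toy sizes (assumed)
%   rng = np.random.RandomState(1234)
%   E = rng.randn(n_words, dim_word)                 # embedding matrix E
%   Wp, Wr, Wz = [rng.randn(dim_word, dim) for _ in range(3)]  # W', W'_r, W'_z
%   Up, Ur, Uz = [rng.randn(dim, dim) for _ in range(3)]       # U', U'_r, U'_z
%
%   def rec1(y_prev, s_prev):
%       """One REC_1 step: intermediate state s'_j from y_{j-1} and s_{j-1}."""
%       emb = E[y_prev]                                        # E[y_{j-1}]
%       r = sigmoid(np.dot(emb, Wr) + np.dot(s_prev, Ur))      # reset gate r'_j
%       z = sigmoid(np.dot(emb, Wz) + np.dot(s_prev, Uz))      # update gate z'_j
%       s_bar = np.tanh(np.dot(emb, Wp) + r * np.dot(s_prev, Up))  # proposal
%       return (1.0 - z) * s_bar + z * s_prev                  # s'_j
%
%   s_prime = rec1(3, np.zeros(dim))
%
% The ATT and REC_2 components described next compose with this in the same
% way: s'_j feeds the attention that yields c_j, and REC_2 combines s'_j with
% c_j to produce the final state s_j.
% ---------------------------------------------------------------------------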
173 | 
174 | The attention mechanism, ATT, takes as input the entire context set C along with the
175 | intermediate hidden state $\vs_j^{\prime}$ in order to compute the context vector
176 | $\vc_j$ as follows:
177 | 
178 | \begin{align}
179 | \vc_j =& \text{ATT} \left( \text{C}, \vs_j^{\prime} \right) = \sum_{i=1}^{T_x} \alpha_{ij} \vh_i , \\
180 | \alpha_{ij} =& \frac{\text{exp}(e_{ij})}{\sum_{k=1}^{T_x} \text{exp}(e_{kj}) } ,\\
181 | e_{ij} =& \vv_a^{\intercal} \text{tanh} \left( \mU_a \vs_j^{\prime} + \mW_a \vh_i \right) ,
182 | \end{align}
183 | 
184 | \noindent where $\alpha_{ij}$ is the normalized alignment weight between the source
185 | symbol at position $i$ and the target symbol at position $j$, and $\vv_a, \mU_a, \mW_a$
186 | are the trained model parameters.
187 | 
188 | Finally, the second recurrent cell, $\text{REC}_2$, generates $\vs_j$, the hidden state of
189 | the $\text{cGRU}_{\text{att}}$, by looking at the intermediate representation
190 | $\vs_j^{\prime}$ and the context vector $\vc_j$ with the following formulation:
191 | 
192 | \begin{align}
193 | \vs_j = \text{REC}_2 & \left( \vs_j^{\prime}, \vc_j \right) = (1 - \vz_j) \odot \underline{\vs}_j + \vz_j \odot \vs_j^{\prime}, \\
194 | \underline{\vs}_j =& \text{tanh} \left( \mW \vc_j + \vr_j \odot (\mU \vs_j^{\prime} ) \right) ,\\
195 | \vr_j =& \sigma \left( \mW_r \vc_j + \mU_r \vs_j^{\prime} \right), \\
196 | \vz_j =& \sigma \left( \mW_z \vc_j + \mU_z \vs_j^{\prime} \right),
197 | \end{align}
198 | 
199 | \noindent where, similarly, $\underline{\vs}_j$ is the proposal hidden state, and
200 | $\vr_j$ and $\vz_j$ are the reset and update gate activations, with the
201 | trained model parameters $\mW, \mU, \mW_r, \mU_r,
202 | \mW_z, \mU_z$.
203 | 
204 | \end{document}
205 | 
--------------------------------------------------------------------------------
/session0/data_iterator.py:
--------------------------------------------------------------------------------
1 | import cPickle as pkl
2 | import gzip
3 | 
4 | 
5 | class TextIterator:
6 |     def __init__(self, source,
7 |                  source_dict,
8 |                  batch_size=128,
9 |                  maxlen=100,
10 |                  n_words_source=-1):
11 |         if source.endswith('.gz'):
12 |             self.source = gzip.open(source, 'r')
13 |         else:
14 |             self.source = open(source, 'r')
15 |         with open(source_dict, 'rb') as f:
16 |             self.source_dict = pkl.load(f)
17 | 
18 |         self.batch_size = batch_size
19 |         self.maxlen = maxlen
20 | 
21 |         self.n_words_source = n_words_source
22 | 
23 |         self.end_of_data = False
24 | 
25 |     def __iter__(self):
26 |         return self
27 | 
28 |     def reset(self):
29 |         self.source.seek(0)
30 | 
31 |     def next(self):
32 |         if self.end_of_data:
33 |             self.end_of_data = False
34 |             self.reset()
35 |             raise StopIteration
36 | 
37 |         source = []
38 | 
39 |         try:
40 | 
41 |             # actual work here
42 |             while True:
43 |                 ss = self.source.readline()
44 |                 if ss == "":
45 |                     raise IOError
46 |                 ss = ss.strip().split()
47 |                 ss = [self.source_dict[w] if w in self.source_dict else 1
48 |                       for w in ss]
49 |                 if self.n_words_source > 0:
50 |                     ss = [w if w < self.n_words_source else 1 for w in ss]
51 | 
52 |                 if len(ss) > self.maxlen:
53 |                     continue
54 | 
55 |                 source.append(ss)
56 | 
57 |                 if len(source) >= self.batch_size:
58 |                     break
59 |         except IOError:
60 |             self.end_of_data = True
61 | 
62 |         if len(source) <= 0:
63 |             self.end_of_data = False
64 |             self.reset()
65 |             raise StopIteration
66 | 
67 |         return source
--------------------------------------------------------------------------------
/session0/lm.py:
--------------------------------------------------------------------------------
1 | '''
2 | Build a simple neural
language model using GRU units 3 | ''' 4 | import theano 5 | import theano.tensor as tensor 6 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 7 | 8 | import cPickle as pkl 9 | import ipdb 10 | import numpy 11 | import copy 12 | 13 | import os 14 | import warnings 15 | import sys 16 | import time 17 | 18 | from collections import OrderedDict 19 | 20 | from data_iterator import TextIterator 21 | 22 | profile = False 23 | 24 | 25 | # push parameters to Theano shared variables 26 | def zipp(params, tparams): 27 | for kk, vv in params.iteritems(): 28 | tparams[kk].set_value(vv) 29 | 30 | 31 | # pull parameters from Theano shared variables 32 | def unzip(zipped): 33 | new_params = OrderedDict() 34 | for kk, vv in zipped.iteritems(): 35 | new_params[kk] = vv.get_value() 36 | return new_params 37 | 38 | 39 | # get the list of parameters: Note that tparams must be OrderedDict 40 | def itemlist(tparams): 41 | return [vv for kk, vv in tparams.iteritems()] 42 | 43 | 44 | # dropout 45 | def dropout_layer(state_before, use_noise, trng): 46 | proj = tensor.switch( 47 | use_noise, 48 | state_before * trng.binomial(state_before.shape, p=0.5, n=1, 49 | dtype=state_before.dtype), 50 | state_before * 0.5) 51 | return proj 52 | 53 | 54 | # make prefix-appended name 55 | def _p(pp, name): 56 | return '%s_%s' % (pp, name) 57 | 58 | 59 | # initialize Theano shared variables according to the initial parameters 60 | def init_tparams(params): 61 | tparams = OrderedDict() 62 | for kk, pp in params.iteritems(): 63 | tparams[kk] = theano.shared(params[kk], name=kk) 64 | return tparams 65 | 66 | 67 | # load parameters 68 | def load_params(path, params): 69 | pp = numpy.load(path) 70 | for kk, vv in params.iteritems(): 71 | if kk not in pp: 72 | warnings.warn('%s is not in the archive' % kk) 73 | continue 74 | params[kk] = pp[kk] 75 | 76 | return params 77 | 78 | 79 | # layers: 'name': ('parameter initializer', 'feedforward') 80 | layers = {'ff': ('param_init_fflayer', 'fflayer'), 81 | 'gru': ('param_init_gru', 'gru_layer'), 82 | } 83 | 84 | 85 | def get_layer(name): 86 | fns = layers[name] 87 | return (eval(fns[0]), eval(fns[1])) 88 | 89 | 90 | # orthogonal initialization for weights 91 | # see Saxe et al. ICLR'14 92 | def ortho_weight(ndim): 93 | W = numpy.random.randn(ndim, ndim) 94 | u, s, v = numpy.linalg.svd(W) 95 | return u.astype('float32') 96 | 97 | 98 | # weight initializer, normal by default 99 | def norm_weight(nin, nout=None, scale=0.01, ortho=True): 100 | if nout is None: 101 | nout = nin 102 | if nout == nin and ortho: 103 | W = ortho_weight(nin) 104 | else: 105 | W = scale * numpy.random.randn(nin, nout) 106 | return W.astype('float32') 107 | 108 | 109 | def tanh(x): 110 | return tensor.tanh(x) 111 | 112 | 113 | def linear(x): 114 | return x 115 | 116 | 117 | def concatenate(tensor_list, axis=0): 118 | """ 119 | Alternative implementation of `theano.tensor.concatenate`. 120 | This function does exactly the same thing, but contrary to Theano's own 121 | implementation, the gradient is implemented on the GPU. 122 | Backpropagating through `theano.tensor.concatenate` yields slowdowns 123 | because the inverse operation (splitting) needs to be done on the CPU. 124 | This implementation does not have that problem. 125 | :usage: 126 | >>> x, y = theano.tensor.matrices('x', 'y') 127 | >>> c = concatenate([x, y], axis=1) 128 | :parameters: 129 | - tensor_list : list 130 | list of Theano tensor expressions that should be concatenated. 
131 |     - axis : int
132 |         the tensors will be joined along this axis.
133 |     :returns:
134 |     - out : tensor
135 |         the concatenated tensor expression.
136 |     """
137 |     concat_size = sum(tt.shape[axis] for tt in tensor_list)
138 | 
139 |     output_shape = ()
140 |     for k in range(axis):
141 |         output_shape += (tensor_list[0].shape[k],)
142 |     output_shape += (concat_size,)
143 |     for k in range(axis + 1, tensor_list[0].ndim):
144 |         output_shape += (tensor_list[0].shape[k],)
145 | 
146 |     out = tensor.zeros(output_shape)
147 |     offset = 0
148 |     for tt in tensor_list:
149 |         indices = ()
150 |         for k in range(axis):
151 |             indices += (slice(None),)
152 |         indices += (slice(offset, offset + tt.shape[axis]),)
153 |         for k in range(axis + 1, tensor_list[0].ndim):
154 |             indices += (slice(None),)
155 | 
156 |         out = tensor.set_subtensor(out[indices], tt)
157 |         offset += tt.shape[axis]
158 | 
159 |     return out
160 | 
161 | 
162 | # batch preparation, returns padded batch and mask
163 | def prepare_data(seqs_x, maxlen=None, n_words=30000):
164 |     # x: a list of sentences
165 |     lengths_x = [len(s) for s in seqs_x]
166 | 
167 |     # filter according to maxlen
168 |     if maxlen is not None:
169 |         new_seqs_x = []
170 |         new_lengths_x = []
171 |         for l_x, s_x in zip(lengths_x, seqs_x):
172 |             if l_x < maxlen:
173 |                 new_seqs_x.append(s_x)
174 |                 new_lengths_x.append(l_x)
175 |         lengths_x = new_lengths_x
176 |         seqs_x = new_seqs_x
177 | 
178 |         if len(lengths_x) < 1:
179 |             return None, None
180 | 
181 |     n_samples = len(seqs_x)
182 |     maxlen_x = numpy.max(lengths_x) + 1
183 | 
184 |     x = numpy.zeros((maxlen_x, n_samples)).astype('int64')
185 |     x_mask = numpy.zeros((maxlen_x, n_samples)).astype('float32')
186 |     for idx, s_x in enumerate(seqs_x):
187 |         x[:lengths_x[idx], idx] = s_x
188 |         x_mask[:lengths_x[idx]+1, idx] = 1.
189 | 190 | return x, x_mask 191 | 192 | 193 | # feedforward layer: affine transformation + point-wise nonlinearity 194 | def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None, 195 | ortho=True): 196 | if nin is None: 197 | nin = options['dim_proj'] 198 | if nout is None: 199 | nout = options['dim_proj'] 200 | params[_p(prefix, 'W')] = norm_weight(nin, nout, scale=0.01, ortho=ortho) 201 | params[_p(prefix, 'b')] = numpy.zeros((nout,)).astype('float32') 202 | 203 | return params 204 | 205 | 206 | def fflayer(tparams, state_below, options, prefix='rconv', 207 | activ='lambda x: tensor.tanh(x)', **kwargs): 208 | return eval(activ)( 209 | tensor.dot(state_below, tparams[_p(prefix, 'W')]) + 210 | tparams[_p(prefix, 'b')]) 211 | 212 | 213 | # GRU layer 214 | def param_init_gru(options, params, prefix='gru', nin=None, dim=None): 215 | 216 | if nin is None: 217 | nin = options['dim_proj'] 218 | if dim is None: 219 | dim = options['dim_proj'] 220 | 221 | # embedding to gates transformation weights, biases 222 | W = numpy.concatenate([norm_weight(nin, dim), 223 | norm_weight(nin, dim)], axis=1) 224 | params[_p(prefix, 'W')] = W 225 | params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32') 226 | 227 | # recurrent transformation weights for gates 228 | U = numpy.concatenate([ortho_weight(dim), 229 | ortho_weight(dim)], axis=1) 230 | params[_p(prefix, 'U')] = U 231 | 232 | # embedding to hidden state proposal weights, biases 233 | Wx = norm_weight(nin, dim) 234 | params[_p(prefix, 'Wx')] = Wx 235 | params[_p(prefix, 'bx')] = numpy.zeros((dim,)).astype('float32') 236 | 237 | # recurrent transformation weights for hidden state proposal 238 | Ux = ortho_weight(dim) 239 | params[_p(prefix, 'Ux')] = Ux 240 | 241 | return params 242 | 243 | 244 | def gru_layer(tparams, state_below, options, prefix='gru', 245 | mask=None, one_step=False, init_state=None, **kwargs): 246 | if one_step: 247 | assert init_state, 'previous state must be provided' 248 | 249 | nsteps = state_below.shape[0] 250 | 251 | if state_below.ndim == 3: 252 | n_samples = state_below.shape[1] 253 | else: 254 | n_samples = state_below.shape[0] 255 | 256 | dim = tparams[_p(prefix, 'Ux')].shape[1] 257 | 258 | if mask is None: 259 | mask = tensor.alloc(1., state_below.shape[0], 1) 260 | 261 | # utility function to slice a tensor 262 | def _slice(_x, n, dim): 263 | if _x.ndim == 3: 264 | return _x[:, :, n*dim:(n+1)*dim] 265 | return _x[:, n*dim:(n+1)*dim] 266 | 267 | # state_below is the input word embeddings 268 | # input to the gates, concatenated 269 | state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \ 270 | tparams[_p(prefix, 'b')] 271 | # input to compute the hidden state proposal 272 | state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + \ 273 | tparams[_p(prefix, 'bx')] 274 | 275 | # step function to be used by scan 276 | # arguments | sequences |outputs-info| non-seqs 277 | def _step_slice(m_, x_, xx_, h_, U, Ux): 278 | preact = tensor.dot(h_, U) 279 | preact += x_ 280 | 281 | # reset and update gates 282 | r = tensor.nnet.sigmoid(_slice(preact, 0, dim)) 283 | u = tensor.nnet.sigmoid(_slice(preact, 1, dim)) 284 | 285 | # compute the hidden state proposal 286 | preactx = tensor.dot(h_, Ux) 287 | preactx = preactx * r 288 | preactx = preactx + xx_ 289 | 290 | # hidden state proposal 291 | h = tensor.tanh(preactx) 292 | 293 | # leaky integrate and obtain next hidden state 294 | h = u * h_ + (1. - u) * h 295 | h = m_[:, None] * h + (1. 
- m_)[:, None] * h_ 296 | 297 | return h 298 | 299 | # prepare scan arguments 300 | seqs = [mask, state_below_, state_belowx] 301 | _step = _step_slice 302 | shared_vars = [tparams[_p(prefix, 'U')], 303 | tparams[_p(prefix, 'Ux')]] 304 | 305 | # set initial state to all zeros 306 | if init_state is None: 307 | init_state = tensor.unbroadcast(tensor.alloc(0., n_samples, dim), 0) 308 | 309 | if one_step: # sampling 310 | rval = _step(*(seqs+[init_state]+shared_vars)) 311 | else: # training 312 | rval, updates = theano.scan(_step, 313 | sequences=seqs, 314 | outputs_info=[init_state], 315 | non_sequences=shared_vars, 316 | name=_p(prefix, '_layers'), 317 | n_steps=nsteps, 318 | profile=profile, 319 | strict=True) 320 | rval = [rval] 321 | return rval 322 | 323 | 324 | # initialize all parameters 325 | def init_params(options): 326 | params = OrderedDict() 327 | # embedding 328 | params['Wemb'] = norm_weight(options['n_words'], options['dim_word']) 329 | params = get_layer(options['encoder'])[0](options, params, 330 | prefix='encoder', 331 | nin=options['dim_word'], 332 | dim=options['dim']) 333 | # readout 334 | params = get_layer('ff')[0](options, params, prefix='ff_logit_lstm', 335 | nin=options['dim'], nout=options['dim_word'], 336 | ortho=False) 337 | params = get_layer('ff')[0](options, params, prefix='ff_logit_prev', 338 | nin=options['dim_word'], 339 | nout=options['dim_word'], ortho=False) 340 | params = get_layer('ff')[0](options, params, prefix='ff_logit', 341 | nin=options['dim_word'], 342 | nout=options['n_words']) 343 | 344 | return params 345 | 346 | 347 | # build a training model 348 | def build_model(tparams, options): 349 | opt_ret = dict() 350 | 351 | trng = RandomStreams(1234) 352 | use_noise = theano.shared(numpy.float32(0.)) 353 | 354 | # description string: #words x #samples 355 | x = tensor.matrix('x', dtype='int64') 356 | x_mask = tensor.matrix('x_mask', dtype='float32') 357 | 358 | n_timesteps = x.shape[0] 359 | n_samples = x.shape[1] 360 | 361 | # input 362 | emb = tparams['Wemb'][x.flatten()] 363 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word']]) 364 | emb_shifted = tensor.zeros_like(emb) 365 | emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1]) 366 | emb = emb_shifted 367 | opt_ret['emb'] = emb 368 | 369 | # pass through gru layer, recurrence here 370 | proj = get_layer(options['encoder'])[1](tparams, emb, options, 371 | prefix='encoder', 372 | mask=x_mask) 373 | proj_h = proj[0] 374 | opt_ret['proj_h'] = proj_h 375 | 376 | # compute word probabilities 377 | logit_lstm = get_layer('ff')[1](tparams, proj_h, options, 378 | prefix='ff_logit_lstm', activ='linear') 379 | logit_prev = get_layer('ff')[1](tparams, emb, options, 380 | prefix='ff_logit_prev', activ='linear') 381 | logit = tensor.tanh(logit_lstm+logit_prev) 382 | logit = get_layer('ff')[1](tparams, logit, options, prefix='ff_logit', 383 | activ='linear') 384 | logit_shp = logit.shape 385 | probs = tensor.nnet.softmax( 386 | logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]])) 387 | 388 | # cost 389 | x_flat = x.flatten() 390 | x_flat_idx = tensor.arange(x_flat.shape[0]) * options['n_words'] + x_flat 391 | cost = -tensor.log(probs.flatten()[x_flat_idx]) 392 | cost = cost.reshape([x.shape[0], x.shape[1]]) 393 | opt_ret['cost_per_sample'] = cost 394 | cost = (cost * x_mask).sum(0) 395 | 396 | return trng, use_noise, x, x_mask, opt_ret, cost 397 | 398 | 399 | # build a sampler 400 | def build_sampler(tparams, options, trng): 401 | # x: 1 x 1 402 | y = tensor.vector('y_sampler', 
dtype='int64')
403 |     init_state = tensor.matrix('init_state', dtype='float32')
404 | 
405 |     # if it's the first word, emb should be all zero
406 |     emb = tensor.switch(y[:, None] < 0,
407 |                         tensor.alloc(0., 1, tparams['Wemb'].shape[1]),
408 |                         tparams['Wemb'][y])
409 | 
410 |     # apply one step of gru layer
411 |     proj = get_layer(options['encoder'])[1](tparams, emb, options,
412 |                                             prefix='encoder',
413 |                                             mask=None,
414 |                                             one_step=True,
415 |                                             init_state=init_state)
416 |     next_state = proj[0]
417 | 
418 |     # compute the output probability dist and sample
419 |     logit_lstm = get_layer('ff')[1](tparams, next_state, options,
420 |                                     prefix='ff_logit_lstm', activ='linear')
421 |     logit_prev = get_layer('ff')[1](tparams, emb, options,
422 |                                     prefix='ff_logit_prev', activ='linear')
423 |     logit = tensor.tanh(logit_lstm+logit_prev)
424 |     logit = get_layer('ff')[1](tparams, logit, options,
425 |                                prefix='ff_logit', activ='linear')
426 |     next_probs = tensor.nnet.softmax(logit)
427 |     next_sample = trng.multinomial(pvals=next_probs).argmax(1)
428 | 
429 |     # next word probability
430 |     print 'Building f_next..',
431 |     inps = [y, init_state]
432 |     outs = [next_probs, next_sample, next_state]
433 |     f_next = theano.function(inps, outs, name='f_next', profile=profile)
434 |     print 'Done'
435 | 
436 |     return f_next
437 | 
438 | 
439 | # generate sample
440 | def gen_sample(tparams, f_next, options, trng=None, maxlen=30, argmax=False):
441 | 
442 |     sample = []
443 |     sample_score = 0
444 | 
445 |     # initial token is indicated by a -1 and initial state is zero
446 |     next_w = -1 * numpy.ones((1,)).astype('int64')
447 |     next_state = numpy.zeros((1, options['dim'])).astype('float32')
448 | 
449 |     for ii in xrange(maxlen):
450 |         inps = [next_w, next_state]
451 |         ret = f_next(*inps)
452 |         next_p, next_w, next_state = ret[0], ret[1], ret[2]
453 | 
454 |         if argmax:
455 |             nw = next_p[0].argmax()
456 |         else:
457 |             nw = next_w[0]
458 |         sample.append(nw)
459 |         sample_score += next_p[0, nw]
460 |         if nw == 0:
461 |             break
462 | 
463 |     return sample, sample_score
464 | 
465 | 
466 | # calculate the log probabilities on a given corpus using the language model
467 | def pred_probs(f_log_probs, prepare_data, options, iterator, verbose=True):
468 |     probs = []
469 | 
470 |     n_done = 0
471 | 
472 |     for x in iterator:
473 |         n_done += len(x)
474 | 
475 |         x, x_mask = prepare_data(x, n_words=options['n_words'])
476 | 
477 |         pprobs = f_log_probs(x, x_mask)
478 |         for pp in pprobs:
479 |             probs.append(pp)
480 | 
481 |         if numpy.isnan(numpy.mean(probs)):
482 |             ipdb.set_trace()
483 | 
484 |         if verbose:
485 |             print >>sys.stderr, '%d samples computed' % (n_done)
486 | 
487 |     return numpy.array(probs)
488 | 
489 | 
490 | # optimizers
491 | # name(hyperp, tparams, grads, inputs (list), cost) = f_grad_shared, f_update
492 | def adam(lr, tparams, grads, inp, cost, beta1=0.9, beta2=0.999, e=1e-8):
493 | 
494 |     gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
495 |                for k, p in tparams.iteritems()]
496 |     gsup = [(gs, g) for gs, g in zip(gshared, grads)]
497 | 
498 |     f_grad_shared = theano.function(inp, cost, updates=gsup, profile=profile)
499 | 
500 |     updates = []
501 | 
502 |     t_prev = theano.shared(numpy.float32(0.))
503 |     t = t_prev + 1.
504 |     lr_t = lr * tensor.sqrt(1. - beta2**t) / (1. - beta1**t)
505 | 
506 |     for p, g in zip(tparams.values(), gshared):
507 |         m = theano.shared(p.get_value() * 0., p.name + '_mean')
508 |         v = theano.shared(p.get_value() * 0., p.name + '_variance')
509 |         m_t = beta1 * m + (1. - beta1) * g
510 |         v_t = beta2 * v + (1.
- beta2) * g**2
511 |         step = lr_t * m_t / (tensor.sqrt(v_t) + e)
512 |         p_t = p - step
513 |         updates.append((m, m_t))
514 |         updates.append((v, v_t))
515 |         updates.append((p, p_t))
516 |     updates.append((t_prev, t))  # single time-step update, outside the loop
517 | 
518 |     f_update = theano.function([lr], [], updates=updates,
519 |                                on_unused_input='ignore', profile=profile)
520 | 
521 |     return f_grad_shared, f_update
522 | 
523 | 
524 | def adadelta(lr, tparams, grads, inp, cost):
525 |     zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
526 |                                   name='%s_grad' % k)
527 |                     for k, p in tparams.iteritems()]
528 |     running_up2 = [theano.shared(p.get_value() * numpy.float32(0.),
529 |                                  name='%s_rup2' % k)
530 |                    for k, p in tparams.iteritems()]
531 |     running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
532 |                                     name='%s_rgrad2' % k)
533 |                       for k, p in tparams.iteritems()]
534 | 
535 |     zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
536 |     rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
537 |              for rg2, g in zip(running_grads2, grads)]
538 | 
539 |     f_grad_shared = theano.function(inp, cost, updates=zgup+rg2up,
540 |                                     profile=profile)
541 | 
542 |     updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
543 |              for zg, ru2, rg2 in zip(zipped_grads,
544 |                                      running_up2,
545 |                                      running_grads2)]
546 |     ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
547 |              for ru2, ud in zip(running_up2, updir)]
548 |     param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)]
549 | 
550 |     f_update = theano.function([lr], [], updates=ru2up+param_up,
551 |                                on_unused_input='ignore', profile=profile)
552 | 
553 |     return f_grad_shared, f_update
554 | 
555 | 
556 | def rmsprop(lr, tparams, grads, inp, cost):
557 |     zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
558 |                                   name='%s_grad' % k)
559 |                     for k, p in tparams.iteritems()]
560 |     running_grads = [theano.shared(p.get_value() * numpy.float32(0.),
561 |                                    name='%s_rgrad' % k)
562 |                      for k, p in tparams.iteritems()]
563 |     running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
564 |                                     name='%s_rgrad2' % k)
565 |                       for k, p in tparams.iteritems()]
566 | 
567 |     zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
568 |     rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
569 |     rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
570 |              for rg2, g in zip(running_grads2, grads)]
571 | 
572 |     f_grad_shared = theano.function(inp, cost, updates=zgup+rgup+rg2up,
573 |                                     profile=profile)
574 | 
575 |     updir = [theano.shared(p.get_value() * numpy.float32(0.),
576 |                            name='%s_updir' % k)
577 |              for k, p in tparams.iteritems()]
578 |     updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
579 |                  for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
580 |                                             running_grads2)]
581 |     param_up = [(p, p + udn[1])
582 |                 for p, udn in zip(itemlist(tparams), updir_new)]
583 |     f_update = theano.function([lr], [], updates=updir_new+param_up,
584 |                                on_unused_input='ignore', profile=profile)
585 | 
586 |     return f_grad_shared, f_update
587 | 
588 | 
589 | def sgd(lr, tparams, grads, inp, cost):
590 | 
591 |     # allocate gradients and set them all to zero
592 |     gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
593 |                for k, p in tparams.iteritems()]
594 | 
595 |     # create gradient copying list,
596 |     # from grads (tensor variable) to gshared (shared variable)
597 |     gsup = [(gs, g) for gs, g in zip(gshared, grads)]
598 | 
599 |     # compile theano function to compute cost and copy gradients
600 |     f_grad_shared = theano.function(inp, cost, updates=gsup,
601 |                                     profile=profile)
602 | 
603 |     # define the update step rule
604 |     pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)]
605 | 
606 |     # compile a function for update
607 |     f_update = theano.function([lr], [], updates=pup, profile=profile)
608 | 
609 |     return f_grad_shared, f_update
610 | 
611 | 
612 | def train(dim_word=100,  # word vector dimensionality
613 |           dim=1000,  # the number of GRU units
614 |           encoder='gru',
615 |           patience=10,  # early stopping patience
616 |           max_epochs=5000,
617 |           finish_after=10000000,  # finish after this many updates
618 |           dispFreq=100,
619 |           decay_c=0.,  # L2 weight decay penalty
620 |           lrate=0.01,
621 |           n_words=100000,  # vocabulary size
622 |           maxlen=100,  # maximum length of a sequence
623 |           optimizer='rmsprop',
624 |           batch_size=16,
625 |           valid_batch_size=16,
626 |           saveto='model.npz',
627 |           validFreq=1000,
628 |           saveFreq=1000,  # save the parameters after every saveFreq updates
629 |           sampleFreq=100,  # generate some samples after every sampleFreq
630 |           dataset='/data/lisatmp3/chokyun/wikipedia/extracted/wiki.tok.txt.gz',
631 |           valid_dataset='../data/dev/newstest2011.en.tok',
632 |           dictionary='/data/lisatmp3/chokyun/wikipedia/extracted/'
633 |                      'wiki.tok.txt.gz.pkl',
634 |           use_dropout=False,
635 |           reload_=False):
636 | 
637 |     # Model options
638 |     model_options = locals().copy()
639 | 
640 |     # load dictionary
641 |     with open(dictionary, 'rb') as f:
642 |         worddicts = pkl.load(f)
643 | 
644 |     # invert dictionary
645 |     worddicts_r = dict()
646 |     for kk, vv in worddicts.iteritems():
647 |         worddicts_r[vv] = kk
648 | 
649 |     # reload options
650 |     if reload_ and os.path.exists(saveto):
651 |         with open('%s.pkl' % saveto, 'rb') as f:
652 |             model_options = pkl.load(f)
653 | 
654 |     print 'Loading data'
655 |     train = TextIterator(dataset,
656 |                          dictionary,
657 |                          n_words_source=n_words,
658 |                          batch_size=batch_size,
659 |                          maxlen=maxlen)
660 |     valid = TextIterator(valid_dataset,
661 |                          dictionary,
662 |                          n_words_source=n_words,
663 |                          batch_size=valid_batch_size,
664 |                          maxlen=maxlen)
665 | 
666 |     print 'Building model'
667 |     params = init_params(model_options)
668 | 
669 |     # reload parameters
670 |     if reload_ and os.path.exists(saveto):
671 |         params = load_params(saveto, params)
672 | 
673 |     # create shared variables for parameters
674 |     tparams = init_tparams(params)
675 | 
676 |     # build the symbolic computational graph
677 |     trng, use_noise, \
678 |         x, x_mask, \
679 |         opt_ret, \
680 |         cost = \
681 |         build_model(tparams, model_options)
682 |     inps = [x, x_mask]
683 | 
684 |     print 'Building sampler'
685 |     f_next = build_sampler(tparams, model_options, trng)
686 | 
687 |     # before any regularizer
688 |     print 'Building f_log_probs...',
689 |     f_log_probs = theano.function(inps, cost, profile=profile)
690 |     print 'Done'
691 | 
692 |     cost = cost.mean()
693 | 
694 |     # apply L2 regularization on weights
695 |     if decay_c > 0.:
696 |         decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
697 |         weight_decay = 0.
698 | for kk, vv in tparams.iteritems(): 699 | weight_decay += (vv ** 2).sum() 700 | weight_decay *= decay_c 701 | cost += weight_decay 702 | 703 | # after any regularizer - compile the computational graph for cost 704 | print 'Building f_cost...', 705 | f_cost = theano.function(inps, cost, profile=profile) 706 | print 'Done' 707 | 708 | print 'Computing gradient...', 709 | grads = tensor.grad(cost, wrt=itemlist(tparams)) 710 | print 'Done' 711 | 712 | # compile the optimizer, the actual computational graph is compiled here 713 | lr = tensor.scalar(name='lr') 714 | print 'Building optimizers...', 715 | f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) 716 | print 'Done' 717 | 718 | print 'Optimization' 719 | 720 | history_errs = [] 721 | # reload history 722 | if reload_ and os.path.exists(saveto): 723 | history_errs = list(numpy.load(saveto)['history_errs']) 724 | best_p = None 725 | bad_count = 0 726 | 727 | if validFreq == -1: 728 | validFreq = len(train[0])/batch_size 729 | if saveFreq == -1: 730 | saveFreq = len(train[0])/batch_size 731 | if sampleFreq == -1: 732 | sampleFreq = len(train[0])/batch_size 733 | 734 | # Training loop 735 | uidx = 0 736 | estop = False 737 | bad_counter = 0 738 | for eidx in xrange(max_epochs): 739 | n_samples = 0 740 | 741 | for x in train: 742 | n_samples += len(x) 743 | uidx += 1 744 | use_noise.set_value(1.) 745 | 746 | # pad batch and create mask 747 | x, x_mask = prepare_data(x, maxlen=maxlen, n_words=n_words) 748 | 749 | if x is None: 750 | print 'Minibatch with zero sample under length ', maxlen 751 | uidx -= 1 752 | continue 753 | 754 | ud_start = time.time() 755 | 756 | # compute cost, grads and copy grads to shared variables 757 | cost = f_grad_shared(x, x_mask) 758 | 759 | # do the update on parameters 760 | f_update(lrate) 761 | 762 | ud = time.time() - ud_start 763 | 764 | # check for bad numbers 765 | if numpy.isnan(cost) or numpy.isinf(cost): 766 | print 'NaN detected' 767 | return 1. 768 | 769 | # verbose 770 | if numpy.mod(uidx, dispFreq) == 0: 771 | print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud 772 | 773 | # save the best model so far 774 | if numpy.mod(uidx, saveFreq) == 0: 775 | print 'Saving...', 776 | 777 | if best_p is not None: 778 | params = best_p 779 | else: 780 | params = unzip(tparams) 781 | numpy.savez(saveto, history_errs=history_errs, **params) 782 | pkl.dump(model_options, open('%s.pkl' % saveto, 'wb')) 783 | print 'Done' 784 | 785 | # generate some samples with the model and display them 786 | if numpy.mod(uidx, sampleFreq) == 0: 787 | # FIXME: random selection? 788 | for jj in xrange(5): 789 | sample, score = gen_sample(tparams, f_next, 790 | model_options, trng=trng, 791 | maxlen=30, argmax=False) 792 | print 'Sample ', jj, ': ', 793 | ss = sample 794 | for vv in ss: 795 | if vv == 0: 796 | break 797 | if vv in worddicts_r: 798 | print worddicts_r[vv], 799 | else: 800 | print 'UNK', 801 | print 802 | 803 | # validate model on validation set and early stop if necessary 804 | if numpy.mod(uidx, validFreq) == 0: 805 | use_noise.set_value(0.) 
806 | valid_errs = pred_probs(f_log_probs, prepare_data, 807 | model_options, valid) 808 | valid_err = valid_errs.mean() 809 | history_errs.append(valid_err) 810 | 811 | if uidx == 0 or valid_err <= numpy.array(history_errs).min(): 812 | best_p = unzip(tparams) 813 | bad_counter = 0 814 | if len(history_errs) > patience and valid_err >= \ 815 | numpy.array(history_errs)[:-patience].min(): 816 | bad_counter += 1 817 | if bad_counter > patience: 818 | print 'Early Stop!' 819 | estop = True 820 | break 821 | 822 | if numpy.isnan(valid_err): 823 | ipdb.set_trace() 824 | 825 | print 'Valid ', valid_err 826 | 827 | # finish after this many updates 828 | if uidx >= finish_after: 829 | print 'Finishing after %d iterations!' % uidx 830 | estop = True 831 | break 832 | 833 | print 'Seen %d samples' % n_samples 834 | 835 | if estop: 836 | break 837 | 838 | if best_p is not None: 839 | zipp(best_p, tparams) 840 | 841 | use_noise.set_value(0.) 842 | valid_err = pred_probs(f_log_probs, prepare_data, 843 | model_options, valid).mean() 844 | 845 | print 'Valid ', valid_err 846 | 847 | params = copy.copy(best_p) 848 | numpy.savez(saveto, zipped_params=best_p, 849 | history_errs=history_errs, 850 | **params) 851 | 852 | return valid_err 853 | 854 | 855 | if __name__ == '__main__': 856 | pass 857 | -------------------------------------------------------------------------------- /session0/train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=20 3 | #PBS -l walltime=48:00:00 4 | #PBS -N session1_default 5 | #PBS -A course 6 | #PBS -q GpuQ 7 | 8 | export THEANO_FLAGS=device=gpu,floatX=float32 9 | 10 | cd $PBS_O_WORKDIR 11 | python ./train_lm.py 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /session0/train_lm.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from lm import train 4 | 5 | 6 | def main(job_id, params): 7 | print params 8 | validerr = train( 9 | saveto=params['model'][0], 10 | reload_=params['reload'][0], 11 | dim_word=params['dim_word'][0], 12 | dim=params['dim'][0], 13 | n_words=params['n-words'][0], 14 | decay_c=params['decay-c'][0], 15 | lrate=params['learning-rate'][0], 16 | optimizer=params['optimizer'][0], 17 | maxlen=30, 18 | batch_size=32, 19 | valid_batch_size=16, 20 | validFreq=5000, 21 | dispFreq=10, 22 | saveFreq=1000, 23 | sampleFreq=1000, 24 | dataset='/ichec/work/dl4mt_data/nec_files/wiki.tok.txt.gz', 25 | valid_dataset='/ichec/work/dl4mt_data/nec_files/newstest2011.en.tok', 26 | dictionary='/ichec/work/dl4mt_data/nec_files/wiki.tok.txt.gz.pkl', 27 | use_dropout=params['use-dropout'][0]) 28 | return validerr 29 | 30 | if __name__ == '__main__': 31 | main(0, { 32 | 'model': ['/ichec/home/users/%s/models/model_session0.npz' % 33 | os.environ['USER']], 34 | 'dim_word': [512], 35 | 'dim': [1024], 36 | 'n-words': [30000], 37 | 'optimizer': ['adadelta'], 38 | 'decay-c': [0.], 39 | 'use-dropout': [False], 40 | 'learning-rate': [0.0001], 41 | 'reload': [False]}) 42 | -------------------------------------------------------------------------------- /session1/README.md: -------------------------------------------------------------------------------- 1 | Simple encoder-decoder model for machine translation 2 | 3 | -------------------------------------------------------------------------------- /session1/data_iterator.py: -------------------------------------------------------------------------------- 1 | 
import numpy 2 | 3 | import cPickle as pkl 4 | import gzip 5 | 6 | 7 | def fopen(filename, mode='r'): 8 | if filename.endswith('.gz'): 9 | return gzip.open(filename, mode) 10 | return open(filename, mode) 11 | 12 | 13 | class TextIterator: 14 | """Simple Bitext iterator.""" 15 | def __init__(self, source, target, 16 | source_dict, target_dict, 17 | batch_size=128, 18 | maxlen=100, 19 | n_words_source=-1, 20 | n_words_target=-1): 21 | self.source = fopen(source, 'r') 22 | self.target = fopen(target, 'r') 23 | with open(source_dict, 'rb') as f: 24 | self.source_dict = pkl.load(f) 25 | with open(target_dict, 'rb') as f: 26 | self.target_dict = pkl.load(f) 27 | 28 | self.batch_size = batch_size 29 | self.maxlen = maxlen 30 | 31 | self.n_words_source = n_words_source 32 | self.n_words_target = n_words_target 33 | 34 | self.source_buffer = [] 35 | self.target_buffer = [] 36 | self.k = batch_size * 20 37 | 38 | self.end_of_data = False 39 | 40 | def __iter__(self): 41 | return self 42 | 43 | def reset(self): 44 | self.source.seek(0) 45 | self.target.seek(0) 46 | 47 | def next(self): 48 | if self.end_of_data: 49 | self.end_of_data = False 50 | self.reset() 51 | raise StopIteration 52 | 53 | source = [] 54 | target = [] 55 | 56 | # fill buffer, if it's empty 57 | assert len(self.source_buffer) == len(self.target_buffer), 'Buffer size mismatch!' 58 | 59 | if len(self.source_buffer) == 0: 60 | for k_ in xrange(self.k): 61 | ss = self.source.readline() 62 | if ss == "": 63 | break 64 | tt = self.target.readline() 65 | if tt == "": 66 | break 67 | 68 | self.source_buffer.append(ss.strip().split()) 69 | self.target_buffer.append(tt.strip().split()) 70 | 71 | # sort by target buffer 72 | tlen = numpy.array([len(t) for t in self.target_buffer]) 73 | tidx = tlen.argsort() 74 | 75 | _sbuf = [self.source_buffer[i] for i in tidx] 76 | _tbuf = [self.target_buffer[i] for i in tidx] 77 | 78 | self.source_buffer = _sbuf 79 | self.target_buffer = _tbuf 80 | 81 | if len(self.source_buffer) == 0 or len(self.target_buffer) == 0: 82 | self.end_of_data = False 83 | self.reset() 84 | raise StopIteration 85 | 86 | try: 87 | 88 | # actual work here 89 | while True: 90 | 91 | # read from source file and map to word index 92 | try: 93 | ss = self.source_buffer.pop() 94 | except IndexError: 95 | break 96 | ss = [self.source_dict[w] if w in self.source_dict else 1 97 | for w in ss] 98 | if self.n_words_source > 0: 99 | ss = [w if w < self.n_words_source else 1 for w in ss] 100 | 101 | # read from source file and map to word index 102 | tt = self.target_buffer.pop() 103 | tt = [self.target_dict[w] if w in self.target_dict else 1 104 | for w in tt] 105 | if self.n_words_target > 0: 106 | tt = [w if w < self.n_words_target else 1 for w in tt] 107 | 108 | if len(ss) > self.maxlen and len(tt) > self.maxlen: 109 | continue 110 | 111 | source.append(ss) 112 | target.append(tt) 113 | 114 | if len(source) >= self.batch_size or \ 115 | len(target) >= self.batch_size: 116 | break 117 | except IOError: 118 | self.end_of_data = True 119 | 120 | if len(source) <= 0 or len(target) <= 0: 121 | self.end_of_data = False 122 | self.reset() 123 | raise StopIteration 124 | 125 | return source, target 126 | -------------------------------------------------------------------------------- /session1/encode.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Encode a source file using the encoder of a trained translation model. 
3 | '''
4 | import argparse
5 | 
6 | import numpy
7 | import cPickle as pkl
8 | 
9 | from nmt import (build_sampler, gen_sample, load_params,
10 |                  init_params, init_tparams)
11 | 
12 | from multiprocessing import Process, Queue
13 | 
14 | 
15 | def encode_model(queue, rqueue, pid, model, options):
16 | 
17 |     from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
18 |     trng = RandomStreams(1234)
19 | 
20 |     # allocate model parameters
21 |     params = init_params(options)
22 | 
23 |     # load model parameters and set theano shared variables
24 |     params = load_params(model, params)
25 |     tparams = init_tparams(params)
26 | 
27 |     # word index
28 |     f_init, f_next = build_sampler(tparams, options, trng)
29 | 
30 |     def _encode(seq):
31 |         # encode the source sentence
32 |         code = f_init(numpy.array(seq).reshape([len(seq), 1]))[1]
33 |         return code
34 | 
35 |     while True:
36 |         req = queue.get()
37 |         if req is None:
38 |             break
39 | 
40 |         idx, x = req[0], req[1]
41 |         print pid, '-', idx
42 |         cod = _encode(x)
43 | 
44 |         rqueue.put((idx, cod))
45 | 
46 |     return
47 | 
48 | 
49 | def main(model, dictionary, source_file, saveto,
50 |          n_process=5, chr_level=False):
51 | 
52 |     # load model options
53 |     with open('%s.pkl' % model, 'rb') as f:
54 |         options = pkl.load(f)
55 | 
56 |     # load source dictionary and invert
57 |     with open(dictionary, 'rb') as f:
58 |         word_dict = pkl.load(f)
59 |     word_idict = dict()
60 |     for kk, vv in word_dict.iteritems():
61 |         word_idict[vv] = kk
62 |     word_idict[0] = '<eos>'
63 |     word_idict[1] = 'UNK'
64 | 
65 |     # create input and output queues for processes
66 |     queue = Queue()
67 |     rqueue = Queue()
68 |     processes = [None] * n_process
69 |     for midx in xrange(n_process):
70 |         processes[midx] = Process(
71 |             target=encode_model,
72 |             args=(queue, rqueue, midx, model, options,))
73 |         processes[midx].start()
74 | 
75 |     def _send_jobs(fname):
76 |         with open(fname, 'r') as f:
77 |             for idx, line in enumerate(f):
78 |                 if chr_level:
79 |                     words = list(line.decode('utf-8').strip())
80 |                 else:
81 |                     words = line.strip().split()
82 |                 x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
83 |                 x = map(lambda ii: ii if ii < options['n_words'] else 1, x)
84 |                 x += [0]
85 |                 queue.put((idx, x))
86 |         return idx+1
87 | 
88 |     def _finish_processes():
89 |         for midx in xrange(n_process):
90 |             queue.put(None)
91 | 
92 |     def _retrieve_jobs(n_samples):
93 |         codes = [None] * n_samples
94 |         for idx in xrange(n_samples):
95 |             resp = rqueue.get()
96 |             codes[resp[0]] = resp[1]
97 |             if numpy.mod(idx, 10) == 0:
98 |                 print 'Sample ', (idx+1), '/', n_samples, ' Done'
99 |         return codes
100 | 
101 |     print 'Encoding ', source_file, '...'
102 | n_samples = _send_jobs(source_file) 103 | codes = numpy.array(_retrieve_jobs(n_samples)) 104 | _finish_processes() 105 | if not saveto.endswith('npy'): 106 | saveto = saveto + '.npy' 107 | numpy.save(saveto, codes) 108 | print 'Done' 109 | 110 | 111 | if __name__ == "__main__": 112 | parser = argparse.ArgumentParser() 113 | parser.add_argument('-c', action="store_true", default=False) 114 | parser.add_argument('-p', type=int, default=4) 115 | parser.add_argument('model', type=str) 116 | parser.add_argument('dictionary', type=str) 117 | parser.add_argument('source', type=str) 118 | parser.add_argument('saveto', type=str) 119 | 120 | args = parser.parse_args() 121 | 122 | main(args.model, args.dictionary, args.source, 123 | args.saveto, n_process=args.p, chr_level=args.c) 124 | -------------------------------------------------------------------------------- /session1/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=24 3 | #PBS -l walltime=4:00:00 4 | #PBS -N session1_default 5 | #PBS -A course 6 | #PBS -q ShortQ 7 | 8 | export THEANO_FLAGS=device=cpu,floatX=float32 9 | 10 | cd $PBS_O_WORKDIR 11 | python ./translate.py -n -p 20 \ 12 | $HOME/models/model_session1.npz \ 13 | $HOME/data/europarl-v7.fr-en.en.tok.pkl \ 14 | $HOME/data/europarl-v7.fr-en.fr.tok.pkl \ 15 | $HOME/data/newstest2011.en.tok \ 16 | ./newstest2011.trans.fr.tok 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /session1/train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=20 3 | #PBS -l walltime=168:00:00 4 | #PBS -N session1_default 5 | #PBS -A course 6 | #PBS -q GpuQ 7 | 8 | export THEANO_FLAGS=device=gpu,floatX=float32 9 | 10 | cd $PBS_O_WORKDIR 11 | python ./train_nmt.py 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /session1/train_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=20 3 | #PBS -l walltime=168:00:00 4 | #PBS -N session1_default 5 | #PBS -A course 6 | #PBS -q GpuQ 7 | 8 | export THEANO_FLAGS=device=gpu,floatX=float32 9 | 10 | cd $PBS_O_WORKDIR 11 | python ./train_nmt_all.py 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /session1/train_nmt.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import os 3 | 4 | from nmt import train 5 | 6 | def main(job_id, params): 7 | print params 8 | validerr = train(saveto=params['model'][0], 9 | reload_=params['reload'][0], 10 | dim_word=params['dim_word'][0], 11 | dim=params['dim'][0], 12 | n_words=params['n-words'][0], 13 | n_words_src=params['n-words'][0], 14 | decay_c=params['decay-c'][0], 15 | lrate=params['learning-rate'][0], 16 | optimizer=params['optimizer'][0], 17 | maxlen=50, 18 | batch_size=32, 19 | valid_batch_size=32, 20 | datasets=['/ichec/home/users/%s/data/europarl-v7.fr-en.en.tok'%os.environ['USER'], 21 | '/ichec/home/users/%s/data/europarl-v7.fr-en.fr.tok'%os.environ['USER']], 22 | valid_datasets=['/ichec/home/users/%s/data/newstest2011.en.tok'%os.environ['USER'], 23 | '/ichec/home/users/%s/data/newstest2011.fr.tok'%os.environ['USER']], 24 | dictionaries=['/ichec/home/users/%s/data/europarl-v7.fr-en.en.tok.pkl'%os.environ['USER'], 25 | 
'/ichec/home/users/%s/data/europarl-v7.fr-en.fr.tok.pkl'%os.environ['USER']], 26 | validFreq=5000, 27 | dispFreq=10, 28 | saveFreq=5000, 29 | sampleFreq=1000, 30 | use_dropout=params['use-dropout'][0], 31 | overwrite=False) 32 | return validerr 33 | 34 | if __name__ == '__main__': 35 | main(0, { 36 | 'model': ['/ichec/home/users/%s/models/model_session1.npz'%os.environ['USER']], 37 | 'dim_word': [500], 38 | 'dim': [1024], 39 | 'n-words': [30000], 40 | 'optimizer': ['adadelta'], 41 | 'decay-c': [0.], 42 | 'use-dropout': [False], 43 | 'learning-rate': [0.0001], 44 | 'reload': [False]}) 45 | 46 | 47 | -------------------------------------------------------------------------------- /session1/train_nmt_all.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from nmt import train 4 | 5 | 6 | def main(job_id, params): 7 | print params 8 | username = os.environ['USER'] 9 | validerr = train( 10 | saveto=params['model'][0], 11 | reload_=params['reload'][0], 12 | dim_word=params['dim_word'][0], 13 | dim=params['dim'][0], 14 | n_words=params['n-words'][0], 15 | n_words_src=params['n-words'][0], 16 | decay_c=params['decay-c'][0], 17 | lrate=params['learning-rate'][0], 18 | optimizer=params['optimizer'][0], 19 | maxlen=50, 20 | batch_size=32, 21 | valid_batch_size=32, 22 | datasets=[ 23 | '/ichec/home/users/%s/data/all.en.concat.shuf.gz' % username, 24 | '/ichec/home/users/%s/data/all.fr.concat.shuf.gz' % username], 25 | valid_datasets=[ 26 | '/ichec/home/users/%s/data/newstest2011.en.tok' % username, 27 | '/ichec/home/users/%s/data/newstest2011.fr.tok' % username], 28 | dictionaries=[ 29 | '/ichec/home/users/%s/data/all.en.concat.gz.pkl' % username, 30 | '/ichec/home/users/%s/data/all.fr.concat.gz.pkl' % username], 31 | validFreq=5000, 32 | dispFreq=10, 33 | saveFreq=5000, 34 | sampleFreq=1000, 35 | use_dropout=params['use-dropout'][0], 36 | overwrite=False) 37 | return validerr 38 | 39 | if __name__ == '__main__': 40 | main(0, { 41 | 'model': [ 42 | '/ichec/home/users/%s/models/model_session1_all.npz' % 43 | os.environ['USER']], 44 | 'dim_word': [500], 45 | 'dim': [1024], 46 | 'n-words': [30000], 47 | 'optimizer': ['adadelta'], 48 | 'decay-c': [0.], 49 | 'use-dropout': [False], 50 | 'learning-rate': [0.0001], 51 | 'reload': [False]}) 52 | -------------------------------------------------------------------------------- /session1/translate.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Translates a source file using a translation model. 
3 | '''
4 | import argparse
5 | 
6 | import numpy
7 | import cPickle as pkl
8 | 
9 | from nmt import (build_sampler, gen_sample, load_params,
10 |                  init_params, init_tparams)
11 | 
12 | from multiprocessing import Process, Queue
13 | 
14 | 
15 | def translate_model(queue, rqueue, pid, model, options, k, normalize):
16 | 
17 |     from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
18 |     from theano import shared
19 |     trng = RandomStreams(1234)
20 |     use_noise = shared(numpy.float32(0.))
21 | 
22 |     # allocate model parameters
23 |     params = init_params(options)
24 | 
25 |     # load model parameters and set theano shared variables
26 |     params = load_params(model, params)
27 |     tparams = init_tparams(params)
28 | 
29 |     # word index
30 |     f_init, f_next = build_sampler(tparams, options, trng, use_noise)
31 | 
32 |     def _translate(seq):
33 |         # sample given an input sequence and obtain scores
34 |         sample, score = gen_sample(tparams, f_init, f_next,
35 |                                    numpy.array(seq).reshape([len(seq), 1]),
36 |                                    options, trng=trng, k=k, maxlen=200,
37 |                                    stochastic=False)
38 | 
39 |         # normalize scores according to sequence lengths
40 |         if normalize:
41 |             lengths = numpy.array([len(s) for s in sample])
42 |             score = score / lengths
43 |         sidx = numpy.argmin(score)
44 |         return sample[sidx]
45 | 
46 |     while True:
47 |         req = queue.get()
48 |         if req is None:
49 |             break
50 | 
51 |         idx, x = req[0], req[1]
52 |         print pid, '-', idx
53 |         seq = _translate(x)
54 | 
55 |         rqueue.put((idx, seq))
56 | 
57 |     return
58 | 
59 | 
60 | def main(model, dictionary, dictionary_target, source_file, saveto, k=5,
61 |          normalize=False, n_process=5, chr_level=False):
62 | 
63 |     # load model options
64 |     with open('%s.pkl' % model, 'rb') as f:
65 |         options = pkl.load(f)
66 | 
67 |     # load source dictionary and invert
68 |     with open(dictionary, 'rb') as f:
69 |         word_dict = pkl.load(f)
70 |     word_idict = dict()
71 |     for kk, vv in word_dict.iteritems():
72 |         word_idict[vv] = kk
73 |     word_idict[0] = '<eos>'
74 |     word_idict[1] = 'UNK'
75 | 
76 |     # load target dictionary and invert
77 |     with open(dictionary_target, 'rb') as f:
78 |         word_dict_trg = pkl.load(f)
79 |     word_idict_trg = dict()
80 |     for kk, vv in word_dict_trg.iteritems():
81 |         word_idict_trg[vv] = kk
82 |     word_idict_trg[0] = '<eos>'
83 |     word_idict_trg[1] = 'UNK'
84 | 
85 |     # create input and output queues for processes
86 |     queue = Queue()
87 |     rqueue = Queue()
88 |     processes = [None] * n_process
89 |     for midx in xrange(n_process):
90 |         processes[midx] = Process(
91 |             target=translate_model,
92 |             args=(queue, rqueue, midx, model, options, k, normalize,))
93 |         processes[midx].start()
94 | 
95 |     # utility function
96 |     def _seqs2words(caps):
97 |         capsw = []
98 |         for cc in caps:
99 |             ww = []
100 |             for w in cc:
101 |                 if w == 0:
102 |                     break
103 |                 ww.append(word_idict_trg[w])
104 |             capsw.append(' '.join(ww))
105 |         return capsw
106 | 
107 |     def _send_jobs(fname):
108 |         with open(fname, 'r') as f:
109 |             for idx, line in enumerate(f):
110 |                 if chr_level:
111 |                     words = list(line.decode('utf-8').strip())
112 |                 else:
113 |                     words = line.strip().split()
114 |                 x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
115 |                 x = map(lambda ii: ii if ii < options['n_words'] else 1, x)
116 |                 x += [0]
117 |                 queue.put((idx, x))
118 |         return idx+1
119 | 
120 |     def _finish_processes():
121 |         for midx in xrange(n_process):
122 |             queue.put(None)
123 | 
124 |     def _retrieve_jobs(n_samples):
125 |         trans = [None] * n_samples
126 |         for idx in xrange(n_samples):
127 |             resp = rqueue.get()
128 |             trans[resp[0]] = resp[1]
129 |             if
numpy.mod(idx, 10) == 0: 130 | print 'Sample ', (idx+1), '/', n_samples, ' Done' 131 | return trans 132 | 133 | print 'Translating ', source_file, '...' 134 | n_samples = _send_jobs(source_file) 135 | trans = _seqs2words(_retrieve_jobs(n_samples)) 136 | _finish_processes() 137 | with open(saveto, 'w') as f: 138 | print >>f, '\n'.join(trans) 139 | print 'Done' 140 | 141 | 142 | if __name__ == "__main__": 143 | parser = argparse.ArgumentParser() 144 | parser.add_argument('-k', type=int, default=5) 145 | parser.add_argument('-p', type=int, default=5) 146 | parser.add_argument('-n', action="store_true", default=False) 147 | parser.add_argument('-c', action="store_true", default=False) 148 | parser.add_argument('model', type=str) 149 | parser.add_argument('dictionary', type=str) 150 | parser.add_argument('dictionary_target', type=str) 151 | parser.add_argument('source', type=str) 152 | parser.add_argument('saveto', type=str) 153 | 154 | args = parser.parse_args() 155 | 156 | main(args.model, args.dictionary, args.dictionary_target, args.source, 157 | args.saveto, k=args.k, normalize=args.n, n_process=args.p, 158 | chr_level=args.c) 159 | -------------------------------------------------------------------------------- /session2/README.md: -------------------------------------------------------------------------------- 1 | Attention-based encoder-decoder model for machine translation 2 | 3 | ## Training 4 | Change the hard-coded paths to data in `nmt.py` then run 5 | ``` 6 | THEANO_FLAGS=device=gpu,floatX=float32 python train_nmt.py 7 | ``` 8 | -------------------------------------------------------------------------------- /session2/data_iterator.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | import cPickle as pkl 4 | import gzip 5 | 6 | 7 | def fopen(filename, mode='r'): 8 | if filename.endswith('.gz'): 9 | return gzip.open(filename, mode) 10 | return open(filename, mode) 11 | 12 | 13 | class TextIterator: 14 | """Simple Bitext iterator.""" 15 | def __init__(self, source, target, 16 | source_dict, target_dict, 17 | batch_size=128, 18 | maxlen=100, 19 | n_words_source=-1, 20 | n_words_target=-1): 21 | self.source = fopen(source, 'r') 22 | self.target = fopen(target, 'r') 23 | with open(source_dict, 'rb') as f: 24 | self.source_dict = pkl.load(f) 25 | with open(target_dict, 'rb') as f: 26 | self.target_dict = pkl.load(f) 27 | 28 | self.batch_size = batch_size 29 | self.maxlen = maxlen 30 | 31 | self.n_words_source = n_words_source 32 | self.n_words_target = n_words_target 33 | 34 | self.source_buffer = [] 35 | self.target_buffer = [] 36 | self.k = batch_size * 20 37 | 38 | self.end_of_data = False 39 | 40 | def __iter__(self): 41 | return self 42 | 43 | def reset(self): 44 | self.source.seek(0) 45 | self.target.seek(0) 46 | 47 | def next(self): 48 | if self.end_of_data: 49 | self.end_of_data = False 50 | self.reset() 51 | raise StopIteration 52 | 53 | source = [] 54 | target = [] 55 | 56 | # fill buffer, if it's empty 57 | assert len(self.source_buffer) == len(self.target_buffer), 'Buffer size mismatch!' 

        if len(self.source_buffer) == 0:
            for k_ in xrange(self.k):
                ss = self.source.readline()
                if ss == "":
                    break
                tt = self.target.readline()
                if tt == "":
                    break

                self.source_buffer.append(ss.strip().split())
                self.target_buffer.append(tt.strip().split())

            # sort by target buffer
            tlen = numpy.array([len(t) for t in self.target_buffer])
            tidx = tlen.argsort()

            _sbuf = [self.source_buffer[i] for i in tidx]
            _tbuf = [self.target_buffer[i] for i in tidx]

            self.source_buffer = _sbuf
            self.target_buffer = _tbuf

        if len(self.source_buffer) == 0 or len(self.target_buffer) == 0:
            self.end_of_data = False
            self.reset()
            raise StopIteration

        try:

            # actual work here
            while True:

                # read from source buffer and map to word index
                try:
                    ss = self.source_buffer.pop()
                except IndexError:
                    break
                ss = [self.source_dict[w] if w in self.source_dict else 1
                      for w in ss]
                if self.n_words_source > 0:
                    ss = [w if w < self.n_words_source else 1 for w in ss]

                # read from target buffer and map to word index
                tt = self.target_buffer.pop()
                tt = [self.target_dict[w] if w in self.target_dict else 1
                      for w in tt]
                if self.n_words_target > 0:
                    tt = [w if w < self.n_words_target else 1 for w in tt]

                # skip sentence pairs where either side exceeds maxlen
                if len(ss) > self.maxlen or len(tt) > self.maxlen:
                    continue

                source.append(ss)
                target.append(tt)

                if len(source) >= self.batch_size or \
                        len(target) >= self.batch_size:
                    break
        except IOError:
            self.end_of_data = True

        if len(source) <= 0 or len(target) <= 0:
            self.end_of_data = False
            self.reset()
            raise StopIteration

        return source, target
--------------------------------------------------------------------------------
/session2/test.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#PBS -l nodes=1:ppn=24
#PBS -l walltime=24:00:00
#PBS -N session2_default
#PBS -A course
#PBS -q ShortQ

export THEANO_FLAGS=device=cpu,floatX=float32

cd $PBS_O_WORKDIR
python ./translate.py -n -p 8 \
    $HOME/models/model_session2.npz \
    $HOME/data/europarl-v7.fr-en.en.tok.pkl \
    $HOME/data/europarl-v7.fr-en.fr.tok.pkl \
    $HOME/data/newstest2011.en.tok \
    ./newstest2011.trans.fr.tok
--------------------------------------------------------------------------------
/session2/train.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#PBS -l nodes=1:ppn=20
#PBS -l walltime=168:00:00
#PBS -N session2_default
#PBS -A course
#PBS -q GpuQ

export THEANO_FLAGS=device=gpu,floatX=float32

cd $PBS_O_WORKDIR
python ./train_nmt.py
--------------------------------------------------------------------------------
/session2/train_all.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#PBS -l nodes=1:ppn=20
#PBS -l walltime=168:00:00
#PBS -N session2_default
#PBS -A course
#PBS -q GpuQ

export THEANO_FLAGS=device=gpu,floatX=float32

cd $PBS_O_WORKDIR
python ./train_nmt_all.py
--------------------------------------------------------------------------------
/session2/train_nmt.py:
--------------------------------------------------------------------------------
import numpy
import os

from nmt import train

def main(job_id, params):
    print params
    validerr = train(saveto=params['model'][0],
                     reload_=params['reload'][0],
                     dim_word=params['dim_word'][0],
                     dim=params['dim'][0],
                     n_words=params['n-words'][0],
                     n_words_src=params['n-words'][0],
                     decay_c=params['decay-c'][0],
                     clip_c=params['clip-c'][0],
                     lrate=params['learning-rate'][0],
                     optimizer=params['optimizer'][0],
                     patience=1000,
                     maxlen=50,
                     batch_size=32,
                     valid_batch_size=32,
                     validFreq=100,
                     dispFreq=10,
                     saveFreq=100,
                     sampleFreq=100,
                     datasets=['../data/hal/train/tok/en',
                               '../data/hal/train/tok/fr'],
                     valid_datasets=['../data/hal/dev/tok/en',
                                     '../data/hal/dev/tok/fr'],
                     dictionaries=['../data/hal/train/tok/en.pkl',
                                   '../data/hal/train/tok/fr.pkl'],
                     use_dropout=params['use-dropout'][0],
                     overwrite=False)
    return validerr

if __name__ == '__main__':
    main(0, {
        'model': ['model_hal.npz'],
        'dim_word': [512],
        'dim': [1024],
        'n-words': [30000],
        'optimizer': ['adadelta'],
        'decay-c': [0.],
        'clip-c': [1.],
        'use-dropout': [False],
        'learning-rate': [0.0001],
        'reload': [True]})
--------------------------------------------------------------------------------
/session2/train_nmt_all.py:
--------------------------------------------------------------------------------
import numpy
import os

from nmt import train

def main(job_id, params):
    print params
    validerr = train(saveto=params['model'][0],
                     reload_=params['reload'][0],
                     dim_word=params['dim_word'][0],
                     dim=params['dim'][0],
                     n_words=params['n-words'][0],
                     n_words_src=params['n-words'][0],
                     decay_c=params['decay-c'][0],
                     clip_c=params['clip-c'][0],
                     lrate=params['learning-rate'][0],
                     optimizer=params['optimizer'][0],
                     maxlen=50,
                     batch_size=32,
                     valid_batch_size=32,
                     datasets=['/ichec/home/users/%s/data/all.en.concat.shuf.gz'%os.environ['USER'],
                               '/ichec/home/users/%s/data/all.fr.concat.shuf.gz'%os.environ['USER']],
                     valid_datasets=['/ichec/home/users/%s/data/newstest2011.en.tok'%os.environ['USER'],
                                     '/ichec/home/users/%s/data/newstest2011.fr.tok'%os.environ['USER']],
                     dictionaries=['/ichec/home/users/%s/data/all.en.concat.gz.pkl'%os.environ['USER'],
                                   '/ichec/home/users/%s/data/all.fr.concat.gz.pkl'%os.environ['USER']],
                     validFreq=5000,
                     dispFreq=10,
                     saveFreq=5000,
                     sampleFreq=1000,
                     use_dropout=params['use-dropout'][0],
                     overwrite=False)
    return validerr

if __name__ == '__main__':
    main(0, {
        'model': ['/ichec/home/users/%s/models/model_session2_all.npz'%os.environ['USER']],
        'dim_word': [500],
        'dim': [1024],
        'n-words': [30000],
        'optimizer': ['adadelta'],
        'decay-c': [0.],
        'clip-c': [1.],
        'use-dropout': [False],
        'learning-rate': [0.0001],
        'reload': [False]})
--------------------------------------------------------------------------------
/session2/translate.py:
--------------------------------------------------------------------------------
'''
Translates a source file using a translation model.
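
Usage:
    python translate.py [-k beam_size] [-p n_processes] [-n] [-c] \
        model dictionary dictionary_target source saveto

-n normalizes model scores by hypothesis length and -c operates at the
character level; `dictionary` and `dictionary_target` are the pickled
vocabularies built during preprocessing.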
'''
import argparse

import numpy
import cPickle as pkl

from nmt import (build_sampler, gen_sample, load_params,
                 init_params, init_tparams)

from multiprocessing import Process, Queue


def translate_model(queue, rqueue, pid, model, options, k, normalize):

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    # build the sampling graph (f_init encodes, f_next decodes one step)
    f_init, f_next = build_sampler(tparams, options, trng, use_noise)

    def _translate(seq):
        # sample given an input sequence and obtain scores
        sample, score = gen_sample(tparams, f_init, f_next,
                                   numpy.array(seq).reshape([len(seq), 1]),
                                   options, trng=trng, k=k, maxlen=200,
                                   stochastic=False, argmax=False)

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        sidx = numpy.argmin(score)
        return sample[sidx]

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        print pid, '-', idx
        seq = _translate(x)

        rqueue.put((idx, seq))

    return


def main(model, dictionary, dictionary_target, source_file, saveto, k=5,
         normalize=False, n_process=5, chr_level=False):

    # load model options
    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    # load source dictionary and invert
    with open(dictionary, 'rb') as f:
        word_dict = pkl.load(f)
    word_idict = dict()
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # load target dictionary and invert
    with open(dictionary_target, 'rb') as f:
        word_dict_trg = pkl.load(f)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # create input and output queues for processes
    queue = Queue()
    rqueue = Queue()
    processes = [None] * n_process
    for midx in xrange(n_process):
        processes[midx] = Process(
            target=translate_model,
            args=(queue, rqueue, midx, model, options, k, normalize))
        processes[midx].start()

    # utility function
    def _seqs2words(caps):
        capsw = []
        for cc in caps:
            ww = []
            for w in cc:
                if w == 0:
                    break
                ww.append(word_idict_trg[w])
            capsw.append(' '.join(ww))
        return capsw

    def _send_jobs(fname):
        with open(fname, 'r') as f:
            for idx, line in enumerate(f):
                if chr_level:
                    words = list(line.decode('utf-8').strip())
                else:
                    words = line.strip().split()
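                # map words to indices: out-of-vocabulary words become
                # 1 (UNK), indices beyond the training vocabulary are
                # clipped to UNK, and 0 (<eos>) terminates the sequence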
                x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
                x = map(lambda ii: ii if ii < options['n_words_src'] else 1, x)
                x += [0]
                queue.put((idx, x))
        return idx+1

    def _finish_processes():
        for midx in xrange(n_process):
            queue.put(None)

    def _retrieve_jobs(n_samples):
        trans = [None] * n_samples
        for idx in xrange(n_samples):
            resp = rqueue.get()
            # results arrive out of order; index by sentence id
            trans[resp[0]] = resp[1]
            if numpy.mod(idx, 10) == 0:
                print 'Sample ', (idx+1), '/', n_samples, ' Done'
        return trans

    print 'Translating ', source_file, '...'
    n_samples = _send_jobs(source_file)
    trans = _seqs2words(_retrieve_jobs(n_samples))
    _finish_processes()
    with open(saveto, 'w') as f:
        print >>f, '\n'.join(trans)
    print 'Done'


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-k', type=int, default=5)
    parser.add_argument('-p', type=int, default=5)
    parser.add_argument('-n', action="store_true", default=False)
    parser.add_argument('-c', action="store_true", default=False)
    parser.add_argument('model', type=str)
    parser.add_argument('dictionary', type=str)
    parser.add_argument('dictionary_target', type=str)
    parser.add_argument('source', type=str)
    parser.add_argument('saveto', type=str)

    args = parser.parse_args()

    main(args.model, args.dictionary, args.dictionary_target, args.source,
         args.saveto, k=args.k, normalize=args.n, n_process=args.p,
         chr_level=args.c)
--------------------------------------------------------------------------------
/session3/README.md:
--------------------------------------------------------------------------------
Rescoring an attention-based encoder-decoder model with an RNN language model

## Training
Change the hard-coded paths to data in `nmt.py`, then run
```
THEANO_FLAGS=device=gpu,floatX=float32 python train_nmt.py
```
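
## Rescoring
A minimal sketch of the full pipeline (file names below are placeholders;
`score_nbest.sh` and `test.sh` contain the cluster versions with real paths):

```bash
# 1. decode with the translation model, keeping a 3-best list per sentence
python ./translate.py -n -p 1 -b 3 \
    model_session3.npz src_dict.pkl trg_dict.pkl input.tok nbest.txt

# 2. rescore the n-best list with the session0 language model, keeping
#    the best hypothesis per sentence (beta weights the LM score)
python ./rescore_with_lm.py -n -b 0.5 \
    model_session0.npz model_session0.npz.pkl \
    lm_dict.pkl trg_dict.pkl nbest.txt output.tok
```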
| tt = [w if w < self.n_words_target else 1 for w in tt] 74 | 75 | if len(ss) > self.maxlen and len(tt) > self.maxlen: 76 | continue 77 | 78 | source.append(ss) 79 | target.append(tt) 80 | 81 | if len(source) >= self.batch_size or \ 82 | len(target) >= self.batch_size: 83 | break 84 | except IOError: 85 | self.end_of_data = True 86 | 87 | if len(source) <= 0 or len(target) <= 0: 88 | self.end_of_data = False 89 | self.reset() 90 | raise StopIteration 91 | 92 | return source, target 93 | -------------------------------------------------------------------------------- /session3/rescore_with_lm.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import cPickle as pkl 3 | import numpy 4 | import theano 5 | 6 | from collections import OrderedDict 7 | from theano import tensor 8 | 9 | import lm 10 | 11 | profile = False 12 | 13 | 14 | # load parameters 15 | def load_params(path): 16 | pp = numpy.load(path) 17 | params = OrderedDict() 18 | for kk, vv in pp.iteritems(): 19 | params[kk] = vv 20 | 21 | return params 22 | 23 | 24 | # Loads a pickled dictionary 25 | def load_dictionary(filename): 26 | with open(filename, 'rb') as f: 27 | word_dict = pkl.load(f) 28 | return word_dict 29 | 30 | 31 | # Inverts a dictionary and ensures special tokens 32 | def invert_dictionary(word_dict): 33 | word_idict = dict() 34 | for kk, vv in word_dict.iteritems(): 35 | word_idict[vv] = kk 36 | word_idict[0] = '' 37 | word_idict[1] = 'UNK' 38 | return word_idict 39 | 40 | 41 | # initialize Theano shared variables according to the initial parameters 42 | def init_tparams(params): 43 | tparams = OrderedDict() 44 | for kk, pp in params.iteritems(): 45 | tparams[kk] = theano.shared(params[kk], name=kk) 46 | return tparams 47 | 48 | 49 | # layers: 'name': ('parameter initializer', 'feedforward') 50 | layers = {'ff': ('lm.param_init_fflayer', 'lm.fflayer'), 51 | 'gru': ('lm.param_init_gru', 'lm.gru_layer'), 52 | } 53 | 54 | 55 | # Utility function to get layer props 56 | def get_layer(name): 57 | fns = layers[name] 58 | return (eval(fns[0]), eval(fns[1])) 59 | 60 | 61 | # build a sampler 62 | def build_sampler(tparams, options): 63 | # x: 1 x 1 64 | y = tensor.vector('y_sampler', dtype='int64') 65 | init_state = tensor.matrix('init_state', dtype='float32') 66 | 67 | # if it's the first word, emb should be all zero 68 | emb = tensor.switch(y[:, None] < 0, 69 | tensor.alloc(0., 1, tparams['Wemb'].shape[1]), 70 | tparams['Wemb'][y]) 71 | proj = get_layer(options['encoder'])[1](tparams, emb, options, 72 | prefix='encoder', 73 | mask=None, 74 | one_step=True, 75 | init_state=init_state) 76 | next_state = proj[0] 77 | 78 | logit_lstm = get_layer('ff')[1](tparams, next_state, options, 79 | prefix='ff_logit_lstm', activ='linear') 80 | logit_prev = get_layer('ff')[1](tparams, emb, options, 81 | prefix='ff_logit_prev', activ='linear') 82 | logit = tensor.tanh(logit_lstm+logit_prev) 83 | logit = get_layer('ff')[1](tparams, logit, options, 84 | prefix='ff_logit', activ='linear') 85 | next_probs = tensor.nnet.softmax(logit) 86 | 87 | # next word probability 88 | print 'Building f_next..', 89 | inps = [y, init_state] 90 | outs = [next_probs, next_state] 91 | f_next = theano.function(inps, outs, name='f_next', profile=profile) 92 | print 'Done' 93 | 94 | return f_next 95 | 96 | 97 | # Scores a given sequence with the language model 98 | def score_seq(seq, f_next, options, normalize): 99 | 100 | next_w = -1 * numpy.ones((1,)).astype('int64') 101 | next_state = numpy.zeros((1, 
options['dim'])).astype('float32') 102 | 103 | seq_len = len(seq) 104 | sample_score = 0 105 | for ii in xrange(seq_len): 106 | inps = [next_w, next_state] 107 | ret = f_next(*inps) 108 | next_p, next_state = ret[0], ret[1] 109 | 110 | # accumulate nll for each token 111 | sample_score -= numpy.log(next_p[0, seq[ii]]) 112 | 113 | if normalize: 114 | sample_score /= seq_len 115 | 116 | return sample_score 117 | 118 | 119 | # Linearly interpolate between two scores using beta 120 | def shallow_fusion(score_lm, score_tm, beta, convex_comb): 121 | if convex_comb: 122 | return (1 - beta) * score_tm + (beta * score_lm) 123 | return score_tm + (beta * score_lm) 124 | 125 | 126 | def main(model, model_options, dictionary_lm, dictionary_tm, 127 | source, saveto, normalize=False, chr_level=False, 128 | beta=0.5, convex_comb=False): 129 | 130 | # load model options 131 | model_options = pkl.load(open(model_options)) 132 | 133 | # reload parameters 134 | print 'Loading language model..', 135 | params = load_params(model) 136 | tparams = init_tparams(params) 137 | print 'Done' 138 | 139 | print 'Loading LM dictionary..', 140 | word_dict_lm = load_dictionary(dictionary_lm) 141 | print 'Done' 142 | 143 | print 'Loading TM dictionary..', 144 | word_dict_tm = load_dictionary(dictionary_tm) 145 | word_idict_tm = invert_dictionary(word_dict_tm) 146 | print 'Done' 147 | 148 | f_next = build_sampler(tparams, model_options) 149 | 150 | # Create a cross dictionary from tm to lm 151 | tm2lm_idx = {} 152 | for idx, word in word_idict_tm.items(): 153 | tm2lm_idx[idx] = word_dict_lm.get(word, 1) 154 | tm2lm_idx[0] = 0 # 155 | tm2lm_idx[1] = 1 # UNK 156 | 157 | # Iterate over the n-best list generated by TM 158 | print 'Rescoring..', 159 | new_trans = [] 160 | nbest_idx = 0 161 | with open(source, 'r') as f: 162 | scores_in_nbest = [] 163 | trans_in_nbest = [] 164 | for idx, line in enumerate(f): 165 | line_idx, trans, score_tm = line.strip().split('|||') 166 | if chr_level: 167 | words = list(trans.decode('utf-8').strip()) 168 | else: 169 | words = line.strip().split() 170 | x = map(lambda w: word_dict_tm[w] 171 | if w in word_dict_tm else 1, words) 172 | x = map(lambda ii: ii if ii < model_options['n_words'] else 1, x) 173 | x += [0] 174 | 175 | # Score the sequence with LM 176 | x_lm = [tm2lm_idx[xx] for xx in x] 177 | score_lm = score_seq(x_lm, f_next, model_options, normalize) 178 | 179 | # Take linear interpolation with Beta 180 | new_score = shallow_fusion(score_lm, float(score_tm), 181 | beta, convex_comb) 182 | if int(line_idx) > nbest_idx: 183 | new_trans.append( 184 | trans_in_nbest[numpy.argmin(scores_in_nbest)]) 185 | 186 | scores_in_nbest = [] 187 | trans_in_nbest = [] 188 | nbest_idx += 1 189 | else: 190 | scores_in_nbest.append(new_score) 191 | trans_in_nbest.append(trans) 192 | 193 | print 'Done' 194 | print 'Saving to %s' % saveto 195 | with open(saveto, 'w') as f: 196 | print >>f, '\n'.join(new_trans) 197 | print 'Done' 198 | return 199 | 200 | 201 | if __name__ == "__main__": 202 | parser = argparse.ArgumentParser() 203 | parser.add_argument('-b', '--beta', type=float, default=1., 204 | help="Weight for language model score") 205 | parser.add_argument('-n', action="store_true", default=False, 206 | help="Normalize wrt sequence length") 207 | parser.add_argument('-c', action="store_true", default=False, 208 | help="Character level") 209 | parser.add_argument('-x', action="store_true", default=False, 210 | help="Take convex combination using beta") 211 | parser.add_argument('model', type=str) 212 | 
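# For example (with assumed numbers): if score_tm = 2.1 and
# score_lm = 3.4 (negative log-probabilities, lower is better), then
# beta = 0.5 gives 2.1 + 0.5 * 3.4 = 3.8 for the default combination
# and 0.5 * 2.1 + 0.5 * 3.4 = 2.75 for the convex combination.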

def main(model, model_options, dictionary_lm, dictionary_tm,
         source, saveto, normalize=False, chr_level=False,
         beta=0.5, convex_comb=False):

    # load model options
    model_options = pkl.load(open(model_options))

    # reload parameters
    print 'Loading language model..',
    params = load_params(model)
    tparams = init_tparams(params)
    print 'Done'

    print 'Loading LM dictionary..',
    word_dict_lm = load_dictionary(dictionary_lm)
    print 'Done'

    print 'Loading TM dictionary..',
    word_dict_tm = load_dictionary(dictionary_tm)
    word_idict_tm = invert_dictionary(word_dict_tm)
    print 'Done'

    f_next = build_sampler(tparams, model_options)

    # Create a cross dictionary from tm to lm
    tm2lm_idx = {}
    for idx, word in word_idict_tm.items():
        tm2lm_idx[idx] = word_dict_lm.get(word, 1)
    tm2lm_idx[0] = 0  # <eos>
    tm2lm_idx[1] = 1  # UNK

    # Iterate over the n-best list generated by TM
    print 'Rescoring..',
    new_trans = []
    nbest_idx = 0
    with open(source, 'r') as f:
        scores_in_nbest = []
        trans_in_nbest = []
        for idx, line in enumerate(f):
            line_idx, trans, score_tm = line.strip().split('|||')
            # tokenize the hypothesis itself, not the whole n-best line
            if chr_level:
                words = list(trans.decode('utf-8').strip())
            else:
                words = trans.strip().split()
            x = map(lambda w: word_dict_tm[w]
                    if w in word_dict_tm else 1, words)
            x = map(lambda ii: ii if ii < model_options['n_words'] else 1, x)
            x += [0]

            # Score the sequence with LM
            x_lm = [tm2lm_idx[xx] for xx in x]
            score_lm = score_seq(x_lm, f_next, model_options, normalize)

            # Take linear interpolation with beta
            new_score = shallow_fusion(score_lm, float(score_tm),
                                       beta, convex_comb)

            # a new line index marks the start of the next n-best group:
            # keep the best (lowest-scoring) hypothesis of the finished one
            if int(line_idx) > nbest_idx:
                new_trans.append(
                    trans_in_nbest[numpy.argmin(scores_in_nbest)])
                scores_in_nbest = []
                trans_in_nbest = []
                nbest_idx += 1

            # record the current hypothesis as well
            scores_in_nbest.append(new_score)
            trans_in_nbest.append(trans)

    # keep the best hypothesis of the final group
    if len(trans_in_nbest) > 0:
        new_trans.append(trans_in_nbest[numpy.argmin(scores_in_nbest)])

    print 'Done'
    print 'Saving to %s' % saveto
    with open(saveto, 'w') as f:
        print >>f, '\n'.join(new_trans)
    print 'Done'
    return


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-b', '--beta', type=float, default=1.,
                        help="Weight for language model score")
    parser.add_argument('-n', action="store_true", default=False,
                        help="Normalize wrt sequence length")
    parser.add_argument('-c', action="store_true", default=False,
                        help="Character level")
    parser.add_argument('-x', action="store_true", default=False,
                        help="Take convex combination using beta")
    parser.add_argument('model', type=str)
    parser.add_argument('model_options', type=str)
    parser.add_argument('dictionary_lm', type=str,
                        help='Dictionary of language model')
    parser.add_argument('dictionary_tm', type=str,
                        help='Target side dictionary of translation model')
    parser.add_argument('source', type=str)
    parser.add_argument('saveto', type=str)

    args = parser.parse_args()

    main(args.model, args.model_options,
         args.dictionary_lm, args.dictionary_tm,
         args.source, args.saveto,
         normalize=args.n, chr_level=args.c, beta=args.beta,
         convex_comb=args.x)
--------------------------------------------------------------------------------
/session3/score_nbest.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#PBS -l nodes=1:ppn=20
#PBS -l walltime=48:00:00
#PBS -N session3_default
#PBS -A course
#PBS -q GpuQ

export THEANO_FLAGS=device=cpu,floatX=float32

cd $PBS_O_WORKDIR
python ./rescore_with_lm.py -n -b 0.5 \
    ${HOME}/models/model_session0.npz \
    ${HOME}/models/model_session0.npz.pkl \
    ${HOME}/data/wiki.tok.txt.gz.pkl \
    ${HOME}/data/europarl-v7.fr-en.en.tok.pkl \
    ./newstest2011.trans.en.tok \
    ./newstest2011.trans.en.tok.rescored
--------------------------------------------------------------------------------
/session3/test.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#PBS -l nodes=1:ppn=24
#PBS -l walltime=4:00:00
#PBS -N session3_default
#PBS -A course
#PBS -q ShortQ

export THEANO_FLAGS=device=cpu,floatX=float32

cd $PBS_O_WORKDIR
python ./translate.py -n -p 1 -b 3 \
    ${HOME}/models/model_session3.npz \
    ${HOME}/data/europarl-v7.fr-en.fr.tok.pkl \
    ${HOME}/data/europarl-v7.fr-en.en.tok.pkl \
    ${HOME}/newstest2011.fr.tok \
    ./newstest2011.trans.en.tok
--------------------------------------------------------------------------------
/session3/train.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#PBS -l nodes=1:ppn=20
#PBS -l walltime=168:00:00
#PBS -N session3_default
#PBS -A course
#PBS -q GpuQ

export THEANO_FLAGS=device=gpu,floatX=float32

cd $PBS_O_WORKDIR
python ./train_nmt.py
--------------------------------------------------------------------------------
/session3/train_all.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#PBS -l nodes=1:ppn=20
#PBS -l walltime=168:00:00
#PBS -N session3_default
#PBS -A course
#PBS -q GpuQ

export THEANO_FLAGS=device=gpu,floatX=float32

cd $PBS_O_WORKDIR
python ./train_nmt_all.py
--------------------------------------------------------------------------------
/session3/train_nmt.py:
--------------------------------------------------------------------------------
import numpy
import os

from nmt import train

def main(job_id, params):
    print params
    basedir = '/data/lisatmp3/firatorh/nmt/europarlv7'
    validerr = train(saveto=params['model'][0],
                     reload_=params['reload'][0],
                     dim_word=params['dim_word'][0],
                     dim=params['dim'][0],
                     n_words=params['n-words'][0],
                     n_words_src=params['n-words'][0],
                     decay_c=params['decay-c'][0],
                     clip_c=params['clip-c'][0],
                     lrate=params['learning-rate'][0],
                     optimizer=params['optimizer'][0],
                     maxlen=15,
                     batch_size=32,
                     valid_batch_size=32,
                     datasets=['%s/europarl-v7.fr-en.fr.tok'%basedir,
                               '%s/europarl-v7.fr-en.en.tok'%basedir],
                     valid_datasets=['%s/newstest2011.fr.tok'%basedir,
                                     '%s/newstest2011.en.tok'%basedir],
                     dictionaries=['%s/europarl-v7.fr-en.fr.tok.pkl'%basedir,
                                   '%s/europarl-v7.fr-en.en.tok.pkl'%basedir],
                     validFreq=500000,
                     dispFreq=1,
                     saveFreq=100,
                     sampleFreq=50,
                     use_dropout=params['use-dropout'][0],
                     overwrite=False)
    return validerr

if __name__ == '__main__':
    basedir = '/data/lisatmp3/firatorh/nmt/europarlv7'
    main(0, {
        'model': ['%s/models/model_session3.npz'%basedir],
        'dim_word': [150],
        'dim': [124],
        'n-words': [3000],
        'optimizer': ['adadelta'],
        'decay-c': [0.],
        'clip-c': [1.],
        'use-dropout': [False],
        'learning-rate': [0.0001],
        'reload': [False]})
--------------------------------------------------------------------------------
/session3/train_nmt_all.py:
--------------------------------------------------------------------------------
import numpy
import os

from nmt import train

def main(job_id, params):
    print params
    validerr = train(saveto=params['model'][0],
                     reload_=params['reload'][0],
                     dim_word=params['dim_word'][0],
                     dim=params['dim'][0],
                     n_words=params['n-words'][0],
                     n_words_src=params['n-words'][0],
                     decay_c=params['decay-c'][0],
                     clip_c=params['clip-c'][0],
                     lrate=params['learning-rate'][0],
                     optimizer=params['optimizer'][0],
                     maxlen=50,
                     batch_size=32,
                     valid_batch_size=32,
                     datasets=['/ichec/home/users/%s/data/all.en.concat.shuf.gz'%os.environ['USER'],
                               '/ichec/home/users/%s/data/all.fr.concat.shuf.gz'%os.environ['USER']],
                     valid_datasets=['/ichec/home/users/%s/data/newstest2011.en.tok'%os.environ['USER'],
                                     '/ichec/home/users/%s/data/newstest2011.fr.tok'%os.environ['USER']],
                     dictionaries=['/ichec/home/users/%s/data/all.en.concat.gz.pkl'%os.environ['USER'],
                                   '/ichec/home/users/%s/data/all.fr.concat.gz.pkl'%os.environ['USER']],
                     validFreq=5000,
                     dispFreq=10,
                     saveFreq=5000,
                     sampleFreq=1000,
                     use_dropout=params['use-dropout'][0],
                     overwrite=False)
    return validerr

if __name__ == '__main__':
    main(0, {
        'model': ['/ichec/home/users/%s/models/model_session3_all.npz'%os.environ['USER']],
        'dim_word': [500],
        'dim': [1024],
        'n-words': [30000],
        'optimizer': ['adadelta'],
        'decay-c': [0.],
        'clip-c': [1.],
        'use-dropout': [False],
        'learning-rate': [0.0001],
        'reload': [False]})
--------------------------------------------------------------------------------
/session3/translate.py:
--------------------------------------------------------------------------------
'''
Translates a source file using a translation model.
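
Usage:
    python translate.py [-k beam_size] [-p n_processes] [-n] [-c] [-b n_best] \
        model dictionary dictionary_target source saveto

With -b greater than 1, an n-best list is written instead of single
translations, one `sentence_id|||translation|||score` entry per line, in
the format consumed by rescore_with_lm.py.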
'''
import argparse

import numpy
import cPickle as pkl

from nmt import (build_sampler, gen_sample, load_params,
                 init_params, init_tparams)

from multiprocessing import Process, Queue


def translate_model(queue, rqueue, pid, model, options, k, normalize, n_best):

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    # build the sampling graph (f_init encodes, f_next decodes one step)
    f_init, f_next = build_sampler(tparams, options, trng, use_noise)

    def _translate(seq):
        # sample given an input sequence and obtain scores
        sample, score = gen_sample(tparams, f_init, f_next,
                                   numpy.array(seq).reshape([len(seq), 1]),
                                   options, trng=trng, k=k, maxlen=200,
                                   stochastic=False, argmax=False)

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        if n_best > 1:
            sidx = numpy.argsort(score)[:n_best]
        else:
            sidx = numpy.argmin(score)
        return numpy.array(sample)[sidx], numpy.array(score)[sidx]

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        print pid, '-', idx
        seq, scores = _translate(x)

        rqueue.put((idx, seq, scores))

    return


def main(model, dictionary, dictionary_target, source_file, saveto, k=5,
         normalize=False, n_process=5, chr_level=False, n_best=1):

    # load model options
    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    # load source dictionary and invert
    with open(dictionary, 'rb') as f:
        word_dict = pkl.load(f)
    word_idict = dict()
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # load target dictionary and invert
    with open(dictionary_target, 'rb') as f:
        word_dict_trg = pkl.load(f)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # create input and output queues for processes
    queue = Queue()
    rqueue = Queue()
    processes = [None] * n_process
    for midx in xrange(n_process):
        processes[midx] = Process(
            target=translate_model,
            args=(queue, rqueue, midx, model, options, k, normalize, n_best))
        processes[midx].start()

    # utility function
    def _seqs2words(caps):
        capsw = []
        for cc in caps:
            ww = []
            for w in cc:
                if w == 0:
                    break
                ww.append(word_idict_trg[w])
            capsw.append(' '.join(ww))
        return capsw

    def _send_jobs(fname):
        with open(fname, 'r') as f:
            for idx, line in enumerate(f):
                if chr_level:
                    words = list(line.decode('utf-8').strip())
                else:
                    words = line.strip().split()
                x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
                x = map(lambda ii: ii if ii < options['n_words'] else 1, x)
                x += [0]
                queue.put((idx, x))
        return idx+1

    def _finish_processes():
        for midx in xrange(n_process):
            queue.put(None)

    def _retrieve_jobs(n_samples):
        trans = [None] * n_samples
        scores = [None] * n_samples
        for idx in xrange(n_samples):
            resp = rqueue.get()
            trans[resp[0]] = resp[1]
            scores[resp[0]] = resp[2]
            if numpy.mod(idx, 10) == 0:
                print 'Sample ', (idx+1), '/', n_samples, ' Done'
        return trans, scores

    print 'Translating ', source_file, '...'
    n_samples = _send_jobs(source_file)
    trans, scores = _retrieve_jobs(n_samples)
    _finish_processes()

    if n_best == 1:
        trans = _seqs2words(trans)
    else:
        n_best_trans = []
        for idx, (n_best_tr, score_) in enumerate(zip(trans, scores)):
            sentences = _seqs2words(n_best_tr)
            for ids, trans_ in enumerate(sentences):
                n_best_trans.append(
                    '|||'.join(
                        ['{}'.format(idx), trans_,
                         '{}'.format(score_[ids])]))
        trans = n_best_trans

    with open(saveto, 'w') as f:
        print >>f, '\n'.join(trans)
    print 'Done'


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-k', type=int, default=5, help="Beam size")
    parser.add_argument('-p', type=int, default=5, help="Number of processes")
    parser.add_argument('-n', action="store_true", default=False,
                        help="Normalize wrt sequence length")
    parser.add_argument('-c', action="store_true", default=False,
                        help="Character level")
    parser.add_argument('-b', type=int, default=1, help="Output n-best list")
    parser.add_argument('model', type=str)
    parser.add_argument('dictionary', type=str)
    parser.add_argument('dictionary_target', type=str)
    parser.add_argument('source', type=str)
    parser.add_argument('saveto', type=str)

    args = parser.parse_args()

    main(args.model, args.dictionary, args.dictionary_target, args.source,
         args.saveto, k=args.k, normalize=args.n, n_process=args.p,
         chr_level=args.c, n_best=args.b)
--------------------------------------------------------------------------------