├── programs ├── MergeAndSplit.class ├── CleanChineseFile.class ├── LenRatioRemover.class ├── SplitChineseFile.class ├── ChineseSpecialRemover.class ├── SpecialSentRemoverENDE.class ├── CleanChineseFile.java ├── SplitChineseFile.java ├── LenRatioRemover.java ├── MergeAndSplit.java ├── SpecialSentRemoverENDE.java └── ChineseSpecialRemover.java ├── scripts ├── nonbreaking_prefixes │ ├── README.txt │ ├── nonbreaking_prefix.ro │ ├── nonbreaking_prefix.ga │ ├── nonbreaking_prefix.sv │ ├── nonbreaking_prefix.ca │ ├── nonbreaking_prefix.sl │ ├── nonbreaking_prefix.yue │ ├── nonbreaking_prefix.zh │ ├── nonbreaking_prefix.es │ ├── nonbreaking_prefix.lv │ ├── nonbreaking_prefix.fr │ ├── nonbreaking_prefix.en │ ├── nonbreaking_prefix.fi │ ├── nonbreaking_prefix.hu │ ├── nonbreaking_prefix.nl │ ├── nonbreaking_prefix.is │ ├── nonbreaking_prefix.it │ ├── nonbreaking_prefix.ru │ ├── nonbreaking_prefix.pl │ ├── nonbreaking_prefix.pt │ ├── nonbreaking_prefix.ta │ ├── nonbreaking_prefix.de │ ├── nonbreaking_prefix.cs │ ├── nonbreaking_prefix.sk │ ├── nonbreaking_prefix.lt │ └── nonbreaking_prefix.el ├── deescape-special-chars.perl ├── input-from-sgm.perl ├── shuffle.py ├── normalize-punctuation.perl ├── truecase.perl ├── tokenizeChinese.py └── tokenizer.perl ├── README.md ├── .gitignore ├── bpe ├── generate_vocab.py ├── learn_joint_bpe_and_vocab.py ├── learn_bpe.py └── apply_bpe.py ├── fetch_wmt2018_zhen.sh ├── fetch_wmt2017_ende.sh └── LICENSE /programs/MergeAndSplit.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaocq-nlp/MT-data-processing/HEAD/programs/MergeAndSplit.class -------------------------------------------------------------------------------- /programs/CleanChineseFile.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaocq-nlp/MT-data-processing/HEAD/programs/CleanChineseFile.class 
-------------------------------------------------------------------------------- /programs/LenRatioRemover.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaocq-nlp/MT-data-processing/HEAD/programs/LenRatioRemover.class -------------------------------------------------------------------------------- /programs/SplitChineseFile.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaocq-nlp/MT-data-processing/HEAD/programs/SplitChineseFile.class -------------------------------------------------------------------------------- /programs/ChineseSpecialRemover.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaocq-nlp/MT-data-processing/HEAD/programs/ChineseSpecialRemover.class -------------------------------------------------------------------------------- /programs/SpecialSentRemoverENDE.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaocq-nlp/MT-data-processing/HEAD/programs/SpecialSentRemoverENDE.class -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/README.txt: -------------------------------------------------------------------------------- 1 | The language suffix can be found here: 2 | 3 | http://www.loc.gov/standards/iso639-2/php/code_list.php 4 | 5 | This code includes data from Daniel Naber's Language Tools (czech abbreviations). 6 | This code includes data from czech wiktionary (also czech abbreviations). 
7 | 8 | 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MT-data-processing 2 | This repository contains scripts for the shared translation task at the Statistical Machine Translation. 3 | Now, it deals with: 4 | - WMT2017 EN<->DE 5 | 6 | ## References 7 | - [BPE](https://github.com/rsennrich/subword-nmt) 8 | - [mosesdecoder](https://github.com/moses-smt/mosesdecoder) 9 | 10 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.ro: -------------------------------------------------------------------------------- 1 | A 2 | B 3 | C 4 | D 5 | E 6 | F 7 | G 8 | H 9 | I 10 | J 11 | K 12 | L 13 | M 14 | N 15 | O 16 | P 17 | Q 18 | R 19 | S 20 | T 21 | U 22 | V 23 | W 24 | X 25 | Y 26 | Z 27 | dpdv 28 | etc 29 | șamd 30 | M.Ap.N 31 | dl 32 | Dl 33 | d-na 34 | D-na 35 | dvs 36 | Dvs 37 | pt 38 | Pt 39 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.ga: -------------------------------------------------------------------------------- 1 | 2 | A 3 | B 4 | C 5 | D 6 | E 7 | F 8 | G 9 | H 10 | I 11 | J 12 | K 13 | L 14 | M 15 | N 16 | O 17 | P 18 | Q 19 | R 20 | S 21 | T 22 | U 23 | V 24 | W 25 | X 26 | Y 27 | Z 28 | Á 29 | É 30 | Í 31 | Ó 32 | Ú 33 | 34 | Uacht 35 | Dr 36 | B.Arch 37 | 38 | m.sh 39 | .i 40 | Co 41 | Cf 42 | cf 43 | i.e 44 | r 45 | Chr 46 | lch #NUMERIC_ONLY# 47 | lgh #NUMERIC_ONLY# 48 | uimh #NUMERIC_ONLY# 49 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.sv: -------------------------------------------------------------------------------- 1 | #single upper case letter are usually initials 2 | A 3 | B 4 | C 5 | D 6 | E 7 | F 8 | G 9 | H 10 | I 11 | J 12 | K 13 | L 14 | M 15 | N 16 | O 17 
| P 18 | Q 19 | R 20 | S 21 | T 22 | U 23 | V 24 | W 25 | X 26 | Y 27 | Z 28 | #misc abbreviations 29 | AB 30 | G 31 | VG 32 | dvs 33 | etc 34 | from 35 | iaf 36 | jfr 37 | kl 38 | kr 39 | mao 40 | mfl 41 | mm 42 | osv 43 | pga 44 | tex 45 | tom 46 | vs 47 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.ca: -------------------------------------------------------------------------------- 1 | Dr 2 | Dra 3 | pàg 4 | p 5 | c 6 | av 7 | Sr 8 | Sra 9 | adm 10 | esq 11 | Prof 12 | S.A 13 | S.L 14 | p.e 15 | ptes 16 | Sta 17 | St 18 | pl 19 | màx 20 | cast 21 | dir 22 | nre 23 | fra 24 | admdora 25 | Emm 26 | Excma 27 | espf 28 | dc 29 | admdor 30 | tel 31 | angl 32 | aprox 33 | ca 34 | dept 35 | dj 36 | dl 37 | dt 38 | ds 39 | dg 40 | dv 41 | ed 42 | entl 43 | al 44 | i.e 45 | maj 46 | smin 47 | n 48 | núm 49 | pta 50 | A 51 | B 52 | C 53 | D 54 | E 55 | F 56 | G 57 | H 58 | I 59 | J 60 | K 61 | L 62 | M 63 | N 64 | O 65 | P 66 | Q 67 | R 68 | S 69 | T 70 | U 71 | V 72 | W 73 | X 74 | Y 75 | Z 76 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.sl: -------------------------------------------------------------------------------- 1 | dr 2 | Dr 3 | itd 4 | itn 5 | št #NUMERIC_ONLY# 6 | Št #NUMERIC_ONLY# 7 | d 8 | jan 9 | Jan 10 | feb 11 | Feb 12 | mar 13 | Mar 14 | apr 15 | Apr 16 | jun 17 | Jun 18 | jul 19 | Jul 20 | avg 21 | Avg 22 | sept 23 | Sept 24 | sep 25 | Sep 26 | okt 27 | Okt 28 | nov 29 | Nov 30 | dec 31 | Dec 32 | tj 33 | Tj 34 | npr 35 | Npr 36 | sl 37 | Sl 38 | op 39 | Op 40 | gl 41 | Gl 42 | oz 43 | Oz 44 | prev 45 | dipl 46 | ing 47 | prim 48 | Prim 49 | cf 50 | Cf 51 | gl 52 | Gl 53 | A 54 | B 55 | C 56 | D 57 | E 58 | F 59 | G 60 | H 61 | I 62 | J 63 | K 64 | L 65 | M 66 | N 67 | O 68 | P 69 | Q 70 | R 71 | S 72 | T 73 | U 74 | V 75 | W 76 | X 77 | Y 78 | Z 79 | 
-------------------------------------------------------------------------------- /scripts/deescape-special-chars.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 5 | 6 | use warnings; 7 | use strict; 8 | 9 | while() { 10 | s/\&bar;/\|/g; # factor separator (legacy) 11 | s/\|/\|/g; # factor separator 12 | s/\</\/g; # xml 14 | s/\&bra;/\[/g; # syntax non-terminal (legacy) 15 | s/\&ket;/\]/g; # syntax non-terminal (legacy) 16 | s/\"/\"/g; # xml 17 | s/\'/\'/g; # xml 18 | s/\[/\[/g; # syntax non-terminal 19 | s/\]/\]/g; # syntax non-terminal 20 | s/\&/\&/g; # escape escape 21 | print $_; 22 | } 23 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.yue: -------------------------------------------------------------------------------- 1 | # 2 | # Cantonese (Chinese) 3 | # 4 | # Anything in this file, followed by a period, 5 | # does NOT indicate an end-of-sentence marker. 6 | # 7 | # English/Euro-language given-name initials (appearing in 8 | # news, periodicals, etc.) 9 | A 10 | Ā 11 | B 12 | C 13 | Č 14 | D 15 | E 16 | Ē 17 | F 18 | G 19 | Ģ 20 | H 21 | I 22 | Ī 23 | J 24 | K 25 | Ķ 26 | L 27 | Ļ 28 | M 29 | N 30 | Ņ 31 | O 32 | P 33 | Q 34 | R 35 | S 36 | Š 37 | T 38 | U 39 | Ū 40 | V 41 | W 42 | X 43 | Y 44 | Z 45 | Ž 46 | 47 | # Numbers only. These should only induce breaks when followed by 48 | # a numeric sequence. 49 | # Add NUMERIC_ONLY after the word for this function. This case is 50 | # mostly for the english "No." which can either be a sentence of its 51 | # own, or if followed by a number, a non-breaking prefix. 
52 | No #NUMERIC_ONLY# 53 | Nr #NUMERIC_ONLY# 54 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.zh: -------------------------------------------------------------------------------- 1 | # 2 | # Mandarin (Chinese) 3 | # 4 | # Anything in this file, followed by a period, 5 | # does NOT indicate an end-of-sentence marker. 6 | # 7 | # English/Euro-language given-name initials (appearing in 8 | # news, periodicals, etc.) 9 | A 10 | Ā 11 | B 12 | C 13 | Č 14 | D 15 | E 16 | Ē 17 | F 18 | G 19 | Ģ 20 | H 21 | I 22 | Ī 23 | J 24 | K 25 | Ķ 26 | L 27 | Ļ 28 | M 29 | N 30 | Ņ 31 | O 32 | P 33 | Q 34 | R 35 | S 36 | Š 37 | T 38 | U 39 | Ū 40 | V 41 | W 42 | X 43 | Y 44 | Z 45 | Ž 46 | 47 | # Numbers only. These should only induce breaks when followed by 48 | # a numeric sequence. 49 | # Add NUMERIC_ONLY after the word for this function. This case is 50 | # mostly for the english "No." which can either be a sentence of its 51 | # own, or if followed by a number, a non-breaking prefix. 52 | No #NUMERIC_ONLY# 53 | Nr #NUMERIC_ONLY# 54 | -------------------------------------------------------------------------------- /scripts/input-from-sgm.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 
5 | 6 | use warnings; 7 | use strict; 8 | 9 | die("ERROR syntax: input-from-sgm.perl < in.sgm > in.txt") 10 | unless scalar @ARGV == 0; 11 | 12 | while(my $line = ) { 13 | chop($line); 14 | while ($line =~ /]+>\s*$/i) { 15 | my $next_line = ; 16 | $line .= $next_line; 17 | chop($line); 18 | } 19 | while ($line =~ /]+>\s*(.*)\s*$/i && 20 | $line !~ /]+>\s*(.*)\s*<\/seg>/i) { 21 | my $next_line = ; 22 | $line .= $next_line; 23 | chop($line); 24 | } 25 | if ($line =~ /]+>\s*(.*)\s*<\/seg>/i) { 26 | my $input = $1; 27 | $input =~ s/\s+/ /g; 28 | $input =~ s/^ //g; 29 | $input =~ s/ $//g; 30 | print $input."\n"; 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /scripts/shuffle.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | import numpy 4 | 5 | 6 | def shuffle_data(from_binding, to_binding): 7 | lines_list = [] 8 | fps = [] 9 | fws = [] 10 | for idx in range(len(from_binding)): 11 | lines_list.append([]) 12 | fps.append(open(from_binding[idx], "r")) 13 | 14 | for zip_lines in zip(*fps): 15 | for idx in range(len(zip_lines)): 16 | lines_list[idx].append(zip_lines[idx].strip()) 17 | for fp in fps: 18 | fp.close() 19 | for idx in range(len(to_binding)): 20 | fws.append(open(to_binding[idx], "w")) 21 | rands = numpy.arange(len(lines_list[0])) 22 | numpy.random.shuffle(rands) 23 | for i in rands: 24 | for idx in range(len(lines_list)): 25 | fws[idx].write(lines_list[idx][i] + "\n") 26 | for fw in fws: 27 | fw.close() 28 | 29 | 30 | froms = sys.argv[1] 31 | tos = sys.argv[2] 32 | 33 | shuffle_data(froms.strip().split(","), tos.strip().split(",")) 34 | -------------------------------------------------------------------------------- /programs/CleanChineseFile.java: -------------------------------------------------------------------------------- 1 | import java.io.*; 2 | 3 | 4 | public class CleanChineseFile { 5 | public static void 
main(String[] args) throws Exception { 6 | if (args.length < 2) { 7 | System.out.println("Usage: java SplitChineseFile in out"); 8 | return; 9 | } 10 | String inFile = args[0]; 11 | String outFile = args[1]; 12 | 13 | 14 | BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(inFile), "utf-8")); 15 | BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outFile), "utf-8")); 16 | String line; 17 | while ((line = br.readLine()) != null) { 18 | String[] tokens = line.trim().split(" +"); 19 | StringBuffer sb = new StringBuffer(); 20 | for(String tok: tokens){ 21 | sb.append(tok +" "); 22 | } 23 | 24 | bw.write(sb.toString().trim() + "\n"); 25 | } 26 | br.close(); 27 | bw.close(); 28 | 29 | 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /programs/SplitChineseFile.java: -------------------------------------------------------------------------------- 1 | import java.io.*; 2 | 3 | public class SplitChineseFile { 4 | public static void main(String[] args) throws Exception { 5 | if (args.length < 3) { 6 | System.out.println("Usage: java SplitChineseFile mergedfile src trg"); 7 | return; 8 | } 9 | String mergedFile = args[0]; 10 | String srcFile = args[1]; 11 | String trgFile = args[2]; 12 | 13 | 14 | BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(mergedFile), "utf-8")); 15 | BufferedWriter bwSrc = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(srcFile), "utf-8")); 16 | BufferedWriter bwTrg = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(trgFile), "utf-8")); 17 | String line; 18 | while ((line = br.readLine()) != null) { 19 | String[] tokens = line.trim().split("\t"); 20 | bwSrc.write(tokens[0].trim() + "\n"); 21 | bwTrg.write(tokens[1].trim() + "\n"); 22 | } 23 | 24 | br.close(); 25 | bwSrc.close(); 26 | bwTrg.close(); 27 | 28 | 29 | } 30 | } 31 | 
-------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.es: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | # Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm 34 | 35 | A.C 36 | Apdo 37 | Av 38 | Bco 39 | CC.AA 40 | Da 41 | Dep 42 | Dn 43 | Dr 44 | Dra 45 | EE.UU 46 | Excmo 47 | FF.CC 48 | Fil 49 | Gral 50 | J.C 51 | Let 52 | Lic 53 | N.B 54 | P.D 55 | P.V.P 56 | Prof 57 | Pts 58 | Rte 59 | S.A 60 | S.A.R 61 | S.E 62 | S.L 63 | S.R.C 64 | Sr 65 | Sra 66 | Srta 67 | Sta 68 | Sto 69 | T.V.E 70 | Tel 71 | Ud 72 | Uds 73 | V.B 74 | V.E 75 | Vd 76 | Vds 77 | a/c 78 | adj 79 | admón 80 | afmo 81 | apdo 82 | av 83 | c 84 | c.f 85 | c.g 86 | cap 87 | cm 88 | cta 89 | dcha 90 | doc 91 | ej 92 | entlo 93 | esq 94 | etc 95 | f.c 96 | gr 97 | grs 98 | izq 99 | kg 100 | km 101 | mg 102 | mm 103 | núm 104 | núm 105 | p 106 | p.a 107 | p.ej 108 | ptas 109 | pág 110 | págs 111 | pág 112 | págs 113 | q.e.g.e 114 | q.e.s.m 115 | s 116 | s.s.s 117 | vid 118 | vol 119 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | 
build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.lv: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | Ā 8 | B 9 | C 10 | Č 11 | D 12 | E 13 | Ē 14 | F 15 | G 16 | Ģ 17 | H 18 | I 19 | Ī 20 | J 21 | K 22 | Ķ 23 | L 24 | Ļ 25 | M 26 | N 27 | Ņ 28 | O 29 | P 30 | Q 31 | R 32 | S 33 | Š 34 | T 35 | U 36 | Ū 37 | V 38 | W 39 | X 40 | Y 41 | Z 42 | Ž 43 | 44 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 45 | dr 46 | Dr 47 | med 48 | prof 49 | Prof 50 | inž 51 | Inž 52 | ist.loc 53 | Ist.loc 54 | kor.loc 55 | Kor.loc 56 | v.i 57 | vietn 58 | Vietn 59 | 60 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 61 | a.l 62 | t.p 63 | pārb 64 | Pārb 65 | vec 66 | Vec 67 | inv 68 | Inv 69 | sk 70 | Sk 71 | spec 72 | Spec 73 | vienk 74 | Vienk 75 | virz 76 | Virz 77 | māksl 78 | Māksl 79 | mūz 80 | Mūz 81 | akad 82 | Akad 83 | soc 84 | Soc 85 | galv 86 | Galv 87 | vad 88 | Vad 89 | sertif 90 | Sertif 91 | folkl 92 | Folkl 93 | hum 94 | Hum 95 | 96 | #Numbers only. These should only induce breaks when followed by a numeric sequence 97 | # add NUMERIC_ONLY after the word for this function 98 | #This case is mostly for the english "No." which can either be a sentence of its own, or 99 | #if followed by a number, a non-breaking prefix 100 | Nr #NUMERIC_ONLY# 101 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.fr: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | # 4 | #any single upper case letter followed by a period is not a sentence ender 5 | #usually upper case letters are initials in a name 6 | #no French words end in single lower-case letters, so we throw those in too? 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | #a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | # Period-final abbreviation list for French 61 | A.C.N 62 | A.M 63 | art 64 | ann 65 | apr 66 | av 67 | auj 68 | lib 69 | B.P 70 | boul 71 | ca 72 | c.-à-d 73 | cf 74 | ch.-l 75 | chap 76 | contr 77 | C.P.I 78 | C.Q.F.D 79 | C.N 80 | C.N.S 81 | C.S 82 | dir 83 | éd 84 | e.g 85 | env 86 | al 87 | etc 88 | E.V 89 | ex 90 | fasc 91 | fém 92 | fig 93 | fr 94 | hab 95 | ibid 96 | id 97 | i.e 98 | inf 99 | LL.AA 100 | LL.AA.II 101 | LL.AA.RR 102 | LL.AA.SS 103 | L.D 104 | LL.EE 105 | LL.MM 106 | LL.MM.II.RR 107 | loc.cit 108 | masc 109 | MM 110 | ms 111 | N.B 112 | N.D.A 113 | N.D.L.R 114 | N.D.T 115 | n/réf 116 | NN.SS 117 | N.S 118 | N.D 119 | N.P.A.I 120 | p.c.c 121 | pl 122 | pp 123 | p.ex 124 | p.j 125 | P.S 126 | R.A.S 127 | R.-V 128 | R.P 129 | R.I.P 130 | SS 131 | S.S 132 | S.A 133 | S.A.I 134 | S.A.R 135 | S.A.S 136 | S.E 137 | sec 138 | sect 139 | sing 140 | S.M 141 | S.M.I.R 142 | sq 143 | sqq 144 | suiv 145 | sup 146 | suppl 147 | tél 148 | T.S.V.P 149 | vb 150 | vol 151 | vs 152 | X.O 153 | Z.I 154 | -------------------------------------------------------------------------------- /programs/LenRatioRemover.java: -------------------------------------------------------------------------------- 1 | import java.io.*; 2 | 3 | public class LenRatioRemover { 4 | 5 | public static void main(String[] args) throws Exception { 6 | if (args.length < 7) { 7 | System.out.println("Usage: java 
LenRatioRemover src trg src_div_trg_max src_div_trg_min src_output trg_output removed_to_file"); 8 | return; 9 | } 10 | String srcFile = args[0]; 11 | String trgFile = args[1]; 12 | double maxRatio = Double.parseDouble(args[2]); 13 | double minRatio = Double.parseDouble(args[3]); 14 | String srcOutputFile = args[4]; 15 | String trgOutputFile = args[5]; 16 | String removedOutputFile = args[6]; 17 | 18 | BufferedReader brSrc = new BufferedReader(new InputStreamReader(new FileInputStream(srcFile), "utf-8")); 19 | BufferedReader brTrg = new BufferedReader(new InputStreamReader(new FileInputStream(trgFile), "utf-8")); 20 | BufferedWriter bwSrc = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(srcOutputFile), "utf-8")); 21 | BufferedWriter bwTrg = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(trgOutputFile), "utf-8")); 22 | BufferedWriter bwRem = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(removedOutputFile), "utf-8")); 23 | 24 | String src, trg; 25 | 26 | while ((src = brSrc.readLine()) != null) { 27 | trg = brTrg.readLine(); 28 | double val = (double) src.trim().split(" ").length / (double) trg.trim().split(" ").length; 29 | if (val < minRatio || val > maxRatio) { 30 | bwRem.write(src + " ||| " + trg + "\n"); 31 | } else { 32 | bwSrc.write(src + "\n"); 33 | bwTrg.write(trg + "\n"); 34 | } 35 | } 36 | 37 | 38 | brSrc.close(); 39 | brTrg.close(); 40 | bwSrc.close(); 41 | bwTrg.close(); 42 | bwRem.close(); 43 | } 44 | } 45 | 46 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.en: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 34 | Adj 35 | Adm 36 | Adv 37 | Asst 38 | Bart 39 | Bldg 40 | Brig 41 | Bros 42 | Capt 43 | Cmdr 44 | Col 45 | Comdr 46 | Con 47 | Corp 48 | Cpl 49 | DR 50 | Dr 51 | Drs 52 | Ens 53 | Gen 54 | Gov 55 | Hon 56 | Hr 57 | Hosp 58 | Insp 59 | Lt 60 | MM 61 | MR 62 | MRS 63 | MS 64 | Maj 65 | Messrs 66 | Mlle 67 | Mme 68 | Mr 69 | Mrs 70 | Ms 71 | Msgr 72 | Op 73 | Ord 74 | Pfc 75 | Ph 76 | Prof 77 | Pvt 78 | Rep 79 | Reps 80 | Res 81 | Rev 82 | Rt 83 | Sen 84 | Sens 85 | Sfc 86 | Sgt 87 | Sr 88 | St 89 | Supt 90 | Surg 91 | 92 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 93 | v 94 | vs 95 | i.e 96 | rev 97 | e.g 98 | 99 | #Numbers only. These should only induce breaks when followed by a numeric sequence 100 | # add NUMERIC_ONLY after the word for this function 101 | #This case is mostly for the english "No." 
which can either be a sentence of its own, or 102 | #if followed by a number, a non-breaking prefix 103 | No #NUMERIC_ONLY# 104 | Nos 105 | Art #NUMERIC_ONLY# 106 | Nr 107 | pp #NUMERIC_ONLY# 108 | 109 | #month abbreviations 110 | Jan 111 | Feb 112 | Mar 113 | Apr 114 | #May is a full word 115 | Jun 116 | Jul 117 | Aug 118 | Sep 119 | Oct 120 | Nov 121 | Dec 122 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.fi: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT 2 | #indicate an end-of-sentence marker. Special cases are included for prefixes 3 | #that ONLY appear before 0-9 numbers. 4 | 5 | #This list is compiled from omorfi database 6 | #by Tommi A Pirinen. 7 | 8 | 9 | #any single upper case letter followed by a period is not a sentence ender 10 | A 11 | B 12 | C 13 | D 14 | E 15 | F 16 | G 17 | H 18 | I 19 | J 20 | K 21 | L 22 | M 23 | N 24 | O 25 | P 26 | Q 27 | R 28 | S 29 | T 30 | U 31 | V 32 | W 33 | X 34 | Y 35 | Z 36 | Å 37 | Ä 38 | Ö 39 | 40 | #List of titles. 
These are often followed by upper-case names, but do not indicate sentence breaks 41 | alik 42 | alil 43 | amir 44 | apul 45 | apul.prof 46 | arkkit 47 | ass 48 | assist 49 | dipl 50 | dipl.arkkit 51 | dipl.ekon 52 | dipl.ins 53 | dipl.kielenk 54 | dipl.kirjeenv 55 | dipl.kosm 56 | dipl.urk 57 | dos 58 | erikoiseläinl 59 | erikoishammasl 60 | erikoisl 61 | erikoist 62 | ev.luutn 63 | evp 64 | fil 65 | ft 66 | hallinton 67 | hallintot 68 | hammaslääket 69 | jatk 70 | jääk 71 | kansaned 72 | kapt 73 | kapt.luutn 74 | kenr 75 | kenr.luutn 76 | kenr.maj 77 | kers 78 | kirjeenv 79 | kom 80 | kom.kapt 81 | komm 82 | konst 83 | korpr 84 | luutn 85 | maist 86 | maj 87 | Mr 88 | Mrs 89 | Ms 90 | M.Sc 91 | neuv 92 | nimim 93 | Ph.D 94 | prof 95 | puh.joht 96 | pääll 97 | res 98 | san 99 | siht 100 | suom 101 | sähköp 102 | säv 103 | toht 104 | toim 105 | toim.apul 106 | toim.joht 107 | toim.siht 108 | tuom 109 | ups 110 | vänr 111 | vääp 112 | ye.ups 113 | ylik 114 | ylil 115 | ylim 116 | ylimatr 117 | yliop 118 | yliopp 119 | ylip 120 | yliv 121 | 122 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall 123 | #into this category - it sometimes ends a sentence) 124 | e.g 125 | ent 126 | esim 127 | huom 128 | i.e 129 | ilm 130 | l 131 | mm 132 | myöh 133 | nk 134 | nyk 135 | par 136 | po 137 | t 138 | v 139 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.hu: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | Á 33 | É 34 | Í 35 | Ó 36 | Ö 37 | Ő 38 | Ú 39 | Ü 40 | Ű 41 | 42 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 43 | Dr 44 | dr 45 | kb 46 | Kb 47 | vö 48 | Vö 49 | pl 50 | Pl 51 | ca 52 | Ca 53 | min 54 | Min 55 | max 56 | Max 57 | ún 58 | Ún 59 | prof 60 | Prof 61 | de 62 | De 63 | du 64 | Du 65 | Szt 66 | St 67 | 68 | #Numbers only. These should only induce breaks when followed by a numeric sequence 69 | # add NUMERIC_ONLY after the word for this function 70 | #This case is mostly for the english "No." which can either be a sentence of its own, or 71 | #if followed by a number, a non-breaking prefix 72 | 73 | # Month name abbreviations 74 | jan #NUMERIC_ONLY# 75 | Jan #NUMERIC_ONLY# 76 | Feb #NUMERIC_ONLY# 77 | feb #NUMERIC_ONLY# 78 | márc #NUMERIC_ONLY# 79 | Márc #NUMERIC_ONLY# 80 | ápr #NUMERIC_ONLY# 81 | Ápr #NUMERIC_ONLY# 82 | máj #NUMERIC_ONLY# 83 | Máj #NUMERIC_ONLY# 84 | jún #NUMERIC_ONLY# 85 | Jún #NUMERIC_ONLY# 86 | Júl #NUMERIC_ONLY# 87 | júl #NUMERIC_ONLY# 88 | aug #NUMERIC_ONLY# 89 | Aug #NUMERIC_ONLY# 90 | Szept #NUMERIC_ONLY# 91 | szept #NUMERIC_ONLY# 92 | okt #NUMERIC_ONLY# 93 | Okt #NUMERIC_ONLY# 94 | nov #NUMERIC_ONLY# 95 | Nov #NUMERIC_ONLY# 96 | dec #NUMERIC_ONLY# 97 | Dec #NUMERIC_ONLY# 98 | 99 | # Other abbreviations 100 | tel #NUMERIC_ONLY# 101 | Tel #NUMERIC_ONLY# 102 | Fax #NUMERIC_ONLY# 103 | fax #NUMERIC_ONLY# 104 | -------------------------------------------------------------------------------- /programs/MergeAndSplit.java: -------------------------------------------------------------------------------- 1 | import 
java.io.*; 2 | 3 | public class MergeAndSplit { 4 | 5 | public static void main(String[] args) throws Exception { 6 | if (args.length < 4) { 7 | System.out.println("Usage: java MergeAndSplit type src trg merged"); 8 | return; 9 | } 10 | String type = args[0]; 11 | String srcFile = args[1]; 12 | String trgFile = args[2]; 13 | String mergedFile = args[3]; 14 | 15 | if (type.equals("merge")) { 16 | BufferedReader brSrc = new BufferedReader(new InputStreamReader(new FileInputStream(srcFile), "utf-8")); 17 | BufferedReader brTrg = new BufferedReader(new InputStreamReader(new FileInputStream(trgFile), "utf-8")); 18 | BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(mergedFile), "utf-8")); 19 | 20 | String src, trg; 21 | 22 | while ((src = brSrc.readLine()) != null) { 23 | trg = brTrg.readLine(); 24 | bw.write(src + " ||| " + trg + "\n"); 25 | } 26 | 27 | 28 | brSrc.close(); 29 | brTrg.close(); 30 | bw.close(); 31 | } else if (type.equals("split")) { 32 | BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(mergedFile), "utf-8")); 33 | BufferedWriter bwSrc = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(srcFile), "utf-8")); 34 | BufferedWriter bwTrg = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(trgFile), "utf-8")); 35 | String line; 36 | while ((line = br.readLine()) != null) { 37 | String[] tokens = line.trim().split(" \\|\\|\\| "); 38 | bwSrc.write(tokens[0] + "\n"); 39 | bwTrg.write(tokens[1] + "\n"); 40 | } 41 | 42 | br.close(); 43 | bwSrc.close(); 44 | bwTrg.close(); 45 | } else { 46 | System.out.println("Unrecognized type, which should be merge or split."); 47 | } 48 | 49 | } 50 | } 51 | 52 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.nl: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case 
word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | #Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen 4 | # http://nl.wikipedia.org/wiki/Aanspreekvorm 5 | # http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs 6 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 7 | #usually upper case letters are initials in a name 8 | A 9 | B 10 | C 11 | D 12 | E 13 | F 14 | G 15 | H 16 | I 17 | J 18 | K 19 | L 20 | M 21 | N 22 | O 23 | P 24 | Q 25 | R 26 | S 27 | T 28 | U 29 | V 30 | W 31 | X 32 | Y 33 | Z 34 | 35 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 36 | bacc 37 | bc 38 | bgen 39 | c.i 40 | dhr 41 | dr 42 | dr.h.c 43 | drs 44 | drs 45 | ds 46 | eint 47 | fa 48 | Fa 49 | fam 50 | gen 51 | genm 52 | ing 53 | ir 54 | jhr 55 | jkvr 56 | jr 57 | kand 58 | kol 59 | lgen 60 | lkol 61 | Lt 62 | maj 63 | Mej 64 | mevr 65 | Mme 66 | mr 67 | mr 68 | Mw 69 | o.b.s 70 | plv 71 | prof 72 | ritm 73 | tint 74 | Vz 75 | Z.D 76 | Z.D.H 77 | Z.E 78 | Z.Em 79 | Z.H 80 | Z.K.H 81 | Z.K.M 82 | Z.M 83 | z.v 84 | 85 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 86 | #we seem to have a lot of these in dutch i.e.: i.p.v - in plaats van (in stead of) never ends a sentence 87 | a.g.v 88 | bijv 89 | bijz 90 | bv 91 | d.w.z 92 | e.c 93 | e.g 94 | e.k 95 | ev 96 | i.p.v 97 | i.s.m 98 | i.t.t 99 | i.v.m 100 | m.a.w 101 | m.b.t 102 | m.b.v 103 | m.h.o 104 | m.i 105 | m.i.v 106 | v.w.t 107 | 108 | #Numbers only. These should only induce breaks when followed by a numeric sequence 109 | # add NUMERIC_ONLY after the word for this function 110 | #This case is mostly for the english "No." 
which can either be a sentence of its own, or 111 | #if followed by a number, a non-breaking prefix 112 | Nr #NUMERIC_ONLY# 113 | Nrs 114 | nrs 115 | nr #NUMERIC_ONLY# 116 | -------------------------------------------------------------------------------- /scripts/normalize-punctuation.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 5 | 6 | use warnings; 7 | use strict; 8 | 9 | my $language = "en"; 10 | my $PENN = 0; 11 | 12 | while (@ARGV) { 13 | $_ = shift; 14 | /^-b$/ && ($| = 1, next); # not buffered (flush each line) 15 | /^-l$/ && ($language = shift, next); 16 | /^[^\-]/ && ($language = $_, next); 17 | /^-penn$/ && ($PENN = 1, next); 18 | } 19 | 20 | while() { 21 | s/\r//g; 22 | # remove extra spaces 23 | s/\(/ \(/g; 24 | s/\)/\) /g; s/ +/ /g; 25 | s/\) ([\.\!\:\?\;\,])/\)$1/g; 26 | s/\( /\(/g; 27 | s/ \)/\)/g; 28 | s/(\d) \%/$1\%/g; 29 | s/ :/:/g; 30 | s/ ;/;/g; 31 | # normalize unicode punctuation 32 | if ($PENN == 0) { 33 | s/\`/\'/g; 34 | s/\'\'/ \" /g; 35 | } 36 | 37 | s/„/\"/g; 38 | s/“/\"/g; 39 | s/”/\"/g; 40 | s/–/-/g; 41 | s/—/ - /g; s/ +/ /g; 42 | s/´/\'/g; 43 | s/([a-z])‘([a-z])/$1\'$2/gi; 44 | s/([a-z])’([a-z])/$1\'$2/gi; 45 | s/‘/\"/g; 46 | s/‚/\"/g; 47 | s/’/\"/g; 48 | s/''/\"/g; 49 | s/´´/\"/g; 50 | s/…/.../g; 51 | # French quotes 52 | s/ « / \"/g; 53 | s/« /\"/g; 54 | s/«/\"/g; 55 | s/ » /\" /g; 56 | s/ »/\"/g; 57 | s/»/\"/g; 58 | # handle pseudo-spaces 59 | s/ \%/\%/g; 60 | s/nº /nº /g; 61 | s/ :/:/g; 62 | s/ ºC/ ºC/g; 63 | s/ cm/ cm/g; 64 | s/ \?/\?/g; 65 | s/ \!/\!/g; 66 | s/ ;/;/g; 67 | s/, /, /g; s/ +/ /g; 68 | 69 | # English "quotation," followed by comma, style 70 | if ($language eq "en") { 71 | s/\"([,\.]+)/$1\"/g; 72 | } 73 | # Czech is confused 74 | elsif ($language eq "cs" || $language eq "cz") { 75 | } 76 | # 
German/Spanish/French "quotation", followed by comma, style 77 | else { 78 | s/,\"/\",/g; 79 | s/(\.+)\"(\s*[^<])/\"$1$2/g; # don't fix period at end of sentence 80 | } 81 | 82 | 83 | if ($language eq "de" || $language eq "es" || $language eq "cz" || $language eq "cs" || $language eq "fr") { 84 | s/(\d) (\d)/$1,$2/g; 85 | } 86 | else { 87 | s/(\d) (\d)/$1.$2/g; 88 | } 89 | print $_; 90 | } 91 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.is: -------------------------------------------------------------------------------- 1 | no #NUMERIC_ONLY# 2 | No #NUMERIC_ONLY# 3 | nr #NUMERIC_ONLY# 4 | Nr #NUMERIC_ONLY# 5 | nR #NUMERIC_ONLY# 6 | NR #NUMERIC_ONLY# 7 | a 8 | b 9 | c 10 | d 11 | e 12 | f 13 | g 14 | h 15 | i 16 | j 17 | k 18 | l 19 | m 20 | n 21 | o 22 | p 23 | q 24 | r 25 | s 26 | t 27 | u 28 | v 29 | w 30 | x 31 | y 32 | z 33 | ^ 34 | í 35 | á 36 | ó 37 | æ 38 | A 39 | B 40 | C 41 | D 42 | E 43 | F 44 | G 45 | H 46 | I 47 | J 48 | K 49 | L 50 | M 51 | N 52 | O 53 | P 54 | Q 55 | R 56 | S 57 | T 58 | U 59 | V 60 | W 61 | X 62 | Y 63 | Z 64 | ab.fn 65 | a.fn 66 | afs 67 | al 68 | alm 69 | alg 70 | andh 71 | ath 72 | aths 73 | atr 74 | ao 75 | au 76 | aukaf 77 | áfn 78 | áhrl.s 79 | áhrs 80 | ákv.gr 81 | ákv 82 | bh 83 | bls 84 | dr 85 | e.Kr 86 | et 87 | ef 88 | efn 89 | ennfr 90 | eink 91 | end 92 | e.st 93 | erl 94 | fél 95 | fskj 96 | fh 97 | f.hl 98 | físl 99 | fl 100 | fn 101 | fo 102 | forl 103 | frb 104 | frl 105 | frh 106 | frt 107 | fsl 108 | fsh 109 | fs 110 | fsk 111 | fst 112 | f.Kr 113 | ft 114 | fv 115 | fyrrn 116 | fyrrv 117 | germ 118 | gm 119 | gr 120 | hdl 121 | hdr 122 | hf 123 | hl 124 | hlsk 125 | hljsk 126 | hljv 127 | hljóðv 128 | hr 129 | hv 130 | hvk 131 | holl 132 | Hos 133 | höf 134 | hk 135 | hrl 136 | ísl 137 | kaf 138 | kap 139 | Khöfn 140 | kk 141 | kg 142 | kk 143 | km 144 | kl 145 | klst 146 | kr 147 | kt 148 | kgúrsk 149 | kvk 150 | leturbr 151 | 
lh 152 | lh.nt 153 | lh.þt 154 | lo 155 | ltr 156 | mlja 157 | mljó 158 | millj 159 | mm 160 | mms 161 | m.fl 162 | miðm 163 | mgr 164 | mst 165 | mín 166 | nf 167 | nh 168 | nhm 169 | nl 170 | nk 171 | nmgr 172 | no 173 | núv 174 | nt 175 | o.áfr 176 | o.m.fl 177 | ohf 178 | o.fl 179 | o.s.frv 180 | ófn 181 | ób 182 | óákv.gr 183 | óákv 184 | pfn 185 | PR 186 | pr 187 | Ritstj 188 | Rvík 189 | Rvk 190 | samb 191 | samhlj 192 | samn 193 | samn 194 | sbr 195 | sek 196 | sérn 197 | sf 198 | sfn 199 | sh 200 | sfn 201 | sh 202 | s.hl 203 | sk 204 | skv 205 | sl 206 | sn 207 | so 208 | ss.us 209 | s.st 210 | samþ 211 | sbr 212 | shlj 213 | sign 214 | skál 215 | st 216 | st.s 217 | stk 218 | sþ 219 | teg 220 | tbl 221 | tfn 222 | tl 223 | tvíhlj 224 | tvt 225 | till 226 | to 227 | umr 228 | uh 229 | us 230 | uppl 231 | útg 232 | vb 233 | Vf 234 | vh 235 | vkf 236 | Vl 237 | vl 238 | vlf 239 | vmf 240 | 8vo 241 | vsk 242 | vth 243 | þt 244 | þf 245 | þjs 246 | þgf 247 | þlt 248 | þolm 249 | þm 250 | þml 251 | þýð 252 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.it: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | #List of titles. 
These are often followed by upper-case names, but do not indicate sentence breaks 34 | Adj 35 | Adm 36 | Adv 37 | Amn 38 | Arch 39 | Asst 40 | Avv 41 | Bart 42 | Bcc 43 | Bldg 44 | Brig 45 | Bros 46 | C.A.P 47 | C.P 48 | Capt 49 | Cc 50 | Cmdr 51 | Co 52 | Col 53 | Comdr 54 | Con 55 | Corp 56 | Cpl 57 | DR 58 | Dott 59 | Dr 60 | Drs 61 | Egr 62 | Ens 63 | Gen 64 | Geom 65 | Gov 66 | Hon 67 | Hosp 68 | Hr 69 | Id 70 | Ing 71 | Insp 72 | Lt 73 | MM 74 | MR 75 | MRS 76 | MS 77 | Maj 78 | Messrs 79 | Mlle 80 | Mme 81 | Mo 82 | Mons 83 | Mr 84 | Mrs 85 | Ms 86 | Msgr 87 | N.B 88 | Op 89 | Ord 90 | P.S 91 | P.T 92 | Pfc 93 | Ph 94 | Prof 95 | Pvt 96 | RP 97 | RSVP 98 | Rag 99 | Rep 100 | Reps 101 | Res 102 | Rev 103 | Rif 104 | Rt 105 | S.A 106 | S.B.F 107 | S.P.M 108 | S.p.A 109 | S.r.l 110 | Sen 111 | Sens 112 | Sfc 113 | Sgt 114 | Sig 115 | Sigg 116 | Soc 117 | Spett 118 | Sr 119 | St 120 | Supt 121 | Surg 122 | V.P 123 | 124 | # other 125 | a.c 126 | acc 127 | all 128 | banc 129 | c.a 130 | c.c.p 131 | c.m 132 | c.p 133 | c.s 134 | c.v 135 | corr 136 | dott 137 | e.p.c 138 | ecc 139 | es 140 | fatt 141 | gg 142 | int 143 | lett 144 | ogg 145 | on 146 | p.c 147 | p.c.c 148 | p.es 149 | p.f 150 | p.r 151 | p.v 152 | post 153 | pp 154 | racc 155 | ric 156 | s.n.c 157 | seg 158 | sgg 159 | ss 160 | tel 161 | u.s 162 | v.r 163 | v.s 164 | 165 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 166 | v 167 | vs 168 | i.e 169 | rev 170 | e.g 171 | 172 | #Numbers only. These should only induce breaks when followed by a numeric sequence 173 | # add NUMERIC_ONLY after the word for this function 174 | #This case is mostly for the english "No." 
which can either be a sentence of its own, or 175 | #if followed by a number, a non-breaking prefix 176 | No #NUMERIC_ONLY# 177 | Nos 178 | Art #NUMERIC_ONLY# 179 | Nr 180 | pp #NUMERIC_ONLY# 181 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.ru: -------------------------------------------------------------------------------- 1 | # added Cyrillic uppercase letters [А-Я] 2 | # removed 000D carriage return (this is not removed by chomp in tokenizer.perl, and prevents recognition of the prefixes) 3 | # edited by Kate Young (nspaceanalysis@earthlink.net) 21 May 2013 4 | А 5 | Б 6 | В 7 | Г 8 | Д 9 | Е 10 | Ж 11 | З 12 | И 13 | Й 14 | К 15 | Л 16 | М 17 | Н 18 | О 19 | П 20 | Р 21 | С 22 | Т 23 | У 24 | Ф 25 | Х 26 | Ц 27 | Ч 28 | Ш 29 | Щ 30 | Ъ 31 | Ы 32 | Ь 33 | Э 34 | Ю 35 | Я 36 | A 37 | B 38 | C 39 | D 40 | E 41 | F 42 | G 43 | H 44 | I 45 | J 46 | K 47 | L 48 | M 49 | N 50 | O 51 | P 52 | Q 53 | R 54 | S 55 | T 56 | U 57 | V 58 | W 59 | X 60 | Y 61 | Z 62 | 0гг 63 | 1гг 64 | 2гг 65 | 3гг 66 | 4гг 67 | 5гг 68 | 6гг 69 | 7гг 70 | 8гг 71 | 9гг 72 | 0г 73 | 1г 74 | 2г 75 | 3г 76 | 4г 77 | 5г 78 | 6г 79 | 7г 80 | 8г 81 | 9г 82 | Xвв 83 | Vвв 84 | Iвв 85 | Lвв 86 | Mвв 87 | Cвв 88 | Xв 89 | Vв 90 | Iв 91 | Lв 92 | Mв 93 | Cв 94 | 0м 95 | 1м 96 | 2м 97 | 3м 98 | 4м 99 | 5м 100 | 6м 101 | 7м 102 | 8м 103 | 9м 104 | 0мм 105 | 1мм 106 | 2мм 107 | 3мм 108 | 4мм 109 | 5мм 110 | 6мм 111 | 7мм 112 | 8мм 113 | 9мм 114 | 0см 115 | 1см 116 | 2см 117 | 3см 118 | 4см 119 | 5см 120 | 6см 121 | 7см 122 | 8см 123 | 9см 124 | 0дм 125 | 1дм 126 | 2дм 127 | 3дм 128 | 4дм 129 | 5дм 130 | 6дм 131 | 7дм 132 | 8дм 133 | 9дм 134 | 0л 135 | 1л 136 | 2л 137 | 3л 138 | 4л 139 | 5л 140 | 6л 141 | 7л 142 | 8л 143 | 9л 144 | 0км 145 | 1км 146 | 2км 147 | 3км 148 | 4км 149 | 5км 150 | 6км 151 | 7км 152 | 8км 153 | 9км 154 | 0га 155 | 1га 156 | 2га 157 | 3га 158 | 4га 159 | 5га 160 | 6га 161 | 7га 162 | 8га 163 | 9га 
164 | 0кг 165 | 1кг 166 | 2кг 167 | 3кг 168 | 4кг 169 | 5кг 170 | 6кг 171 | 7кг 172 | 8кг 173 | 9кг 174 | 0т 175 | 1т 176 | 2т 177 | 3т 178 | 4т 179 | 5т 180 | 6т 181 | 7т 182 | 8т 183 | 9т 184 | 0г 185 | 1г 186 | 2г 187 | 3г 188 | 4г 189 | 5г 190 | 6г 191 | 7г 192 | 8г 193 | 9г 194 | 0мг 195 | 1мг 196 | 2мг 197 | 3мг 198 | 4мг 199 | 5мг 200 | 6мг 201 | 7мг 202 | 8мг 203 | 9мг 204 | бульв 205 | в 206 | вв 207 | г 208 | га 209 | гг 210 | гл 211 | гос 212 | д 213 | дм 214 | доп 215 | др 216 | е 217 | ед 218 | ед 219 | зам 220 | и 221 | инд 222 | исп 223 | Исп 224 | к 225 | кап 226 | кг 227 | кв 228 | кл 229 | км 230 | кол 231 | комн 232 | коп 233 | куб 234 | л 235 | лиц 236 | лл 237 | м 238 | макс 239 | мг 240 | мин 241 | мл 242 | млн 243 | млрд 244 | мм 245 | н 246 | наб 247 | нач 248 | неуд 249 | ном 250 | о 251 | обл 252 | обр 253 | общ 254 | ок 255 | ост 256 | отл 257 | п 258 | пер 259 | перераб 260 | пл 261 | пос 262 | пр 263 | просп 264 | проф 265 | р 266 | ред 267 | руб 268 | с 269 | сб 270 | св 271 | см 272 | соч 273 | ср 274 | ст 275 | стр 276 | т 277 | тел 278 | Тел 279 | тех 280 | тт 281 | туп 282 | тыс 283 | уд 284 | ул 285 | уч 286 | физ 287 | х 288 | хор 289 | ч 290 | чел 291 | шт 292 | экз 293 | э 294 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.pl: -------------------------------------------------------------------------------- 1 | adw 2 | afr 3 | akad 4 | al 5 | Al 6 | am 7 | amer 8 | arch 9 | art 10 | Art 11 | artyst 12 | astr 13 | austr 14 | bałt 15 | bdb 16 | bł 17 | bm 18 | br 19 | bryg 20 | bryt 21 | centr 22 | ces 23 | chem 24 | chiń 25 | chir 26 | c.k 27 | c.o 28 | cyg 29 | cyw 30 | cyt 31 | czes 32 | czw 33 | cd 34 | Cd 35 | czyt 36 | ćw 37 | ćwicz 38 | daw 39 | dcn 40 | dekl 41 | demokr 42 | det 43 | diec 44 | dł 45 | dn 46 | dot 47 | dol 48 | dop 49 | dost 50 | dosł 51 | h.c 52 | ds 53 | dst 54 | duszp 55 | dypl 56 | egz 57 | ekol 58 | ekon 59 | elektr 60 
| em 61 | ew 62 | fab 63 | farm 64 | fot 65 | fr 66 | gat 67 | gastr 68 | geogr 69 | geol 70 | gimn 71 | głęb 72 | gm 73 | godz 74 | górn 75 | gosp 76 | gr 77 | gram 78 | hist 79 | hiszp 80 | hr 81 | Hr 82 | hot 83 | id 84 | in 85 | im 86 | iron 87 | jn 88 | kard 89 | kat 90 | katol 91 | k.k 92 | kk 93 | kol 94 | kl 95 | k.p.a 96 | kpc 97 | k.p.c 98 | kpt 99 | kr 100 | k.r 101 | krak 102 | k.r.o 103 | kryt 104 | kult 105 | laic 106 | łac 107 | niem 108 | woj 109 | nb 110 | np 111 | Nb 112 | Np 113 | pol 114 | pow 115 | m.in 116 | pt 117 | ps 118 | Pt 119 | Ps 120 | cdn 121 | jw 122 | ryc 123 | rys 124 | Ryc 125 | Rys 126 | tj 127 | tzw 128 | Tzw 129 | tzn 130 | zob 131 | ang 132 | ub 133 | ul 134 | pw 135 | pn 136 | pl 137 | al 138 | k 139 | n 140 | nr #NUMERIC_ONLY# 141 | Nr #NUMERIC_ONLY# 142 | ww 143 | wł 144 | ur 145 | zm 146 | żyd 147 | żarg 148 | żyw 149 | wył 150 | bp 151 | bp 152 | wyst 153 | tow 154 | Tow 155 | o 156 | sp 157 | Sp 158 | st 159 | spółdz 160 | Spółdz 161 | społ 162 | spółgł 163 | stoł 164 | stow 165 | Stoł 166 | Stow 167 | zn 168 | zew 169 | zewn 170 | zdr 171 | zazw 172 | zast 173 | zaw 174 | zał 175 | zal 176 | zam 177 | zak 178 | zakł 179 | zagr 180 | zach 181 | adw 182 | Adw 183 | lek 184 | Lek 185 | med 186 | mec 187 | Mec 188 | doc 189 | Doc 190 | dyw 191 | dyr 192 | Dyw 193 | Dyr 194 | inż 195 | Inż 196 | mgr 197 | Mgr 198 | dh 199 | dr 200 | Dh 201 | Dr 202 | p 203 | P 204 | red 205 | Red 206 | prof 207 | prok 208 | Prof 209 | Prok 210 | hab 211 | płk 212 | Płk 213 | nadkom 214 | Nadkom 215 | podkom 216 | Podkom 217 | ks 218 | Ks 219 | gen 220 | Gen 221 | por 222 | Por 223 | reż 224 | Reż 225 | przyp 226 | Przyp 227 | śp 228 | św 229 | śW 230 | Śp 231 | Św 232 | ŚW 233 | szer 234 | Szer 235 | pkt #NUMERIC_ONLY# 236 | str #NUMERIC_ONLY# 237 | tab #NUMERIC_ONLY# 238 | Tab #NUMERIC_ONLY# 239 | tel 240 | ust #NUMERIC_ONLY# 241 | par #NUMERIC_ONLY# 242 | poz 243 | pok 244 | oo 245 | oO 246 | Oo 247 | OO 248 | r #NUMERIC_ONLY# 249 | l 
#NUMERIC_ONLY# 250 | s #NUMERIC_ONLY# 251 | najśw 252 | Najśw 253 | A 254 | B 255 | C 256 | D 257 | E 258 | F 259 | G 260 | H 261 | I 262 | J 263 | K 264 | L 265 | M 266 | N 267 | O 268 | P 269 | Q 270 | R 271 | S 272 | T 273 | U 274 | V 275 | W 276 | X 277 | Y 278 | Z 279 | Ś 280 | Ć 281 | Ż 282 | Ź 283 | Dz 284 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.pt: -------------------------------------------------------------------------------- 1 | #File adapted for PT by H. Leal Fontes from the EN & DE versions published with moses-2009-04-13. Last update: 10.11.2009. 2 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 3 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 4 | 5 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 6 | #usually upper case letters are initials in a name 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | 61 | #Roman Numerals. A dot after one of these is not a sentence break in Portuguese. 62 | I 63 | II 64 | III 65 | IV 66 | V 67 | VI 68 | VII 69 | VIII 70 | IX 71 | X 72 | XI 73 | XII 74 | XIII 75 | XIV 76 | XV 77 | XVI 78 | XVII 79 | XVIII 80 | XIX 81 | XX 82 | i 83 | ii 84 | iii 85 | iv 86 | v 87 | vi 88 | vii 89 | viii 90 | ix 91 | x 92 | xi 93 | xii 94 | xiii 95 | xiv 96 | xv 97 | xvi 98 | xvii 99 | xviii 100 | xix 101 | xx 102 | 103 | #List of titles. 
These are often followed by upper-case names, but do not indicate sentence breaks 104 | Adj 105 | Adm 106 | Adv 107 | Art 108 | Ca 109 | Capt 110 | Cmdr 111 | Col 112 | Comdr 113 | Con 114 | Corp 115 | Cpl 116 | DR 117 | DRA 118 | Dr 119 | Dra 120 | Dras 121 | Drs 122 | Eng 123 | Enga 124 | Engas 125 | Engos 126 | Ex 127 | Exo 128 | Exmo 129 | Fig 130 | Gen 131 | Hosp 132 | Insp 133 | Lda 134 | MM 135 | MR 136 | MRS 137 | MS 138 | Maj 139 | Mrs 140 | Ms 141 | Msgr 142 | Op 143 | Ord 144 | Pfc 145 | Ph 146 | Prof 147 | Pvt 148 | Rep 149 | Reps 150 | Res 151 | Rev 152 | Rt 153 | Sen 154 | Sens 155 | Sfc 156 | Sgt 157 | Sr 158 | Sra 159 | Sras 160 | Srs 161 | Sto 162 | Supt 163 | Surg 164 | adj 165 | adm 166 | adv 167 | art 168 | cit 169 | col 170 | con 171 | corp 172 | cpl 173 | dr 174 | dra 175 | dras 176 | drs 177 | eng 178 | enga 179 | engas 180 | engos 181 | ex 182 | exo 183 | exmo 184 | fig 185 | op 186 | prof 187 | sr 188 | sra 189 | sras 190 | srs 191 | sto 192 | 193 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 194 | v 195 | vs 196 | i.e 197 | rev 198 | e.g 199 | 200 | #Numbers only. These should only induce breaks when followed by a numeric sequence 201 | # add NUMERIC_ONLY after the word for this function 202 | #This case is mostly for the english "No." which can either be a sentence of its own, or 203 | #if followed by a number, a non-breaking prefix 204 | No #NUMERIC_ONLY# 205 | Nos 206 | Art #NUMERIC_ONLY# 207 | Nr 208 | p #NUMERIC_ONLY# 209 | pp #NUMERIC_ONLY# 210 | 211 | -------------------------------------------------------------------------------- /bpe/generate_vocab.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # Copyright 2017 Google Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | #pylint: disable=invalid-name 17 | """ 18 | Generate vocabulary for a tokenized text file. 19 | """ 20 | 21 | import sys 22 | import argparse 23 | import collections 24 | import logging 25 | 26 | parser = argparse.ArgumentParser( 27 | description="Generate vocabulary for a tokenized text file.") 28 | parser.add_argument( 29 | "--min_frequency", 30 | dest="min_frequency", 31 | type=int, 32 | default=0, 33 | help="Minimum frequency of a word to be included in the vocabulary.") 34 | parser.add_argument( 35 | "--max_vocab_size", 36 | dest="max_vocab_size", 37 | type=int, 38 | help="Maximum number of tokens in the vocabulary") 39 | parser.add_argument( 40 | "--downcase", 41 | dest="downcase", 42 | type=bool, 43 | help="If set to true, downcase all text before processing.", 44 | default=False) 45 | parser.add_argument( 46 | "infile", 47 | nargs="?", 48 | type=argparse.FileType("r"), 49 | default=sys.stdin, 50 | help="Input tokenized text file to be processed.") 51 | parser.add_argument( 52 | "--delimiter", 53 | dest="delimiter", 54 | type=str, 55 | default=" ", 56 | help="Delimiter character for tokenizing. Use \" \" and \"\" for word and char level respectively." 
57 | ) 58 | args = parser.parse_args() 59 | 60 | # Counter for all tokens in the vocabulary 61 | cnt = collections.Counter() 62 | 63 | for line in args.infile: 64 | if args.downcase: 65 | line = line.lower() 66 | if args.delimiter == "": 67 | tokens = list(line.strip()) 68 | else: 69 | tokens = line.strip().split(args.delimiter) 70 | tokens = [_ for _ in tokens if len(_) > 0] 71 | cnt.update(tokens) 72 | 73 | logging.info("Found %d unique tokens in the vocabulary.", len(cnt)) 74 | 75 | # Filter tokens below the frequency threshold 76 | if args.min_frequency > 0: 77 | filtered_tokens = [(w, c) for w, c in cnt.most_common() 78 | if c >= args.min_frequency] 79 | cnt = collections.Counter(dict(filtered_tokens)) 80 | 81 | logging.info("Found %d unique tokens with frequency > %d.", 82 | len(cnt), args.min_frequency) 83 | 84 | # Sort tokens by 1. frequency 2. lexically to break ties 85 | word_with_counts = cnt.most_common() 86 | word_with_counts = sorted( 87 | word_with_counts, key=lambda x: (x[1], x[0]), reverse=True) 88 | 89 | # Take only max-vocab 90 | if args.max_vocab_size is not None: 91 | word_with_counts = word_with_counts[:args.max_vocab_size] 92 | 93 | for word, count in word_with_counts: 94 | print("{}\t{}".format(word, count)) 95 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.ta: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | அ 7 | ஆ 8 | இ 9 | ஈ 10 | உ 11 | ஊ 12 | எ 13 | ஏ 14 | ஐ 15 | ஒ 16 | ஓ 17 | ஔ 18 | ஃ 19 | க 20 | கா 21 | கி 22 | கீ 23 | கு 24 | கூ 25 | கெ 26 | கே 27 | கை 28 | கொ 29 | கோ 30 | கௌ 31 | க் 32 | ச 33 | சா 34 | சி 35 | சீ 36 | சு 37 | சூ 38 | செ 39 | சே 40 | சை 41 | சொ 42 | சோ 43 | சௌ 44 | ச் 45 | ட 46 | டா 47 | டி 48 | டீ 49 | டு 50 | டூ 51 | டெ 52 | டே 53 | டை 54 | டொ 55 | டோ 56 | டௌ 57 | ட் 58 | த 59 | தா 60 | தி 61 | தீ 62 | து 63 | தூ 64 | தெ 65 | தே 66 | தை 67 | தொ 68 | தோ 69 | தௌ 70 | த் 71 | ப 72 | பா 73 | பி 74 | பீ 75 | பு 76 | பூ 77 | பெ 78 | பே 79 | பை 80 | பொ 81 | போ 82 | பௌ 83 | ப் 84 | ற 85 | றா 86 | றி 87 | றீ 88 | று 89 | றூ 90 | றெ 91 | றே 92 | றை 93 | றொ 94 | றோ 95 | றௌ 96 | ற் 97 | ய 98 | யா 99 | யி 100 | யீ 101 | யு 102 | யூ 103 | யெ 104 | யே 105 | யை 106 | யொ 107 | யோ 108 | யௌ 109 | ய் 110 | ர 111 | ரா 112 | ரி 113 | ரீ 114 | ரு 115 | ரூ 116 | ரெ 117 | ரே 118 | ரை 119 | ரொ 120 | ரோ 121 | ரௌ 122 | ர் 123 | ல 124 | லா 125 | லி 126 | லீ 127 | லு 128 | லூ 129 | லெ 130 | லே 131 | லை 132 | லொ 133 | லோ 134 | லௌ 135 | ல் 136 | வ 137 | வா 138 | வி 139 | வீ 140 | வு 141 | வூ 142 | வெ 143 | வே 144 | வை 145 | வொ 146 | வோ 147 | வௌ 148 | வ் 149 | ள 150 | ளா 151 | ளி 152 | ளீ 153 | ளு 154 | ளூ 155 | ளெ 156 | ளே 157 | ளை 158 | ளொ 159 | ளோ 160 | ளௌ 161 | ள் 162 | ழ 163 | ழா 164 | ழி 165 | ழீ 166 | ழு 167 | ழூ 168 | ழெ 169 | ழே 170 | ழை 171 | ழொ 172 | ழோ 173 | ழௌ 174 | ழ் 175 | ங 176 | ஙா 177 | ஙி 178 | ஙீ 179 | ஙு 180 | ஙூ 181 | ஙெ 182 | ஙே 183 | ஙை 184 | ஙொ 185 | ஙோ 186 | ஙௌ 187 | ங் 188 | ஞ 189 | ஞா 190 | ஞி 191 | ஞீ 192 | ஞு 193 | ஞூ 194 | ஞெ 195 | ஞே 196 | ஞை 197 | ஞொ 198 | ஞோ 199 | ஞௌ 200 | ஞ் 201 | ண 202 | ணா 203 | ணி 204 | ணீ 205 | ணு 206 | ணூ 207 | ணெ 208 | ணே 209 | ணை 210 | ணொ 211 | ணோ 212 | ணௌ 213 | ண் 214 | ந 215 | நா 216 | நி 217 | நீ 218 | நு 219 | நூ 220 | நெ 221 | 
நே 222 | நை 223 | நொ 224 | நோ 225 | நௌ 226 | ந் 227 | ம 228 | மா 229 | மி 230 | மீ 231 | மு 232 | மூ 233 | மெ 234 | மே 235 | மை 236 | மொ 237 | மோ 238 | மௌ 239 | ம் 240 | ன 241 | னா 242 | னி 243 | னீ 244 | னு 245 | னூ 246 | னெ 247 | னே 248 | னை 249 | னொ 250 | னோ 251 | னௌ 252 | ன் 253 | 254 | 255 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 256 | திரு 257 | திருமதி 258 | வண 259 | கௌரவ 260 | 261 | 262 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 263 | உ.ம் 264 | #கா.ம் 265 | #எ.ம் 266 | 267 | 268 | #Numbers only. These should only induce breaks when followed by a numeric sequence 269 | # add NUMERIC_ONLY after the word for this function 270 | #This case is mostly for the english "No." which can either be a sentence of its own, or 271 | #if followed by a number, a non-breaking prefix 272 | No #NUMERIC_ONLY# 273 | Nos 274 | Art #NUMERIC_ONLY# 275 | Nr 276 | pp #NUMERIC_ONLY# 277 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.de: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | #no german words end in single lower-case letters, so we throw those in too. 
7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | 61 | #Roman Numerals. A dot after one of these is not a sentence break in German. 62 | I 63 | II 64 | III 65 | IV 66 | V 67 | VI 68 | VII 69 | VIII 70 | IX 71 | X 72 | XI 73 | XII 74 | XIII 75 | XIV 76 | XV 77 | XVI 78 | XVII 79 | XVIII 80 | XIX 81 | XX 82 | i 83 | ii 84 | iii 85 | iv 86 | v 87 | vi 88 | vii 89 | viii 90 | ix 91 | x 92 | xi 93 | xii 94 | xiii 95 | xiv 96 | xv 97 | xvi 98 | xvii 99 | xviii 100 | xix 101 | xx 102 | 103 | #Titles and Honorifics 104 | Adj 105 | Adm 106 | Adv 107 | Asst 108 | Bart 109 | Bldg 110 | Brig 111 | Bros 112 | Capt 113 | Cmdr 114 | Col 115 | Comdr 116 | Con 117 | Corp 118 | Cpl 119 | DR 120 | Dr 121 | Ens 122 | Gen 123 | Gov 124 | Hon 125 | Hosp 126 | Insp 127 | Lt 128 | MM 129 | MR 130 | MRS 131 | MS 132 | Maj 133 | Messrs 134 | Mlle 135 | Mme 136 | Mr 137 | Mrs 138 | Ms 139 | Msgr 140 | Op 141 | Ord 142 | Pfc 143 | Ph 144 | Prof 145 | Pvt 146 | Rep 147 | Reps 148 | Res 149 | Rev 150 | Rt 151 | Sen 152 | Sens 153 | Sfc 154 | Sgt 155 | Sr 156 | St 157 | Supt 158 | Surg 159 | 160 | #Misc symbols 161 | Mio 162 | Mrd 163 | bzw 164 | v 165 | vs 166 | usw 167 | d.h 168 | z.B 169 | u.a 170 | etc 171 | Mrd 172 | MwSt 173 | ggf 174 | d.J 175 | D.h 176 | m.E 177 | vgl 178 | I.F 179 | z.T 180 | sogen 181 | ff 182 | u.E 183 | g.U 184 | g.g.A 185 | c.-à-d 186 | Buchst 187 | u.s.w 188 | sog 189 | u.ä 190 | Std 191 | evtl 192 | Zt 193 | Chr 194 | u.U 195 | o.ä 196 | Ltd 197 | b.A 198 | z.Zt 199 | spp 200 | sen 201 | SA 202 | k.o 203 | jun 204 | i.H.v 205 | dgl 206 | dergl 207 | Co 208 | zzt 209 | usf 210 | s.p.a 211 | Dkr 212 | Corp 213 | bzgl 214 | BSE 215 | 216 | #Number 
indicators 217 | # add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it 218 | No 219 | Nos 220 | Art 221 | Nr 222 | pp 223 | ca 224 | Ca 225 | 226 | #Ordinals are done with . in German - "1." = "1st" in English 227 | 1 228 | 2 229 | 3 230 | 4 231 | 5 232 | 6 233 | 7 234 | 8 235 | 9 236 | 10 237 | 11 238 | 12 239 | 13 240 | 14 241 | 15 242 | 16 243 | 17 244 | 18 245 | 19 246 | 20 247 | 21 248 | 22 249 | 23 250 | 24 251 | 25 252 | 26 253 | 27 254 | 28 255 | 29 256 | 30 257 | 31 258 | 32 259 | 33 260 | 34 261 | 35 262 | 36 263 | 37 264 | 38 265 | 39 266 | 40 267 | 41 268 | 42 269 | 43 270 | 44 271 | 45 272 | 46 273 | 47 274 | 48 275 | 49 276 | 50 277 | 51 278 | 52 279 | 53 280 | 54 281 | 55 282 | 56 283 | 57 284 | 58 285 | 59 286 | 60 287 | 61 288 | 62 289 | 63 290 | 64 291 | 65 292 | 66 293 | 67 294 | 68 295 | 69 296 | 70 297 | 71 298 | 72 299 | 73 300 | 74 301 | 75 302 | 76 303 | 77 304 | 78 305 | 79 306 | 80 307 | 81 308 | 82 309 | 83 310 | 84 311 | 85 312 | 86 313 | 87 314 | 88 315 | 89 316 | 90 317 | 91 318 | 92 319 | 93 320 | 94 321 | 95 322 | 96 323 | 97 324 | 98 325 | 99 326 | -------------------------------------------------------------------------------- /programs/SpecialSentRemoverENDE.java: -------------------------------------------------------------------------------- 1 | import java.io.*; 2 | 3 | public class SpecialSentRemoverENDE { 4 | 5 | public static boolean isNumber(String word) { 6 | try { 7 | Double.parseDouble(word); 8 | } catch (Exception e) { 9 | return false; 10 | } 11 | return true; 12 | } 13 | 14 | public static boolean isCommonWord(String word) { 15 | for (int i = 0; i < word.length(); ++i) { 16 | char c; 17 | if (i == 0) { 18 | c = word.toLowerCase().charAt(0); 19 | } else { 20 | c = word.charAt(i); 21 | } 22 | if (c <= 'z' && c >= 'a') { 23 | continue; 24 | } else { 25 | return false; 26 | } 27 | } 28 | return true; 29 | } 30 | 31 | public static boolean isCapital(String word) { 32 | for (int i 
= 0; i < word.length(); ++i) { 33 | if (word.charAt(i) >= 'A' && word.charAt(i) <= 'Z') { 34 | continue; 35 | } else { 36 | return false; 37 | } 38 | } 39 | return true; 40 | } 41 | 42 | public static boolean isAlmostNumberCap(String sentence) { 43 | String[] tokens = sentence.trim().split(" "); 44 | int cnt = 0; 45 | for (String word : tokens) { 46 | if (isNumber(word) || isCapital(word)) ++cnt; 47 | } 48 | if ((double) cnt / (double) tokens.length >= 0.5) { 49 | return true; 50 | } 51 | return false; 52 | } 53 | 54 | public static boolean isUglySentence(String sentence) { 55 | String[] tokens = sentence.trim().split(" "); 56 | int cnt = 0; 57 | for (String word : tokens) { 58 | if (!isCommonWord(word)) ++cnt; 59 | } 60 | if ((double) cnt / (double) tokens.length >=0.5) { 61 | return true; 62 | } 63 | return false; 64 | } 65 | 66 | public static void main(String[] args) throws Exception { 67 | if (args.length < 5) { 68 | System.out.println("Usage: java SpecialSentRemoverENDE en de en_output de_output removed_to_file"); 69 | return; 70 | } 71 | String enFile = args[0]; 72 | String deFile = args[1]; 73 | String enOutputFile = args[2]; 74 | String deOutputFile = args[3]; 75 | String removedOutputFile = args[4]; 76 | 77 | BufferedReader brEn = new BufferedReader(new InputStreamReader(new FileInputStream(enFile), "utf-8")); 78 | BufferedReader brDe = new BufferedReader(new InputStreamReader(new FileInputStream(deFile), "utf-8")); 79 | BufferedWriter bwEn = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(enOutputFile), "utf-8")); 80 | BufferedWriter bwDe = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(deOutputFile), "utf-8")); 81 | BufferedWriter bwRem = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(removedOutputFile), "utf-8")); 82 | 83 | String en, de; 84 | 85 | while ((en = brEn.readLine()) != null) { 86 | de = brDe.readLine(); 87 | if (isUglySentence(en) || isAlmostNumberCap(de)) { 88 | bwRem.write(en + " ||| " + de + 
"\n"); 89 | } else { 90 | bwEn.write(en + "\n"); 91 | bwDe.write(de + "\n"); 92 | } 93 | } 94 | 95 | brEn.close(); 96 | brDe.close(); 97 | bwEn.close(); 98 | bwDe.close(); 99 | bwRem.close(); 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /scripts/truecase.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 5 | 6 | # $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $ 7 | 8 | use warnings; 9 | use strict; 10 | use Getopt::Long "GetOptions"; 11 | 12 | binmode(STDIN, ":utf8"); 13 | binmode(STDOUT, ":utf8"); 14 | 15 | # apply switches 16 | # ASR input has no case, make sure it is lowercase, and make sure known are cased eg. 'i' to be uppercased even if i is known 17 | my ($MODEL, $UNBUFFERED, $ASR); 18 | die("truecase.perl --model MODEL [-b] [-a] < in > out") 19 | unless &GetOptions('model=s' => \$MODEL,'b|unbuffered' => \$UNBUFFERED, 'a|asr' => \$ASR) 20 | && defined($MODEL); 21 | if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; } 22 | my $asr = 0; 23 | if (defined($ASR) && $ASR) { $asr = 1; } 24 | 25 | my (%BEST,%KNOWN); 26 | open(MODEL,$MODEL) || die("ERROR: could not open '$MODEL'"); 27 | binmode(MODEL, ":utf8"); 28 | while() { 29 | my ($word,@OPTIONS) = split; 30 | $BEST{ lc($word) } = $word; 31 | if ($asr == 0) { 32 | $KNOWN{ $word } = 1; 33 | for(my $i=1;$i<$#OPTIONS;$i+=2) { 34 | $KNOWN{ $OPTIONS[$i] } = 1; 35 | } 36 | } 37 | } 38 | close(MODEL); 39 | 40 | my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1); 41 | my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"'"=>1,"""=>1,"["=>1,"]"=>1); 42 | 43 | while() { 44 | chop; 45 | my ($WORD,$MARKUP) = split_xml($_); 46 | my $sentence_start = 1; 47 | for(my $i=0;$i<=$#$WORD;$i++) { 48 | print " " if $i && $$MARKUP[$i] eq ''; 49 | 
print $$MARKUP[$i]; 50 | 51 | my ($word,$otherfactors); 52 | if ($$WORD[$i] =~ /^([^\|]+)(.*)/) 53 | { 54 | $word = $1; 55 | $otherfactors = $2; 56 | } 57 | else 58 | { 59 | $word = $$WORD[$i]; 60 | $otherfactors = ""; 61 | } 62 | if ($asr){ 63 | $word = lc($word); #make sure ASR output is not uc 64 | } 65 | 66 | if ($sentence_start && defined($BEST{lc($word)})) { 67 | print $BEST{lc($word)}; # truecase sentence start 68 | } 69 | elsif (defined($KNOWN{$word})) { 70 | print $word; # don't change known words 71 | } 72 | elsif (defined($BEST{lc($word)})) { 73 | print $BEST{lc($word)}; # truecase otherwise unknown words 74 | } 75 | else { 76 | print $word; # unknown, nothing to do 77 | } 78 | print $otherfactors; 79 | 80 | if ( defined($SENTENCE_END{ $word })) { $sentence_start = 1; } 81 | elsif (!defined($DELAYED_SENTENCE_START{ $word })) { $sentence_start = 0; } 82 | } 83 | print $$MARKUP[$#$MARKUP]; 84 | print "\n"; 85 | } 86 | 87 | # store away xml markup 88 | sub split_xml { 89 | my ($line) = @_; 90 | my (@WORD,@MARKUP); 91 | my $i = 0; 92 | $MARKUP[0] = ""; 93 | while($line =~ /\S/) { 94 | # XML tag 95 | if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) { 96 | my $potential_xml = $1; 97 | my $line_next = $2; 98 | # exception for factor that is an XML tag 99 | if ($line =~ /^\S/ && scalar(@WORD)>0 && $WORD[$i-1] =~ /\|$/) { 100 | $WORD[$i-1] .= $potential_xml; 101 | if ($line_next =~ /^(\|+)(.*)$/) { 102 | $WORD[$i-1] .= $1; 103 | $line_next = $2; 104 | } 105 | } 106 | else { 107 | $MARKUP[$i] .= $potential_xml." "; 108 | } 109 | $line = $line_next; 110 | } 111 | # non-XML text 112 | elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) { 113 | $WORD[$i++] = $1; 114 | $MARKUP[$i] = ""; 115 | $line = $2; 116 | } 117 | # '<' or '>' occurs in word, but it's not an XML tag 118 | elsif ($line =~ /^\s*(\S+)(.*)$/) { 119 | $WORD[$i++] = $1; 120 | $MARKUP[$i] = ""; 121 | $line = $2; 122 | } 123 | else { 124 | die("ERROR: huh? 
$line\n"); 125 | } 126 | } 127 | chop($MARKUP[$#MARKUP]); 128 | return (\@WORD,\@MARKUP); 129 | } 130 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.cs: -------------------------------------------------------------------------------- 1 | Bc 2 | BcA 3 | Ing 4 | Ing.arch 5 | MUDr 6 | MVDr 7 | MgA 8 | Mgr 9 | JUDr 10 | PhDr 11 | RNDr 12 | PharmDr 13 | ThLic 14 | ThDr 15 | Ph.D 16 | Th.D 17 | prof 18 | doc 19 | CSc 20 | DrSc 21 | dr. h. c 22 | PaedDr 23 | Dr 24 | PhMr 25 | DiS 26 | abt 27 | ad 28 | a.i 29 | aj 30 | angl 31 | anon 32 | apod 33 | atd 34 | atp 35 | aut 36 | bd 37 | biogr 38 | b.m 39 | b.p 40 | b.r 41 | cca 42 | cit 43 | cizojaz 44 | c.k 45 | col 46 | čes 47 | čín 48 | čj 49 | ed 50 | facs 51 | fasc 52 | fol 53 | fot 54 | franc 55 | h.c 56 | hist 57 | hl 58 | hrsg 59 | ibid 60 | il 61 | ind 62 | inv.č 63 | jap 64 | jhdt 65 | jv 66 | koed 67 | kol 68 | korej 69 | kl 70 | krit 71 | lat 72 | lit 73 | m.a 74 | maď 75 | mj 76 | mp 77 | násl 78 | např 79 | nepubl 80 | něm 81 | no 82 | nr 83 | n.s 84 | okr 85 | odd 86 | odp 87 | obr 88 | opr 89 | orig 90 | phil 91 | pl 92 | pokrač 93 | pol 94 | port 95 | pozn 96 | př.kr 97 | př.n.l 98 | přel 99 | přeprac 100 | příl 101 | pseud 102 | pt 103 | red 104 | repr 105 | resp 106 | revid 107 | rkp 108 | roč 109 | roz 110 | rozš 111 | samost 112 | sect 113 | sest 114 | seš 115 | sign 116 | sl 117 | srv 118 | stol 119 | sv 120 | šk 121 | šk.ro 122 | špan 123 | tab 124 | t.č 125 | tis 126 | tj 127 | tř 128 | tzv 129 | univ 130 | uspoř 131 | vol 132 | vl.jm 133 | vs 134 | vyd 135 | vyobr 136 | zal 137 | zejm 138 | zkr 139 | zprac 140 | zvl 141 | n.p 142 | např 143 | než 144 | MUDr 145 | abl 146 | absol 147 | adj 148 | adv 149 | ak 150 | ak. 
sl 151 | akt 152 | alch 153 | amer 154 | anat 155 | angl 156 | anglosas 157 | arab 158 | arch 159 | archit 160 | arg 161 | astr 162 | astrol 163 | att 164 | bás 165 | belg 166 | bibl 167 | biol 168 | boh 169 | bot 170 | bulh 171 | círk 172 | csl 173 | č 174 | čas 175 | čes 176 | dat 177 | děj 178 | dep 179 | dět 180 | dial 181 | dór 182 | dopr 183 | dosl 184 | ekon 185 | epic 186 | etnonym 187 | eufem 188 | f 189 | fam 190 | fem 191 | fil 192 | film 193 | form 194 | fot 195 | fr 196 | fut 197 | fyz 198 | gen 199 | geogr 200 | geol 201 | geom 202 | germ 203 | gram 204 | hebr 205 | herald 206 | hist 207 | hl 208 | hovor 209 | hud 210 | hut 211 | chcsl 212 | chem 213 | ie 214 | imp 215 | impf 216 | ind 217 | indoevr 218 | inf 219 | instr 220 | interj 221 | ión 222 | iron 223 | it 224 | kanad 225 | katalán 226 | klas 227 | kniž 228 | komp 229 | konj 230 | 231 | konkr 232 | kř 233 | kuch 234 | lat 235 | lék 236 | les 237 | lid 238 | lit 239 | liturg 240 | lok 241 | log 242 | m 243 | mat 244 | meteor 245 | metr 246 | mod 247 | ms 248 | mysl 249 | n 250 | náb 251 | námoř 252 | neklas 253 | něm 254 | nesklon 255 | nom 256 | ob 257 | obch 258 | obyč 259 | ojed 260 | opt 261 | part 262 | pas 263 | pejor 264 | pers 265 | pf 266 | pl 267 | plpf 268 | 269 | práv 270 | prep 271 | předl 272 | přivl 273 | r 274 | rcsl 275 | refl 276 | reg 277 | rkp 278 | ř 279 | řec 280 | s 281 | samohl 282 | sg 283 | sl 284 | souhl 285 | spec 286 | srov 287 | stfr 288 | střv 289 | stsl 290 | subj 291 | subst 292 | superl 293 | sv 294 | sz 295 | táz 296 | tech 297 | telev 298 | teol 299 | trans 300 | typogr 301 | var 302 | vedl 303 | verb 304 | vl. 
jm 305 | voj 306 | vok 307 | vůb 308 | vulg 309 | výtv 310 | vztaž 311 | zahr 312 | zájm 313 | zast 314 | zejm 315 | 316 | zeměd 317 | zkr 318 | zř 319 | mj 320 | dl 321 | atp 322 | sport 323 | Mgr 324 | horn 325 | MVDr 326 | JUDr 327 | RSDr 328 | Bc 329 | PhDr 330 | ThDr 331 | Ing 332 | aj 333 | apod 334 | PharmDr 335 | pomn 336 | ev 337 | slang 338 | nprap 339 | odp 340 | dop 341 | pol 342 | st 343 | stol 344 | p. n. l 345 | před n. l 346 | n. l 347 | př. Kr 348 | po Kr 349 | př. n. l 350 | odd 351 | RNDr 352 | tzv 353 | atd 354 | tzn 355 | resp 356 | tj 357 | p 358 | br 359 | č. j 360 | čj 361 | č. p 362 | čp 363 | a. s 364 | s. r. o 365 | spol. s r. o 366 | p. o 367 | s. p 368 | v. o. s 369 | k. s 370 | o. p. s 371 | o. s 372 | v. r 373 | v z 374 | ml 375 | vč 376 | kr 377 | mld 378 | hod 379 | popř 380 | ap 381 | event 382 | rus 383 | slov 384 | rum 385 | švýc 386 | P. T 387 | zvl 388 | hor 389 | dol 390 | S.O.S -------------------------------------------------------------------------------- /programs/ChineseSpecialRemover.java: -------------------------------------------------------------------------------- 1 | import java.io.*; 2 | 3 | public class ChineseSpecialRemover { 4 | 5 | public static boolean isCommonWord(String word) { 6 | for (int i = 0; i < word.length(); ++i) { 7 | char c; 8 | if (i == 0) { 9 | c = word.toLowerCase().charAt(0); 10 | } else { 11 | c = word.charAt(i); 12 | } 13 | if (c <= 'z' && c >= 'a') { 14 | continue; 15 | } else { 16 | return false; 17 | } 18 | } 19 | return true; 20 | } 21 | 22 | public static boolean isNumber(String word) { 23 | try { 24 | Double.parseDouble(word); 25 | } catch (Exception e) { 26 | return false; 27 | } 28 | return true; 29 | } 30 | 31 | public static boolean isUglySentence(String sentence) { 32 | String[] tokens = sentence.trim().split(" "); 33 | int cnt = 0; 34 | for (String word : tokens) { 35 | if (!isCommonWord(word)) ++cnt; 36 | } 37 | if ((double) cnt / (double) tokens.length >=0.5) { 38 | return 
true; 39 | } 40 | return false; 41 | } 42 | 43 | public static boolean isAlmostAscii(String sentence) { 44 | String[] tokens = sentence.trim().split(" "); 45 | int cnt = 0; 46 | for (String word : tokens) { 47 | if (isNumber(word) || isCommonWord(word)) ++cnt; 48 | } 49 | if ((double) cnt / (double) tokens.length >= 0.4) { 50 | return true; 51 | } 52 | return false; 53 | } 54 | 55 | public static void main(String[] args) throws Exception { 56 | if (args.length < 8) { 57 | System.out.println("Usage: java LenRatioRemover zh zh_char en zhchar_div_en_max zhchar_div_en_min zh_output en_output removed_to_file"); 58 | return; 59 | } 60 | String srcFile = args[0]; 61 | String srcCharFile = args[1]; 62 | String trgFile = args[2]; 63 | double maxRatio = Double.parseDouble(args[3]); 64 | double minRatio = Double.parseDouble(args[4]); 65 | String srcOutputFile = args[5]; 66 | String trgOutputFile = args[6]; 67 | String removedOutputFile = args[7]; 68 | 69 | BufferedReader brSrc = new BufferedReader(new InputStreamReader(new FileInputStream(srcFile), "utf-8")); 70 | BufferedReader brSrcChar = new BufferedReader(new InputStreamReader(new FileInputStream(srcCharFile), "utf-8")); 71 | BufferedReader brTrg = new BufferedReader(new InputStreamReader(new FileInputStream(trgFile), "utf-8")); 72 | BufferedWriter bwSrc = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(srcOutputFile), "utf-8")); 73 | BufferedWriter bwTrg = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(trgOutputFile), "utf-8")); 74 | BufferedWriter bwRem = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(removedOutputFile), "utf-8")); 75 | 76 | String zh, zhChar, en; 77 | 78 | while ((zh = brSrc.readLine()) != null) { 79 | zhChar = brSrcChar.readLine(); 80 | en = brTrg.readLine(); 81 | double val = (double) zhChar.trim().split(" ").length / (double) en.trim().split(" ").length; 82 | if (val <= minRatio || val >= maxRatio) { 83 | bwRem.write("len" + " ||| " + zh + " ||| " + en 
+ "\n"); 84 | } else { 85 | if(isUglySentence(en) || isAlmostAscii(zh)){ 86 | bwRem.write("ascii" + " ||| " + zh + " ||| " + en + "\n"); 87 | }else { 88 | bwSrc.write(zh + "\n"); 89 | bwTrg.write(en + "\n"); 90 | } 91 | } 92 | } 93 | 94 | 95 | brSrc.close(); 96 | brSrcChar.close(); 97 | brTrg.close(); 98 | bwSrc.close(); 99 | bwTrg.close(); 100 | bwRem.close(); 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.sk: -------------------------------------------------------------------------------- 1 | Bc 2 | Mgr 3 | RNDr 4 | PharmDr 5 | PhDr 6 | JUDr 7 | PaedDr 8 | ThDr 9 | Ing 10 | MUDr 11 | MDDr 12 | MVDr 13 | Dr 14 | ThLic 15 | PhD 16 | ArtD 17 | ThDr 18 | Dr 19 | DrSc 20 | CSs 21 | prof 22 | obr 23 | Obr 24 | Č 25 | č 26 | absol 27 | adj 28 | admin 29 | adr 30 | Adr 31 | adv 32 | advok 33 | afr 34 | ak 35 | akad 36 | akc 37 | akuz 38 | et 39 | al 40 | alch 41 | amer 42 | anat 43 | angl 44 | Angl 45 | anglosas 46 | anorg 47 | ap 48 | apod 49 | arch 50 | archeol 51 | archit 52 | arg 53 | art 54 | astr 55 | astrol 56 | astron 57 | atp 58 | atď 59 | austr 60 | Austr 61 | aut 62 | belg 63 | Belg 64 | bibl 65 | Bibl 66 | biol 67 | bot 68 | bud 69 | bás 70 | býv 71 | cest 72 | chem 73 | cirk 74 | csl 75 | čs 76 | Čs 77 | dat 78 | dep 79 | det 80 | dial 81 | diaľ 82 | dipl 83 | distrib 84 | dokl 85 | dosl 86 | dopr 87 | dram 88 | duš 89 | dv 90 | dvojčl 91 | dór 92 | ekol 93 | ekon 94 | el 95 | elektr 96 | elektrotech 97 | energet 98 | epic 99 | est 100 | etc 101 | etonym 102 | eufem 103 | európ 104 | Európ 105 | ev 106 | evid 107 | expr 108 | fa 109 | fam 110 | farm 111 | fem 112 | feud 113 | fil 114 | filat 115 | filoz 116 | fi 117 | fon 118 | form 119 | fot 120 | fr 121 | Fr 122 | franc 123 | Franc 124 | fraz 125 | fut 126 | fyz 127 | fyziol 128 | garb 129 | gen 130 | genet 131 | genpor 132 | geod 133 | geogr 134 | geol 135 | geom 136 | germ 137 | gr 138 | Gr 139 | gréc 
140 | Gréc 141 | gréckokat 142 | hebr 143 | herald 144 | hist 145 | hlav 146 | hosp 147 | hromad 148 | hud 149 | hypok 150 | ident 151 | i.e 152 | ident 153 | imp 154 | impf 155 | indoeur 156 | inf 157 | inform 158 | instr 159 | int 160 | interj 161 | inšt 162 | inštr 163 | iron 164 | jap 165 | Jap 166 | jaz 167 | jedn 168 | juhoamer 169 | juhových 170 | juhozáp 171 | juž 172 | kanad 173 | Kanad 174 | kanc 175 | kapit 176 | kpt 177 | kart 178 | katastr 179 | knih 180 | kniž 181 | komp 182 | konj 183 | konkr 184 | kozmet 185 | krajč 186 | kresť 187 | kt 188 | kuch 189 | lat 190 | latinskoamer 191 | lek 192 | lex 193 | lingv 194 | lit 195 | litur 196 | log 197 | lok 198 | max 199 | Max 200 | maď 201 | Maď 202 | medzinár 203 | mest 204 | metr 205 | mil 206 | Mil 207 | min 208 | Min 209 | miner 210 | ml 211 | mld 212 | mn 213 | mod 214 | mytol 215 | napr 216 | nar 217 | Nar 218 | nasl 219 | nedok 220 | neg 221 | negat 222 | neklas 223 | nem 224 | Nem 225 | neodb 226 | neos 227 | neskl 228 | nesklon 229 | nespis 230 | nespráv 231 | neved 232 | než 233 | niekt 234 | niž 235 | nom 236 | náb 237 | nákl 238 | námor 239 | nár 240 | obch 241 | obj 242 | obv 243 | obyč 244 | obč 245 | občian 246 | odb 247 | odd 248 | ods 249 | ojed 250 | okr 251 | Okr 252 | opt 253 | opyt 254 | org 255 | os 256 | osob 257 | ot 258 | ovoc 259 | par 260 | part 261 | pejor 262 | pers 263 | pf 264 | Pf 265 | P.f 266 | p.f 267 | pl 268 | Plk 269 | pod 270 | podst 271 | pokl 272 | polit 273 | politol 274 | polygr 275 | pomn 276 | popl 277 | por 278 | porad 279 | porov 280 | posch 281 | potrav 282 | použ 283 | poz 284 | pozit 285 | poľ 286 | poľno 287 | poľnohosp 288 | poľov 289 | pošt 290 | pož 291 | prac 292 | predl 293 | pren 294 | prep 295 | preuk 296 | priezv 297 | Priezv 298 | privl 299 | prof 300 | práv 301 | príd 302 | príj 303 | prík 304 | príp 305 | prír 306 | prísl 307 | príslov 308 | príč 309 | psych 310 | publ 311 | pís 312 | písm 313 | pôv 314 | refl 315 | reg 316 | rep 317 | resp 318 | 
rozk 319 | rozlič 320 | rozpráv 321 | roč 322 | Roč 323 | ryb 324 | rádiotech 325 | rím 326 | samohl 327 | semest 328 | sev 329 | severoamer 330 | severových 331 | severozáp 332 | sg 333 | skr 334 | skup 335 | sl 336 | Sloven 337 | soc 338 | soch 339 | sociol 340 | sp 341 | spol 342 | Spol 343 | spoloč 344 | spoluhl 345 | správ 346 | spôs 347 | st 348 | star 349 | starogréc 350 | starorím 351 | s.r.o 352 | stol 353 | stor 354 | str 355 | stredoamer 356 | stredoškol 357 | subj 358 | subst 359 | superl 360 | sv 361 | sz 362 | súkr 363 | súp 364 | súvzť 365 | tal 366 | Tal 367 | tech 368 | tel 369 | Tel 370 | telef 371 | teles 372 | telev 373 | teol 374 | trans 375 | turist 376 | tuzem 377 | typogr 378 | tzn 379 | tzv 380 | ukaz 381 | ul 382 | Ul 383 | umel 384 | univ 385 | ust 386 | ved 387 | vedľ 388 | verb 389 | veter 390 | vin 391 | viď 392 | vl 393 | vod 394 | vodohosp 395 | pnl 396 | vulg 397 | vyj 398 | vys 399 | vysokoškol 400 | vzťaž 401 | vôb 402 | vých 403 | výd 404 | výrob 405 | výsk 406 | výsl 407 | výtv 408 | výtvar 409 | význ 410 | včel 411 | vš 412 | všeob 413 | zahr 414 | zar 415 | zariad 416 | zast 417 | zastar 418 | zastaráv 419 | zb 420 | zdravot 421 | združ 422 | zjemn 423 | zlat 424 | zn 425 | Zn 426 | zool 427 | zr 428 | zried 429 | zv 430 | záhr 431 | zák 432 | zákl 433 | zám 434 | záp 435 | západoeur 436 | zázn 437 | územ 438 | účt 439 | čast 440 | čes 441 | Čes 442 | čl 443 | čísl 444 | živ 445 | pr 446 | fak 447 | Kr 448 | p.n.l 449 | A 450 | B 451 | C 452 | D 453 | E 454 | F 455 | G 456 | H 457 | I 458 | J 459 | K 460 | L 461 | M 462 | N 463 | O 464 | P 465 | Q 466 | R 467 | S 468 | T 469 | U 470 | V 471 | W 472 | X 473 | Y 474 | Z 475 | -------------------------------------------------------------------------------- /bpe/learn_joint_bpe_and_vocab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | """Use byte pair 
encoding (BPE) to learn a variable-length encoding of the vocabulary in a text. 6 | This script learns BPE jointly on a concatenation of a list of texts (typically the source and target side of a parallel corpus, 7 | applies the learned operation to each and (optionally) returns the resulting vocabulary of each text. 8 | The vocabulary can be used in apply_bpe.py to avoid producing symbols that are rare or OOV in a training text. 9 | 10 | Reference: 11 | Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units. 12 | Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. 13 | """ 14 | 15 | from __future__ import unicode_literals 16 | 17 | import sys 18 | import os 19 | import codecs 20 | import argparse 21 | import tempfile 22 | from collections import Counter 23 | 24 | import learn_bpe 25 | import apply_bpe 26 | 27 | # hack for python2/3 compatibility 28 | from io import open 29 | argparse.open = open 30 | 31 | def create_parser(): 32 | parser = argparse.ArgumentParser( 33 | formatter_class=argparse.RawDescriptionHelpFormatter, 34 | description="learn BPE-based word segmentation") 35 | 36 | parser.add_argument( 37 | '--input', '-i', type=argparse.FileType('r'), required=True, nargs = '+', 38 | metavar='PATH', 39 | help="Input texts (multiple allowed).") 40 | parser.add_argument( 41 | '--output', '-o', type=argparse.FileType('w'), required=True, 42 | metavar='PATH', 43 | help="Output file for BPE codes.") 44 | parser.add_argument( 45 | '--symbols', '-s', type=int, default=10000, 46 | help="Create this many new symbols (each representing a character n-gram) (default: %(default)s))") 47 | parser.add_argument( 48 | '--separator', type=str, default='@@', metavar='STR', 49 | help="Separator between non-final subword units (default: '%(default)s'))") 50 | parser.add_argument( 51 | '--write-vocabulary', type=argparse.FileType('w'), nargs = '+', 
default=None, 52 | metavar='PATH', dest='vocab', 53 | help='Write to these vocabulary files after applying BPE. One per input text. Used for filtering in apply_bpe.py') 54 | parser.add_argument( 55 | '--min-frequency', type=int, default=2, metavar='FREQ', 56 | help='Stop if no symbol pair has frequency >= FREQ (default: %(default)s))') 57 | parser.add_argument( 58 | '--verbose', '-v', action="store_true", 59 | help="verbose mode.") 60 | 61 | return parser 62 | 63 | 64 | 65 | if __name__ == '__main__': 66 | 67 | # python 2/3 compatibility 68 | if sys.version_info < (3, 0): 69 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) 70 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) 71 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin) 72 | else: 73 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer) 74 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer) 75 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer) 76 | 77 | parser = create_parser() 78 | args = parser.parse_args() 79 | 80 | if args.vocab and len(args.input) != len(args.vocab): 81 | sys.stderr.write('Error: number of input files and vocabulary files must match\n') 82 | sys.exit(1) 83 | 84 | # read/write files as UTF-8 85 | args.input = [codecs.open(f.name, encoding='UTF-8') for f in args.input] 86 | args.vocab = [codecs.open(f.name, 'w', encoding='UTF-8') for f in args.vocab] 87 | 88 | # get combined vocabulary of all input texts 89 | full_vocab = Counter() 90 | for f in args.input: 91 | full_vocab += learn_bpe.get_vocabulary(f) 92 | f.seek(0) 93 | 94 | vocab_list = ['{0} {1}'.format(key, freq) for (key, freq) in full_vocab.items()] 95 | 96 | # learn BPE on combined vocabulary 97 | with codecs.open(args.output.name, 'w', encoding='UTF-8') as output: 98 | learn_bpe.main(vocab_list, output, args.symbols, args.min_frequency, args.verbose, is_dict=True) 99 | 100 | with codecs.open(args.output.name, encoding='UTF-8') as codes: 101 | bpe = apply_bpe.BPE(codes, separator=args.separator) 102 | 
103 | # apply BPE to each training corpus and get vocabulary 104 | for train_file, vocab_file in zip(args.input, args.vocab): 105 | 106 | tmp = tempfile.NamedTemporaryFile(delete=False) 107 | tmp.close() 108 | 109 | tmpout = codecs.open(tmp.name, 'w', encoding='UTF-8') 110 | 111 | train_file.seek(0) 112 | for line in train_file: 113 | tmpout.write(bpe.segment(line).strip()) 114 | tmpout.write('\n') 115 | 116 | tmpout.close() 117 | tmpin = codecs.open(tmp.name, encoding='UTF-8') 118 | 119 | vocab = learn_bpe.get_vocabulary(tmpin) 120 | tmpin.close() 121 | os.remove(tmp.name) 122 | 123 | for key, freq in sorted(vocab.items(), key=lambda x: x[1], reverse=True): 124 | vocab_file.write("{0} {1}\n".format(key, freq)) 125 | vocab_file.close() 126 | -------------------------------------------------------------------------------- /fetch_wmt2018_zhen.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2017 Natural Language Processing Group, Nanjing University, zhaocq.nlp@gmail.com. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -e 17 | 18 | REPO_DIR=. 19 | 20 | OUTPUT_DIR="${1:-wmt18_zh_en}" 21 | 22 | MERGE_OPS=60000 23 | BPE_THRESHOLD=50 24 | 25 | echo "Writing to ${OUTPUT_DIR}. To change this, set the OUTPUT_DIR environment variable." 
26 | 27 | OUTPUT_DIR_DATA="${OUTPUT_DIR}/data" 28 | mkdir -p $OUTPUT_DIR_DATA 29 | 30 | echo "Downloading preprocessed data. This may take a while..." 31 | curl -o ${OUTPUT_DIR_DATA}/corpus.gz \ 32 | http://data.statmt.org/wmt18/translation-task/preprocessed/zh-en/corpus.gz 33 | 34 | echo "Downloading preprocessed dev data..." 35 | curl -o ${OUTPUT_DIR_DATA}/dev.tgz \ 36 | http://data.statmt.org/wmt18/translation-task/preprocessed/zh-en/dev.tgz 37 | 38 | echo "Downloading test data..." 39 | # curl -o ${OUTPUT_DIR_DATA}/test.tgz \ 40 | # http://data.statmt.org/wmt18/translation-task/test.tgz 41 | 42 | echo "Extracting all files..." 43 | gzip -d ${OUTPUT_DIR_DATA}/corpus.tc.de.gz 44 | gzip -d ${OUTPUT_DIR_DATA}/corpus.tc.en.gz 45 | mkdir -p "${OUTPUT_DIR_DATA}/dev" 46 | tar -zxvf ${OUTPUT_DIR_DATA}/dev.tgz -C "${OUTPUT_DIR_DATA}/dev" 47 | #tar -zxvf ${OUTPUT_DIR_DATA}/test.tgz -C "${OUTPUT_DIR_DATA}/" 48 | 49 | mkdir ${OUTPUT_DIR}/dev 50 | cp ${OUTPUT_DIR_DATA}/dev/newsdev2017-zhen* ${OUTPUT_DIR}/dev/ 51 | cp ${OUTPUT_DIR_DATA}/dev/newsdev2017-enzh* ${OUTPUT_DIR}/dev/ 52 | cp ${OUTPUT_DIR_DATA}/dev/newstest2017-zhen* ${OUTPUT_DIR}/dev/ 53 | cp ${OUTPUT_DIR_DATA}/dev/newstest2017-enzh* ${OUTPUT_DIR}/dev/ 54 | 55 | cp ${REPO_DIR}/programs/SplitChineseFile.class . 
56 | java SplitChineseFile ${OUTPUT_DIR_DATA}/corpus ${OUTPUT_DIR_DATA}/corpus.zh ${OUTPUT_DIR_DATA}/corpus.en 57 | rm ./SplitChineseFile.class 58 | 59 | # recover special fields 60 | perl ${REPO_DIR}/scripts/deescape-special-chars.perl < ${OUTPUT_DIR_DATA}/corpus.en > ${OUTPUT_DIR}/train.tok.tc.en 61 | sed 's/& amp ;/\&/g' ${OUTPUT_DIR}/train.tok.tc.en > ${OUTPUT_DIR}/train.tok.tc.en.tmp 62 | sed 's/& lt ;/\ ${OUTPUT_DIR}/train.tok.tc.en.tmptmp 63 | mv ${OUTPUT_DIR}/train.tok.tc.en.tmptmp ${OUTPUT_DIR}/train.tok.tc.en.tmp 64 | sed 's/& gt ;/\>/g' ${OUTPUT_DIR}/train.tok.tc.en.tmp > ${OUTPUT_DIR}/train.tok.tc.en.tmptmp 65 | mv ${OUTPUT_DIR}/train.tok.tc.en.tmptmp ${OUTPUT_DIR}/train.tok.tc.en.tmp 66 | sed 's/& quot ;/\"/g' ${OUTPUT_DIR}/train.tok.tc.en.tmp > ${OUTPUT_DIR}/train.tok.tc.en.tmptmp 67 | mv ${OUTPUT_DIR}/train.tok.tc.en.tmptmp ${OUTPUT_DIR}/train.tok.tc.en.tmp 68 | sed "s/& apos ; s /\'s /g" ${OUTPUT_DIR}/train.tok.tc.en.tmp > ${OUTPUT_DIR}/train.tok.tc.en.tmptmp 69 | mv ${OUTPUT_DIR}/train.tok.tc.en.tmptmp ${OUTPUT_DIR}/train.tok.tc.en.tmp 70 | sed "s/& apos ;/\'/g" ${OUTPUT_DIR}/train.tok.tc.en.tmp > ${OUTPUT_DIR}/train.tok.tc.en.tmptmp 71 | mv ${OUTPUT_DIR}/train.tok.tc.en.tmptmp ${OUTPUT_DIR}/train.tok.tc.en.tmp 72 | sed 's/& amp ;/\&/g' ${OUTPUT_DIR}/train.tok.tc.en.tmp > ${OUTPUT_DIR}/train.tok.tc.en 73 | rm ${OUTPUT_DIR}/train.tok.tc.en.tmp 74 | 75 | # use newsdev2017 as dev set 76 | perl ${REPO_DIR}/scripts/deescape-special-chars.perl < ${OUTPUT_DIR_DATA}/dev/newsdev2017.tc.en > ${OUTPUT_DIR}/newsdev2017.tok.tc.en 77 | perl ${REPO_DIR}/scripts/deescape-special-chars.perl < ${OUTPUT_DIR_DATA}/dev/newstest2017.tc.en > ${OUTPUT_DIR}/newstest2017.tok.tc.en 78 | 79 | cp ${REPO_DIR}/programs/CleanChineseFile.class . 
80 | java CleanChineseFile ${OUTPUT_DIR_DATA}/corpus.zh ${OUTPUT_DIR}/train.tok.zh 81 | java CleanChineseFile ${OUTPUT_DIR_DATA}/dev/newsdev2017.tc.zh ${OUTPUT_DIR}/newsdev2017.tok.zh 82 | java CleanChineseFile ${OUTPUT_DIR_DATA}/dev/newstest2017.tc.zh ${OUTPUT_DIR}/newstest2017.tok.zh 83 | rm ./CleanChineseFile.class 84 | 85 | rm ${OUTPUT_DIR_DATA}/corpus* 86 | python ${REPO_DIR}/scripts/tokenizeChinese.py ${OUTPUT_DIR}/train.tok.zh ${OUTPUT_DIR}/train.tok.zh.char 87 | 88 | echo "Removing special sentences..." 89 | cp ${REPO_DIR}/programs/ChineseSpecialRemover.class . 90 | java ChineseSpecialRemover ${OUTPUT_DIR}/train.tok.zh ${OUTPUT_DIR}/train.tok.zh.char ${OUTPUT_DIR}/train.tok.tc.en 3.0 0.7 ${OUTPUT_DIR}/train.tok.zh.rm ${OUTPUT_DIR}/train.tok.tc.en.rm ${OUTPUT_DIR_DATA}/train.special.removed 91 | rm ./ChineseSpecialRemover.class ${OUTPUT_DIR}/train.tok.zh.char 92 | mv ${OUTPUT_DIR}/train.tok.zh ${OUTPUT_DIR_DATA}/train.tok.zh 93 | mv ${OUTPUT_DIR}/train.tok.tc.en ${OUTPUT_DIR_DATA}/train.tok.tc.en 94 | mv ${OUTPUT_DIR}/train.tok.zh.rm ${OUTPUT_DIR}/train.tok.zh 95 | mv ${OUTPUT_DIR}/train.tok.tc.en.rm ${OUTPUT_DIR}/train.tok.tc.en 96 | 97 | # merge 98 | cp ${REPO_DIR}/programs/MergeAndSplit.class ./ 99 | java MergeAndSplit merge ${OUTPUT_DIR}/train.tok.zh ${OUTPUT_DIR}/train.tok.tc.en ${OUTPUT_DIR}/merged 100 | echo "Sorting and removing duplicated sentences..." 101 | sort -u ${OUTPUT_DIR}/merged > ${OUTPUT_DIR}/merged.sort 102 | 103 | java MergeAndSplit split ${OUTPUT_DIR}/train.tok.zh ${OUTPUT_DIR}/train.tok.tc.en ${OUTPUT_DIR}/merged.sort 104 | rm ${OUTPUT_DIR}/merged.sort ./MergeAndSplit.class ${OUTPUT_DIR}/merged 105 | 106 | # the files are already cleaned, we only need to learn BPE 107 | echo "Learning BPE with merge_ops=${MERGE_OPS}. This may take a while..." 
108 | ${REPO_DIR}/bpe/learn_joint_bpe_and_vocab.py -i ${OUTPUT_DIR}/train.tok.zh \ 109 | --write-vocabulary ${OUTPUT_DIR}/vocab.zh -s ${MERGE_OPS} -o ${OUTPUT_DIR}/bpe.${MERGE_OPS}.zh 110 | ${REPO_DIR}/bpe/learn_joint_bpe_and_vocab.py -i ${OUTPUT_DIR}/train.tok.tc.en \ 111 | --write-vocabulary ${OUTPUT_DIR}/vocab.en -s ${MERGE_OPS} -o ${OUTPUT_DIR}/bpe.${MERGE_OPS}.en 112 | 113 | echo "Apply bpe..." 114 | python ${REPO_DIR}/bpe/apply_bpe.py -c ${OUTPUT_DIR}/bpe.${MERGE_OPS}.en --vocabulary ${OUTPUT_DIR}/vocab.en --vocabulary-threshold ${BPE_THRESHOLD} \ 115 | --input ${OUTPUT_DIR}/train.tok.tc.en --output ${OUTPUT_DIR}/train.tok.tc.bpe90k.en 116 | 117 | python ${REPO_DIR}/bpe/apply_bpe.py -c ${OUTPUT_DIR}/bpe.${MERGE_OPS}.zh --vocabulary ${OUTPUT_DIR}/vocab.zh --vocabulary-threshold ${BPE_THRESHOLD} \ 118 | --input ${OUTPUT_DIR}/train.tok.zh --output ${OUTPUT_DIR}/train.tok.bpe90k.zh 119 | 120 | echo "Generate vocabulary..." 121 | python ${REPO_DIR}/bpe/generate_vocab.py ${OUTPUT_DIR}/train.tok.bpe60k.zh > ${OUTPUT_DIR}/vocab.bpe60k.all.zh 122 | python ${REPO_DIR}/bpe/generate_vocab.py ${OUTPUT_DIR}/train.tok.tc.bpe60k.en > ${OUTPUT_DIR}/vocab.bpe60k.all.en 123 | 124 | echo "shuffling data..." 125 | python ${REPO_DIR}/scripts/shuffle.py ${OUTPUT_DIR}/train.tok.zh,${OUTPUT_DIR}/train.tok.tc.en ${OUTPUT_DIR}/train.tok.zh.shuf,${OUTPUT_DIR}/train.tok.tc.en.shuf 126 | 127 | rm -r ${OUTPUT_DIR_DATA} 128 | 129 | echo "All done." -------------------------------------------------------------------------------- /scripts/tokenizeChinese.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Natural Language Processing Group, Nanjing University, zhaocq.nlp@gmail.com. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" The tokenization of Chinese text contains two steps: separate each Chinese
character (by utf-8 encoding); tokenize the non-Chinese part (following the
mteval script).
Refer to https://github.com/NJUNLP/ZhTokenizer
"""
import re
import sys
import codecs

# Python 2/3 string-type compatibility; replaces the former third-party `six`
# dependency (six.string_types is (basestring,) on py2 and (str,) on py3).
try:
    _string_types = (str, unicode)  # noqa: F821  -- Python 2
except NameError:
    _string_types = (str,)  # Python 3

# Inclusive Unicode code-point ranges treated as "Chinese" (CJK) characters.
# BUG FIX: the supplementary-plane bounds must be written as code points (or
# \U000XXXXX escapes).  The original used u'\u20000' etc., which is the
# TWO-character string u'\u2000' + u'0' (\u only consumes 4 hex digits), so
# CJK Extension B and the Compatibility Supplement were never matched.
_CJK_RANGES = (
    (0x3400, 0x4DB5),    # CJK Unified Ideographs Extension A, release 3.0
    (0x4E00, 0x9FA5),    # CJK Unified Ideographs, release 1.1
    (0x9FA6, 0x9FBB),    # CJK Unified Ideographs, release 4.1
    (0xF900, 0xFA2D),    # CJK Compatibility Ideographs, release 1.1
    (0xFA30, 0xFA6A),    # CJK Compatibility Ideographs, release 3.2
    (0xFA70, 0xFAD9),    # CJK Compatibility Ideographs, release 4.1
    (0x20000, 0x2A6D6),  # CJK Unified Ideographs Extension B, release 3.1
    (0x2F800, 0x2FA1D),  # CJK Compatibility Supplement, release 3.1
    (0xFF00, 0xFFEF),    # Full width ASCII / punctuation, half width Katakana/kana, Korean alphabet
    (0x2E80, 0x2EFF),    # CJK Radicals Supplement
    (0x3000, 0x303F),    # CJK punctuation marks
    (0x31C0, 0x31EF),    # CJK strokes
    (0x2F00, 0x2FDF),    # Kangxi Radicals
    (0x2FF0, 0x2FFF),    # Chinese character structure (ideographic description)
    (0x3100, 0x312F),    # Phonetic symbols
    (0x31A0, 0x31BF),    # Phonetic symbols (Taiwanese and Hakka expansion)
    (0xFE10, 0xFE1F),    # vertical punctuation forms
    (0xFE30, 0xFE4F),    # CJK compatibility forms
    (0x2600, 0x26FF),    # miscellaneous symbols
    (0x2700, 0x27BF),    # dingbats
    (0x3200, 0x32FF),    # enclosed CJK letters and months
    (0x3300, 0x33FF),    # CJK compatibility block
)


def is_chinese_char(uchar):
    """ Whether `uchar` is a Chinese (CJK) character.

    Args:
        uchar: A single unicode character.

    Returns: True/False.
    """
    code = ord(uchar)
    return any(lo <= code <= hi for lo, hi in _CJK_RANGES)


def to_chinese_char(sentences):
    """ Converts Chinese sentence(s) to character level.

    Each CJK character becomes its own token; the remaining (non-CJK) text is
    tokenized following the mteval punctuation rules.

    Args:
        sentences: A unicode string or a list of unicode strings.

    Returns: A tokenized unicode string, or a list of tokenized strings.

    Raises:
        ValueError: if `sentences` is neither a string nor a list.
    """

    def process(sentence):
        # Surround every CJK character with spaces; copy other chars through.
        pieces = []
        for c in sentence.strip():
            pieces.append(" %s " % c if is_chinese_char(c) else c)
        sentence = "".join(pieces)

        # tokenize punctuation
        sentence = re.sub(r'([\{-\~\[-\` -\&\(-\+\:-\@\/])', r' \1 ', sentence)
        # tokenize period and comma unless preceded by a digit
        sentence = re.sub(r'([^0-9])([\.,])', r'\1 \2 ', sentence)
        # tokenize period and comma unless followed by a digit
        sentence = re.sub(r'([\.,])([^0-9])', r' \1 \2', sentence)
        # tokenize dash when preceded by a digit
        sentence = re.sub(r'([0-9])(-)', r'\1 \2 ', sentence)
        # one space only between words; no leading/trailing space
        return re.sub(r'\s+', ' ', sentence).strip()

    if isinstance(sentences, list):
        return [process(s) for s in sentences]
    if isinstance(sentences, _string_types):
        return process(sentences)
    raise ValueError("`sentences` must be a string or a list of strings, "
                     "got %r" % type(sentences))


def tokenize_sgm_file(input_xml_file, output_xml_file):
    """ Converts Chinese sentences from input file to output file (SGM/XML file).

    Lines that carry a segment are tokenized between the opening and closing
    tags; all other (markup) lines are copied through unchanged.

    NOTE(review): the original tag literals were lost in extraction; the
    "<seg" / ">" / "<" handling below matches the usual WMT SGM segment
    format -- confirm against the original file.

    Args:
        input_xml_file: Path of the input SGM file.
        output_xml_file: Path of the result SGM file.
    """
    # `with` guarantees both handles are closed even if tokenization raises
    # (the original leaked them on error).
    with codecs.open(input_xml_file, 'r', encoding="utf-8") as file_r, \
            codecs.open(output_xml_file, 'w', encoding="utf-8") as file_w:
        for sentence in file_r:
            if sentence.startswith("<seg"):
                start = sentence.find(">") + 1
                end = sentence.rfind("<")
                new_sentence = sentence[:start] + to_chinese_char(sentence[start:end]) + sentence[end:]
            else:
                new_sentence = sentence
            file_w.write(new_sentence)


def tokenize_plain_file(input_file, output_file):
    """ Converts Chinese sentences from input file to output file (plain text file).

    Args:
        input_file: Path of the input text file (one sentence per line).
        output_file: Path of the result text file.
    """
    with codecs.open(input_file, 'r', encoding="utf-8") as file_r, \
            codecs.open(output_file, 'w', encoding="utf-8") as file_w:
        for sentence in file_r:
            file_w.write(to_chinese_char(sentence) + "\n")


if __name__ == '__main__':
    # usage: tokenizeChinese.py <input> <output>; SGM input is detected by suffix
    if sys.argv[1].endswith(".sgm"):
        tokenize_sgm_file(sys.argv[1], sys.argv[2])
    else:
        tokenize_plain_file(sys.argv[1], sys.argv[2])
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Fetches the preprocessed WMT17 en-de data, converts dev/test SGM files to
# plain text, tokenizes and truecases them, filters the training corpus,
# deduplicates it, learns a joint BPE and builds vocabularies.

set -e

REPO_DIR=.

# Output directory comes from the first positional argument (default below).
OUTPUT_DIR="${1:-wmt17_de_en}"

MERGE_OPS=90000
BPE_THRESHOLD=50

# FIX: the old message claimed the directory was taken from an OUTPUT_DIR
# environment variable, but the script actually reads positional argument $1.
echo "Writing to ${OUTPUT_DIR}. To change this, pass the output directory as the first argument."

OUTPUT_DIR_DATA="${OUTPUT_DIR}/data"
# -p: do not abort (under `set -e`) when the directories already exist,
# so the script can be re-run.
mkdir -p ${OUTPUT_DIR_DATA}
mkdir -p ${OUTPUT_DIR}/dev
mkdir -p ${OUTPUT_DIR}/test

echo "Downloading preprocessed data. This may take a while..."

curl -o ${OUTPUT_DIR_DATA}/corpus.tc.de.gz \
    http://data.statmt.org/wmt17/translation-task/preprocessed/de-en/corpus.tc.de.gz

curl -o ${OUTPUT_DIR_DATA}/corpus.tc.en.gz \
    http://data.statmt.org/wmt17/translation-task/preprocessed/de-en/corpus.tc.en.gz

echo "Downloading preprocessed dev data..."
curl -o ${OUTPUT_DIR_DATA}/dev.tgz \
    http://data.statmt.org/wmt17/translation-task/preprocessed/de-en/dev.tgz

echo "Downloading test data..."
curl -o ${OUTPUT_DIR_DATA}/test.tgz \
    http://data.statmt.org/wmt17/translation-task/test.tgz

echo "Downloading truecase model..."
curl -o ${OUTPUT_DIR_DATA}/true.tgz \
    http://data.statmt.org/wmt17/translation-task/preprocessed/de-en/true.tgz

echo "Extracting all files..."
gzip -d ${OUTPUT_DIR_DATA}/corpus.tc.de.gz
gzip -d ${OUTPUT_DIR_DATA}/corpus.tc.en.gz
mkdir -p "${OUTPUT_DIR_DATA}/dev"
tar -zxvf ${OUTPUT_DIR_DATA}/dev.tgz -C "${OUTPUT_DIR_DATA}/dev"
tar -zxvf ${OUTPUT_DIR_DATA}/test.tgz -C "${OUTPUT_DIR_DATA}/"
tar -zxvf ${OUTPUT_DIR_DATA}/true.tgz -C "${OUTPUT_DIR_DATA}/"

# recover special fields (HTML entities escaped by the Moses tokenizer)
perl ${REPO_DIR}/scripts/deescape-special-chars.perl < ${OUTPUT_DIR_DATA}/corpus.tc.de > ${OUTPUT_DIR}/train.tok.tc.de
perl ${REPO_DIR}/scripts/deescape-special-chars.perl < ${OUTPUT_DIR_DATA}/corpus.tc.en > ${OUTPUT_DIR}/train.tok.tc.en

# use newstest2016 as dev set
perl ${REPO_DIR}/scripts/deescape-special-chars.perl < ${OUTPUT_DIR_DATA}/dev/newstest2016.tc.en > ${OUTPUT_DIR}/dev.tok.tc.en
perl ${REPO_DIR}/scripts/deescape-special-chars.perl < ${OUTPUT_DIR_DATA}/dev/newstest2016.tc.de > ${OUTPUT_DIR}/dev.tok.tc.de

# Convert newstest2017 data into raw text format
${REPO_DIR}/scripts/input-from-sgm.perl \
    < ${OUTPUT_DIR_DATA}/test/newstest2017-deen-src.de.sgm \
    > ${OUTPUT_DIR_DATA}/test/newstest2017.deen.de
${REPO_DIR}/scripts/input-from-sgm.perl \
    < ${OUTPUT_DIR_DATA}/test/newstest2017-deen-ref.en.sgm \
    > ${OUTPUT_DIR_DATA}/test/newstest2017.deen.en
${REPO_DIR}/scripts/input-from-sgm.perl \
    < ${OUTPUT_DIR_DATA}/test/newstest2017-ende-src.en.sgm \
    > ${OUTPUT_DIR_DATA}/test/newstest2017.ende.en
${REPO_DIR}/scripts/input-from-sgm.perl \
    < ${OUTPUT_DIR_DATA}/test/newstest2017-ende-ref.de.sgm \
    > ${OUTPUT_DIR_DATA}/test/newstest2017.ende.de

cp ${OUTPUT_DIR_DATA}/dev/newstest2016-deen* ${OUTPUT_DIR}/dev/
cp ${OUTPUT_DIR_DATA}/dev/newstest2016-ende* ${OUTPUT_DIR}/dev/
cp ${OUTPUT_DIR_DATA}/test/newstest2017-deen* ${OUTPUT_DIR}/test/
cp ${OUTPUT_DIR_DATA}/test/newstest2017-ende* ${OUTPUT_DIR}/test/

# tokenize: normalize punctuation, tokenize (aggressive, no-escape), truecase
echo "Tokenize..."
cat ${OUTPUT_DIR_DATA}/test/newstest2017.deen.de | \
    ${REPO_DIR}/scripts/normalize-punctuation.perl -l de | \
    ${REPO_DIR}/scripts/tokenizer.perl -a -q -l de -no-escape | \
    ${REPO_DIR}/scripts/truecase.perl -model ${OUTPUT_DIR_DATA}/truecase-model.de > ${OUTPUT_DIR}/newstest2017.deen.tok.tc.de

cat ${OUTPUT_DIR_DATA}/test/newstest2017.deen.en | \
    ${REPO_DIR}/scripts/normalize-punctuation.perl -l en | \
    ${REPO_DIR}/scripts/tokenizer.perl -a -q -l en -no-escape | \
    ${REPO_DIR}/scripts/truecase.perl -model ${OUTPUT_DIR_DATA}/truecase-model.en > ${OUTPUT_DIR}/newstest2017.deen.tok.tc.en

cat ${OUTPUT_DIR_DATA}/test/newstest2017.ende.de | \
    ${REPO_DIR}/scripts/normalize-punctuation.perl -l de | \
    ${REPO_DIR}/scripts/tokenizer.perl -a -q -l de -no-escape | \
    ${REPO_DIR}/scripts/truecase.perl -model ${OUTPUT_DIR_DATA}/truecase-model.de > ${OUTPUT_DIR}/newstest2017.ende.tok.tc.de

cat ${OUTPUT_DIR_DATA}/test/newstest2017.ende.en | \
    ${REPO_DIR}/scripts/normalize-punctuation.perl -l en | \
    ${REPO_DIR}/scripts/tokenizer.perl -a -q -l en -no-escape | \
    ${REPO_DIR}/scripts/truecase.perl -model ${OUTPUT_DIR_DATA}/truecase-model.en > ${OUTPUT_DIR}/newstest2017.ende.tok.tc.en

# filter by length ratio (en/de length ratio must lie in [0.4, 2.0])
echo "Filtering by sentence length ratio..."
cp ${REPO_DIR}/programs/LenRatioRemover.class .
java LenRatioRemover ${OUTPUT_DIR}/train.tok.tc.en ${OUTPUT_DIR}/train.tok.tc.de 2.0 0.4 ${OUTPUT_DIR}/train.tok.tc.en.rm ${OUTPUT_DIR}/train.tok.tc.de.rm ${OUTPUT_DIR_DATA}/train.lenratio.removed
mv ${OUTPUT_DIR}/train.tok.tc.de ${OUTPUT_DIR_DATA}/train.tok.tc.de
mv ${OUTPUT_DIR}/train.tok.tc.en ${OUTPUT_DIR_DATA}/train.tok.tc.en
mv ${OUTPUT_DIR}/train.tok.tc.de.rm ${OUTPUT_DIR}/train.tok.tc.de
mv ${OUTPUT_DIR}/train.tok.tc.en.rm ${OUTPUT_DIR}/train.tok.tc.en
rm ./LenRatioRemover.class

# filter ugly sentences
echo "Filtering ugly sentences..."
cp ${REPO_DIR}/programs/SpecialSentRemoverENDE.class .
java SpecialSentRemoverENDE ${OUTPUT_DIR}/train.tok.tc.en ${OUTPUT_DIR}/train.tok.tc.de ${OUTPUT_DIR}/train.tok.tc.en.rm ${OUTPUT_DIR}/train.tok.tc.de.rm ${OUTPUT_DIR_DATA}/train.special.removed
mv ${OUTPUT_DIR}/train.tok.tc.de ${OUTPUT_DIR_DATA}/train.tok.tc.de.lenrm
mv ${OUTPUT_DIR}/train.tok.tc.en ${OUTPUT_DIR_DATA}/train.tok.tc.en.lenrm
mv ${OUTPUT_DIR}/train.tok.tc.de.rm ${OUTPUT_DIR}/train.tok.tc.de
mv ${OUTPUT_DIR}/train.tok.tc.en.rm ${OUTPUT_DIR}/train.tok.tc.en
rm ./SpecialSentRemoverENDE.class

# merge the two sides line-by-line, sort -u to deduplicate pairs, then split back
cp ${REPO_DIR}/programs/MergeAndSplit.class ./
java MergeAndSplit merge ${OUTPUT_DIR}/train.tok.tc.en ${OUTPUT_DIR}/train.tok.tc.de ${OUTPUT_DIR}/merged
mv ${OUTPUT_DIR}/train.tok.tc.de ${OUTPUT_DIR_DATA}/train.tok.tc.de.sprm
mv ${OUTPUT_DIR}/train.tok.tc.en ${OUTPUT_DIR_DATA}/train.tok.tc.en.sprm
echo "Sorting and removing duplicated sentences..."
sort -u ${OUTPUT_DIR}/merged > ${OUTPUT_DIR}/merged.sort

java MergeAndSplit split ${OUTPUT_DIR}/train.tok.tc.en ${OUTPUT_DIR}/train.tok.tc.de ${OUTPUT_DIR}/merged.sort
rm ${OUTPUT_DIR}/merged.sort ./MergeAndSplit.class ${OUTPUT_DIR}/merged

# the files are already cleaned, we only need to learn BPE
echo "Learning BPE with merge_ops=${MERGE_OPS}. This may take a while..."
# FIX: invoke via `python` like the other BPE scripts below, instead of
# relying on the script's executable bit / shebang.
python ${REPO_DIR}/bpe/learn_joint_bpe_and_vocab.py -i ${OUTPUT_DIR}/train.tok.tc.de ${OUTPUT_DIR}/train.tok.tc.en \
    --write-vocabulary ${OUTPUT_DIR}/vocab.de ${OUTPUT_DIR}/vocab.en -s ${MERGE_OPS} -o ${OUTPUT_DIR}/bpe.${MERGE_OPS}

echo "Apply bpe..."
python ${REPO_DIR}/bpe/apply_bpe.py -c ${OUTPUT_DIR}/bpe.${MERGE_OPS} --vocabulary ${OUTPUT_DIR}/vocab.de --vocabulary-threshold ${BPE_THRESHOLD} \
    --input ${OUTPUT_DIR}/train.tok.tc.de --output ${OUTPUT_DIR}/train.tok.tc.bpe90k.de

python ${REPO_DIR}/bpe/apply_bpe.py -c ${OUTPUT_DIR}/bpe.${MERGE_OPS} --vocabulary ${OUTPUT_DIR}/vocab.en --vocabulary-threshold ${BPE_THRESHOLD} \
    --input ${OUTPUT_DIR}/train.tok.tc.en --output ${OUTPUT_DIR}/train.tok.tc.bpe90k.en

echo "Generate vocabulary..."
python ${REPO_DIR}/bpe/generate_vocab.py ${OUTPUT_DIR}/train.tok.tc.bpe90k.de > ${OUTPUT_DIR}/vocab.bpe90k.all.de
python ${REPO_DIR}/bpe/generate_vocab.py ${OUTPUT_DIR}/train.tok.tc.bpe90k.en > ${OUTPUT_DIR}/vocab.bpe90k.all.en

echo "shuffling data..."
python ${REPO_DIR}/scripts/shuffle.py ${OUTPUT_DIR}/train.tok.tc.de,${OUTPUT_DIR}/train.tok.tc.en ${OUTPUT_DIR}/train.tok.tc.de.shuf,${OUTPUT_DIR}/train.tok.tc.en.shuf

# remove intermediate files
rm -r ${OUTPUT_DIR_DATA}

echo "All done."
4 | 5 | # Any single upper case letter followed by a period is not a sentence ender 6 | # (excluding I occasionally, but we leave it in) 7 | # usually upper case letters are initials in a name 8 | A 9 | Ā 10 | B 11 | C 12 | Č 13 | D 14 | E 15 | Ē 16 | F 17 | G 18 | Ģ 19 | H 20 | I 21 | Ī 22 | J 23 | K 24 | Ķ 25 | L 26 | Ļ 27 | M 28 | N 29 | Ņ 30 | O 31 | P 32 | Q 33 | R 34 | S 35 | Š 36 | T 37 | U 38 | Ū 39 | V 40 | W 41 | X 42 | Y 43 | Z 44 | Ž 45 | 46 | # Initialis -- Džonas 47 | Dz 48 | Dž 49 | Just 50 | 51 | # Day and month abbreviations 52 | # m. menesis d. diena g. gimes 53 | m 54 | mėn 55 | d 56 | g 57 | gim 58 | # Pirmadienis Penktadienis 59 | Pr 60 | Pn 61 | Pirm 62 | Antr 63 | Treč 64 | Ketv 65 | Penkt 66 | Šešt 67 | Sekm 68 | Saus 69 | Vas 70 | Kov 71 | Bal 72 | Geg 73 | Birž 74 | Liep 75 | Rugpj 76 | Rugs 77 | Spal 78 | Lapkr 79 | Gruod 80 | 81 | # Business, governmental, geographical terms 82 | a 83 | # aikštė 84 | adv 85 | # advokatas 86 | akad 87 | # akademikas 88 | aklg 89 | # akligatvis 90 | akt 91 | # aktorius 92 | al 93 | # alėja 94 | A.V 95 | # antspaudo vieta 96 | aps 97 | apskr 98 | # apskritis 99 | apyg 100 | # apygarda 101 | aps 102 | apskr 103 | # apskritis 104 | asist 105 | # asistentas 106 | asmv 107 | avd 108 | # asmenvardis 109 | a.k 110 | asm 111 | asm.k 112 | # asmens kodas 113 | atsak 114 | # atsakingasis 115 | atsisk 116 | sąsk 117 | # atsiskaitomoji sąskaita 118 | aut 119 | # autorius 120 | b 121 | k 122 | b.k 123 | # banko kodas 124 | bkl 125 | # bakalauras 126 | bt 127 | # butas 128 | buv 129 | # buvęs, -usi 130 | dail 131 | # dailininkas 132 | dek 133 | # dekanas 134 | dėst 135 | # dėstytojas 136 | dir 137 | # direktorius 138 | dirig 139 | # dirigentas 140 | doc 141 | # docentas 142 | drp 143 | # durpynas 144 | dš 145 | # dešinysis 146 | egz 147 | # egzempliorius 148 | eil 149 | # eilutė 150 | ekon 151 | # ekonomika 152 | el 153 | # elektroninis 154 | etc 155 | ež 156 | # ežeras 157 | faks 158 | # faksas 159 | fak 160 | # 
fakultetas 161 | gen 162 | # generolas 163 | gyd 164 | # gydytojas 165 | gv 166 | # gyvenvietė 167 | įl 168 | # įlanka 169 | Įn 170 | # įnagininkas 171 | insp 172 | # inspektorius 173 | pan 174 | # ir panašiai 175 | t.t 176 | # ir taip toliau 177 | k.a 178 | # kaip antai 179 | kand 180 | # kandidatas 181 | kat 182 | # katedra 183 | kyš 184 | # kyšulys 185 | kl 186 | # klasė 187 | kln 188 | # kalnas 189 | kn 190 | # knyga 191 | koresp 192 | # korespondentas 193 | kpt 194 | # kapitonas 195 | kr 196 | # kairysis 197 | kt 198 | # kitas 199 | kun 200 | # kunigas 201 | l 202 | e 203 | p 204 | l.e.p 205 | # laikinai einantis pareigas 206 | ltn 207 | # leitenantas 208 | m 209 | mst 210 | # miestas 211 | m.e 212 | # mūsų eros 213 | m.m 214 | # mokslo metai 215 | mot 216 | # moteris 217 | mstl 218 | # miestelis 219 | mgr 220 | # magistras 221 | mgnt 222 | # magistrantas 223 | mjr 224 | # majoras 225 | mln 226 | # milijonas 227 | mlrd 228 | # milijardas 229 | mok 230 | # mokinys 231 | mokyt 232 | # mokytojas 233 | moksl 234 | # mokslinis 235 | nkt 236 | # nekaitomas 237 | ntk 238 | # neteiktinas 239 | Nr 240 | nr 241 | # numeris 242 | p 243 | # ponas 244 | p.d 245 | a.d 246 | # pašto dėžutė, abonentinė dėžutė 247 | p.m.e 248 | # prieš mūsų erą 249 | pan 250 | # ir panašiai 251 | pav 252 | # paveikslas 253 | pavad 254 | # pavaduotojas 255 | pirm 256 | # pirmininkas 257 | pl 258 | # plentas 259 | plg 260 | # palygink 261 | plk 262 | # pulkininkas; pelkė 263 | pr 264 | # prospektas 265 | Kr 266 | pr.Kr 267 | # prieš Kristų 268 | prok 269 | # prokuroras 270 | prot 271 | # protokolas 272 | pss 273 | # pusiasalis 274 | pšt 275 | # paštas 276 | pvz 277 | # pavyzdžiui 278 | r 279 | # rajonas 280 | red 281 | # redaktorius 282 | rš 283 | # raštų kalbos 284 | sąs 285 | # sąsiuvinis 286 | saviv 287 | sav 288 | # savivaldybė 289 | sekr 290 | # sekretorius 291 | sen 292 | # seniūnija, seniūnas 293 | sk 294 | # skaityk; skyrius 295 | skg 296 | # skersgatvis 297 | skyr 298 | sk 299 | # 
skyrius 300 | skv 301 | # skveras 302 | sp 303 | # spauda; spaustuvė 304 | spec 305 | # specialistas 306 | sr 307 | # sritis 308 | st 309 | # stotis 310 | str 311 | # straipsnis 312 | stud 313 | # studentas 314 | š 315 | š.m 316 | # šių metų 317 | šnek 318 | # šnekamosios 319 | tir 320 | # tiražas 321 | tūkst 322 | # tūkstantis 323 | up 324 | # upė 325 | upl 326 | # upelis 327 | vad 328 | # vadinamasis, -oji 329 | vlsč 330 | # valsčius 331 | ved 332 | # vedėjas 333 | vet 334 | # veterinarija 335 | virš 336 | # viršininkas, viršaitis 337 | vyr 338 | # vyriausiasis, -ioji; vyras 339 | vyresn 340 | # vyresnysis 341 | vlsč 342 | # valsčius 343 | vs 344 | # viensėdis 345 | Vt 346 | vt 347 | # vietininkas 348 | vtv 349 | vv 350 | # vietovardis 351 | žml 352 | # žemėlapis 353 | 354 | # Technical terms, abbreviations used in guidebooks, advertisments, etc. 355 | # Generally lower-case. 356 | air 357 | # airiškai 358 | amer 359 | # amerikanizmas 360 | anat 361 | # anatomija 362 | angl 363 | # angl. angliskai 364 | arab 365 | # arabų 366 | archeol 367 | archit 368 | asm 369 | # asmuo 370 | astr 371 | # astronomija 372 | austral 373 | # australiškai 374 | aut 375 | # automobilis 376 | av 377 | # aviacija 378 | bažn 379 | bdv 380 | # būdvardis 381 | bibl 382 | # Biblija 383 | biol 384 | # biologija 385 | bot 386 | # botanika 387 | brt 388 | # burtai, burtažodis. 389 | brus 390 | # baltarusių 391 | buh 392 | # buhalterija 393 | chem 394 | # chemija 395 | col 396 | # collectivum 397 | con 398 | conj 399 | # conjunctivus, jungtukas 400 | dab 401 | # dab. 
dabartine 402 | dgs 403 | # daugiskaita 404 | dial 405 | # dialektizmas 406 | dipl 407 | dktv 408 | # daiktavardis 409 | džn 410 | # dažnai 411 | ekon 412 | el 413 | # elektra 414 | esam 415 | # esamasis laikas 416 | euf 417 | # eufemizmas 418 | fam 419 | # familiariai 420 | farm 421 | # farmacija 422 | filol 423 | # filologija 424 | filos 425 | # filosofija 426 | fin 427 | # finansai 428 | fiz 429 | # fizika 430 | fiziol 431 | # fiziologija 432 | flk 433 | # folkloras 434 | fon 435 | # fonetika 436 | fot 437 | # fotografija 438 | geod 439 | # geodezija 440 | geogr 441 | geol 442 | # geologija 443 | geom 444 | # geometrija 445 | glžk 446 | gr 447 | # graikų 448 | gram 449 | her 450 | # heraldika 451 | hidr 452 | # hidrotechnika 453 | ind 454 | # Indų 455 | iron 456 | # ironiškai 457 | isp 458 | # ispanų 459 | ist 460 | istor 461 | # istorija 462 | it 463 | # italų 464 | įv 465 | reikšm 466 | įv.reikšm 467 | # įvairiomis reikšmėmis 468 | jap 469 | # japonų 470 | juok 471 | # juokaujamai 472 | jūr 473 | # jūrininkystė 474 | kalb 475 | # kalbotyra 476 | kar 477 | # karyba 478 | kas 479 | # kasyba 480 | kin 481 | # kinematografija 482 | klaus 483 | # klausiamasis 484 | knyg 485 | # knyginis 486 | kom 487 | # komercija 488 | komp 489 | # kompiuteris 490 | kosm 491 | # kosmonautika 492 | kt 493 | # kitas 494 | kul 495 | # kulinarija 496 | kuop 497 | # kuopine 498 | l 499 | # laikas 500 | lit 501 | # literatūrinis 502 | lingv 503 | # lingvistika 504 | log 505 | # logika 506 | lot 507 | # lotynų 508 | mat 509 | # matematika 510 | maž 511 | # mažybinis 512 | med 513 | # medicina 514 | medž 515 | # medžioklė 516 | men 517 | # menas 518 | menk 519 | # menkinamai 520 | metal 521 | # metalurgija 522 | meteor 523 | min 524 | # mineralogija 525 | mit 526 | # mitologija 527 | mok 528 | # mokyklinis 529 | ms 530 | # mįslė 531 | muz 532 | # muzikinis 533 | n 534 | # naujasis 535 | neig 536 | # neigiamasis 537 | neol 538 | # neologizmas 539 | niek 540 | # niekinamai 541 | ofic 542 | 
# oficialus 543 | opt 544 | # optika 545 | orig 546 | # original 547 | p 548 | # pietūs 549 | pan 550 | # panašiai 551 | parl 552 | # parlamentas 553 | pat 554 | # patarlė 555 | paž 556 | # pažodžiui 557 | plg 558 | # palygink 559 | poet 560 | # poetizmas 561 | poez 562 | # poezija 563 | poligr 564 | # poligrafija 565 | polit 566 | # politika 567 | ppr 568 | # paprastai 569 | pranc 570 | pr 571 | # prancūzų, prūsų 572 | priet 573 | # prietaras 574 | prek 575 | # prekyba 576 | prk 577 | # perkeltine 578 | prs 579 | # persona, asmuo 580 | psn 581 | # pasenęs žodis 582 | psich 583 | # psichologija 584 | pvz 585 | # pavyzdžiui 586 | r 587 | # rytai 588 | rad 589 | # radiotechnika 590 | rel 591 | # religija 592 | ret 593 | # retai 594 | rus 595 | # rusų 596 | sen 597 | # senasis 598 | sl 599 | # slengas, slavų 600 | sov 601 | # sovietinis 602 | spec 603 | # specialus 604 | sport 605 | stat 606 | # statyba 607 | sudurt 608 | # sudurtinis 609 | sutr 610 | # sutrumpintas 611 | suv 612 | # suvalkiečių 613 | š 614 | # šiaurė 615 | šach 616 | # šachmatai 617 | šiaur 618 | škot 619 | # škotiškai 620 | šnek 621 | # šnekamoji 622 | teatr 623 | tech 624 | techn 625 | # technika 626 | teig 627 | # teigiamas 628 | teis 629 | # teisė 630 | tekst 631 | # tekstilė 632 | tel 633 | # telefonas 634 | teol 635 | # teologija 636 | v 637 | # tik vyriškosios, vakarai 638 | t.p 639 | t 640 | p 641 | # ir taip pat 642 | t.t 643 | # ir taip toliau 644 | t.y 645 | # tai yra 646 | vaik 647 | # vaikų 648 | vart 649 | # vartojama 650 | vet 651 | # veterinarija 652 | vid 653 | # vidurinis 654 | vksm 655 | # veiksmažodis 656 | vns 657 | # vienaskaita 658 | vok 659 | # vokiečių 660 | vulg 661 | # vulgariai 662 | zool 663 | # zoologija 664 | žr 665 | # žiūrėk 666 | ž.ū 667 | ž 668 | ū 669 | # žemės ūkis 670 | 671 | # List of titles. These are often followed by upper-case names, but do 672 | # not indicate sentence breaks 673 | # 674 | # Jo Eminencija 675 | Em. 
676 | # Gerbiamasis 677 | Gerb 678 | gerb 679 | # malonus 680 | malon 681 | # profesorius 682 | Prof 683 | prof 684 | # daktaras (mokslų) 685 | Dr 686 | dr 687 | habil 688 | med 689 | # inž inžinierius 690 | inž 691 | Inž 692 | 693 | 694 | #Numbers only. These should only induce breaks when followed by a numeric sequence 695 | # add NUMERIC_ONLY after the word for this function 696 | #This case is mostly for the english "No." which can either be a sentence of its own, or 697 | #if followed by a number, a non-breaking prefix 698 | No #NUMERIC_ONLY# 699 | -------------------------------------------------------------------------------- /bpe/learn_bpe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | """Use byte pair encoding (BPE) to learn a variable-length encoding of the vocabulary in a text. 6 | Unlike the original BPE, it does not compress the plain text, but can be used to reduce the vocabulary 7 | of a text to a configurable number of symbols, with only a small increase in the number of tokens. 8 | 9 | Reference: 10 | Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units. 11 | Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. 
"""
# (module docstring closed above; the reference text begins on the previous line)

from __future__ import unicode_literals

import sys
import codecs
import re
import copy
import argparse
from collections import defaultdict, Counter

# hack for python2/3 compatibility
from io import open
argparse.open = open

def create_parser():
    """Build the command-line argument parser for BPE learning.

    Returns:
        argparse.ArgumentParser with options: --input/-i, --output/-o,
        --symbols/-s (number of merge operations), --min-frequency,
        --dict-input (input is "word count" pairs), --verbose/-v.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="learn BPE-based word segmentation")

    parser.add_argument(
        '--input', '-i', type=argparse.FileType('r'), default=sys.stdin,
        metavar='PATH',
        help="Input text (default: standard input).")

    parser.add_argument(
        '--output', '-o', type=argparse.FileType('w'), default=sys.stdout,
        metavar='PATH',
        help="Output file for BPE codes (default: standard output)")
    parser.add_argument(
        '--symbols', '-s', type=int, default=10000,
        help="Create this many new symbols (each representing a character n-gram) (default: %(default)s))")
    parser.add_argument(
        '--min-frequency', type=int, default=2, metavar='FREQ',
        help='Stop if no symbol pair has frequency >= FREQ (default: %(default)s))')
    parser.add_argument('--dict-input', action="store_true",
        help="If set, input file is interpreted as a dictionary where each line contains a word-count pair")
    parser.add_argument(
        '--verbose', '-v', action="store_true",
        help="verbose mode.")

    return parser

def get_vocabulary(fobj, is_dict=False):
    """Read text and return dictionary that encodes vocabulary

    Args:
        fobj: open file object (or iterable of lines).
        is_dict: if True, each line is a "word count" pair; otherwise lines
            are whitespace-tokenized and token occurrences are counted.

    Returns:
        collections.Counter mapping word -> frequency.
    """
    vocab = Counter()
    for line in fobj:
        if is_dict:
            # each line is "word count"; a malformed line (not exactly two
            # fields) raises ValueError here
            word, count = line.strip().split()
            vocab[word] = int(count)
        else:
            for word in line.split():
                vocab[word] += 1
    return vocab

def update_pair_statistics(pair, changed, stats, indices):
    """Minimally update the indices and frequency of symbol pairs

    if we merge a pair of symbols, only pairs that overlap with occurrences
    of this pair are affected, and need to be updated.

    Args:
        pair: the (first, second) symbol pair that was just merged.
        changed: list of (word_index, new_word, old_word, freq) tuples for
            every vocabulary entry the merge touched.
        stats: dict mapping symbol pair -> frequency; updated in place.
        indices: dict mapping symbol pair -> {word_index: count}; updated
            in place.
    """
    # the merged pair no longer exists as a pair; reset its statistics
    stats[pair] = 0
    indices[pair] = defaultdict(int)
    first, second = pair
    new_pair = first+second
    for j, word, old_word, freq in changed:

        # find all instances of pair, and update frequency/indices around it
        i = 0
        while True:
            # find first symbol
            try:
                i = old_word.index(first, i)
            except ValueError:
                break
            # if first symbol is followed by second symbol, we've found an occurrence of pair (old_word[i:i+2])
            if i < len(old_word)-1 and old_word[i+1] == second:
                # assuming a symbol sequence "A B C", if "B C" is merged, reduce the frequency of "A B"
                if i:
                    prev = old_word[i-1:i+1]
                    stats[prev] -= freq
                    indices[prev][j] -= 1
                if i < len(old_word)-2:
                    # assuming a symbol sequence "A B C B", if "B C" is merged, reduce the frequency of "C B".
                    # however, skip this if the sequence is A B C B C, because the frequency of "C B" will be reduced by the previous code block
                    if old_word[i+2] != first or i >= len(old_word)-3 or old_word[i+3] != second:
                        nex = old_word[i+1:i+3]
                        stats[nex] -= freq
                        indices[nex][j] -= 1
                # skip past both symbols of the matched pair
                i += 2
            else:
                i += 1

        i = 0
        while True:
            try:
                # find new pair
                i = word.index(new_pair, i)
            except ValueError:
                break
            # assuming a symbol sequence "A BC D", if "B C" is merged, increase the frequency of "A BC"
            if i:
                prev = word[i-1:i+1]
                stats[prev] += freq
                indices[prev][j] += 1
            # assuming a symbol sequence "A BC B", if "B C" is merged, increase the frequency of "BC B"
            # however, if the sequence is A BC BC, skip this step because the count of "BC BC" will be incremented by the previous code block
            if i < len(word)-1 and word[i+1] != new_pair:
                nex = word[i:i+2]
                stats[nex] += freq
                indices[nex][j] += 1
            i += 1

127 | def get_pair_statistics(vocab): 128 | """Count frequency of all symbol pairs, and create index""" 129 | 130 | # data structure of pair frequencies 131 | stats = defaultdict(int) 132 | 133 | #index from pairs to words 134 | indices = defaultdict(lambda: defaultdict(int)) 135 | 136 | for i, (word, freq) in enumerate(vocab): 137 | prev_char = word[0] 138 | for char in word[1:]: 139 | stats[prev_char, char] += freq 140 | indices[prev_char, char][i] += 1 141 | prev_char = char 142 | 143 | return stats, indices 144 | 145 | 146 | def replace_pair(pair, vocab, indices): 147 | """Replace all occurrences of a symbol pair ('A', 'B') with a new symbol 'AB'""" 148 | first, second = pair 149 | pair_str = ''.join(pair) 150 | pair_str = pair_str.replace('\\','\\\\') 151 | changes = [] 152 | pattern = re.compile(r'(?'); 191 | # version numbering allows bckward compatibility 192 | outfile.write('#version: 0.2\n') 193 | 194 | vocab = get_vocabulary(infile, is_dict) 195 | vocab = dict([(tuple(x[:-1])+(x[-1]+'',) ,y) for (x,y) in vocab.items()]) 196 | sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True) 197 | 198 | stats, indices = get_pair_statistics(sorted_vocab) 199 | big_stats = copy.deepcopy(stats) 200 | # threshold is inspired by Zipfian assumption, but should only affect speed 201 | threshold = max(stats.values()) / 10 202 | for i in range(num_symbols): 203 | if stats: 204 | most_frequent = max(stats, key=lambda x: (stats[x], x)) 205 | 206 | # we probably missed the best pair because of pruning; go back to full statistics 207 | if not stats or (i and stats[most_frequent] < threshold): 208 | prune_stats(stats, big_stats, threshold) 209 | stats = copy.deepcopy(big_stats) 210 | most_frequent = max(stats, key=lambda x: (stats[x], x)) 211 | # threshold is inspired by Zipfian assumption, but should only affect speed 212 | threshold = stats[most_frequent] * i/(i+10000.0) 213 | prune_stats(stats, big_stats, threshold) 214 | 215 | if stats[most_frequent] < 
min_frequency: 216 | sys.stderr.write('no pair has frequency >= {0}. Stopping\n'.format(min_frequency)) 217 | break 218 | 219 | if verbose: 220 | sys.stderr.write('pair {0}: {1} {2} -> {1}{2} (frequency {3})\n'.format(i, most_frequent[0], most_frequent[1], stats[most_frequent])) 221 | outfile.write('{0} {1}\n'.format(*most_frequent)) 222 | changes = replace_pair(most_frequent, sorted_vocab, indices) 223 | update_pair_statistics(most_frequent, changes, stats, indices) 224 | stats[most_frequent] = 0 225 | if not i % 100: 226 | prune_stats(stats, big_stats, threshold) 227 | 228 | 229 | if __name__ == '__main__': 230 | 231 | # python 2/3 compatibility 232 | if sys.version_info < (3, 0): 233 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) 234 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) 235 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin) 236 | else: 237 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer) 238 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer) 239 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer) 240 | 241 | parser = create_parser() 242 | args = parser.parse_args() 243 | 244 | # read/write files as UTF-8 245 | if args.input.name != '': 246 | args.input = codecs.open(args.input.name, encoding='utf-8') 247 | if args.output.name != '': 248 | args.output = codecs.open(args.output.name, 'w', encoding='utf-8') 249 | 250 | main(args.input, args.output, args.symbols, args.min_frequency, args.verbose, is_dict=args.dict_input) 251 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /bpe/apply_bpe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | """Use operations learned with learn_bpe.py to encode a new text. 6 | The text will not be smaller, but use only a fixed vocabulary, with rare words 7 | encoded as variable-length sequences of subword units. 8 | 9 | Reference: 10 | Rico Sennrich, Barry Haddow and Alexandra Birch (2015). Neural Machine Translation of Rare Words with Subword Units. 11 | Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. 
12 | """ 13 | 14 | from __future__ import unicode_literals, division 15 | 16 | import sys 17 | import codecs 18 | import io 19 | import argparse 20 | import re 21 | 22 | # hack for python2/3 compatibility 23 | from io import open 24 | argparse.open = open 25 | 26 | class BPE(object): 27 | 28 | def __init__(self, codes, merges=-1, separator='@@', vocab=None, glossaries=None): 29 | 30 | # check version information 31 | firstline = codes.readline() 32 | if firstline.startswith('#version:'): 33 | self.version = tuple([int(x) for x in re.sub(r'(\.0+)*$','', firstline.split()[-1]).split(".")]) 34 | else: 35 | self.version = (0, 1) 36 | codes.seek(0) 37 | 38 | self.bpe_codes = [tuple(item.split()) for (n, item) in enumerate(codes) if (n < merges or merges == -1)] 39 | 40 | # some hacking to deal with duplicates (only consider first instance) 41 | self.bpe_codes = dict([(code,i) for (i,code) in reversed(list(enumerate(self.bpe_codes)))]) 42 | 43 | self.bpe_codes_reverse = dict([(pair[0] + pair[1], pair) for pair,i in self.bpe_codes.items()]) 44 | 45 | self.separator = separator 46 | 47 | self.vocab = vocab 48 | 49 | self.glossaries = glossaries if glossaries else [] 50 | 51 | self.cache = {} 52 | 53 | def segment(self, sentence): 54 | """segment single sentence (whitespace-tokenized string) with BPE encoding""" 55 | output = [] 56 | for word in sentence.split(): 57 | new_word = [out for segment in self._isolate_glossaries(word) 58 | for out in encode(segment, 59 | self.bpe_codes, 60 | self.bpe_codes_reverse, 61 | self.vocab, 62 | self.separator, 63 | self.version, 64 | self.cache, 65 | self.glossaries)] 66 | 67 | for item in new_word[:-1]: 68 | output.append(item + self.separator) 69 | output.append(new_word[-1]) 70 | 71 | return ' '.join(output) 72 | 73 | def _isolate_glossaries(self, word): 74 | word_segments = [word] 75 | for gloss in self.glossaries: 76 | word_segments = [out_segments for segment in word_segments 77 | for out_segments in isolate_glossary(segment, 
gloss)] 78 | return word_segments 79 | 80 | def create_parser(): 81 | parser = argparse.ArgumentParser( 82 | formatter_class=argparse.RawDescriptionHelpFormatter, 83 | description="learn BPE-based word segmentation") 84 | 85 | parser.add_argument( 86 | '--input', '-i', type=argparse.FileType('r'), default=sys.stdin, 87 | metavar='PATH', 88 | help="Input file (default: standard input).") 89 | parser.add_argument( 90 | '--codes', '-c', type=argparse.FileType('r'), metavar='PATH', 91 | required=True, 92 | help="File with BPE codes (created by learn_bpe.py).") 93 | parser.add_argument( 94 | '--merges', '-m', type=int, default=-1, 95 | metavar='INT', 96 | help="Use this many BPE operations (<= number of learned symbols)"+ 97 | "default: Apply all the learned merge operations") 98 | parser.add_argument( 99 | '--output', '-o', type=argparse.FileType('w'), default=sys.stdout, 100 | metavar='PATH', 101 | help="Output file (default: standard output)") 102 | parser.add_argument( 103 | '--separator', '-s', type=str, default='@@', metavar='STR', 104 | help="Separator between non-final subword units (default: '%(default)s'))") 105 | parser.add_argument( 106 | '--vocabulary', type=argparse.FileType('r'), default=None, 107 | metavar="PATH", 108 | help="Vocabulary file (built with get_vocab.py). If provided, this script reverts any merge operations that produce an OOV.") 109 | parser.add_argument( 110 | '--vocabulary-threshold', type=int, default=None, 111 | metavar="INT", 112 | help="Vocabulary threshold. If vocabulary is provided, any word with frequency < threshold will be treated as OOV") 113 | parser.add_argument( 114 | '--glossaries', type=str, nargs='+', default=None, 115 | metavar="STR", 116 | help="Glossaries. The strings provided in glossaries will not be affected"+ 117 | "by the BPE (i.e. 
they will neither be broken into subwords, nor concatenated with other subwords") 118 | 119 | return parser 120 | 121 | def get_pairs(word): 122 | """Return set of symbol pairs in a word. 123 | 124 | word is represented as tuple of symbols (symbols being variable-length strings) 125 | """ 126 | pairs = set() 127 | prev_char = word[0] 128 | for char in word[1:]: 129 | pairs.add((prev_char, char)) 130 | prev_char = char 131 | return pairs 132 | 133 | def encode(orig, bpe_codes, bpe_codes_reverse, vocab, separator, version, cache, glossaries=None): 134 | """Encode word based on list of BPE merge operations, which are applied consecutively 135 | """ 136 | 137 | if orig in cache: 138 | return cache[orig] 139 | 140 | if orig in glossaries: 141 | cache[orig] = (orig,) 142 | return (orig,) 143 | 144 | if version == (0, 1): 145 | word = tuple(orig) + ('',) 146 | elif version == (0, 2): # more consistent handling of word-final segments 147 | word = tuple(orig[:-1]) + ( orig[-1] + '',) 148 | else: 149 | raise NotImplementedError 150 | 151 | pairs = get_pairs(word) 152 | 153 | if not pairs: 154 | return orig 155 | 156 | while True: 157 | bigram = min(pairs, key = lambda pair: bpe_codes.get(pair, float('inf'))) 158 | if bigram not in bpe_codes: 159 | break 160 | first, second = bigram 161 | new_word = [] 162 | i = 0 163 | while i < len(word): 164 | try: 165 | j = word.index(first, i) 166 | new_word.extend(word[i:j]) 167 | i = j 168 | except: 169 | new_word.extend(word[i:]) 170 | break 171 | 172 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 173 | new_word.append(first+second) 174 | i += 2 175 | else: 176 | new_word.append(word[i]) 177 | i += 1 178 | new_word = tuple(new_word) 179 | word = new_word 180 | if len(word) == 1: 181 | break 182 | else: 183 | pairs = get_pairs(word) 184 | 185 | # don't print end-of-word symbols 186 | if word[-1] == '': 187 | word = word[:-1] 188 | elif word[-1].endswith(''): 189 | word = word[:-1] + (word[-1].replace('',''),) 190 | 
191 | if vocab: 192 | word = check_vocab_and_split(word, bpe_codes_reverse, vocab, separator) 193 | 194 | cache[orig] = word 195 | return word 196 | 197 | def recursive_split(segment, bpe_codes, vocab, separator, final=False): 198 | """Recursively split segment into smaller units (by reversing BPE merges) 199 | until all units are either in-vocabulary, or cannot be split futher.""" 200 | 201 | try: 202 | if final: 203 | left, right = bpe_codes[segment + ''] 204 | right = right[:-4] 205 | else: 206 | left, right = bpe_codes[segment] 207 | except: 208 | #sys.stderr.write('cannot split {0} further.\n'.format(segment)) 209 | yield segment 210 | return 211 | 212 | if left + separator in vocab: 213 | yield left 214 | else: 215 | for item in recursive_split(left, bpe_codes, vocab, separator, False): 216 | yield item 217 | 218 | if (final and right in vocab) or (not final and right + separator in vocab): 219 | yield right 220 | else: 221 | for item in recursive_split(right, bpe_codes, vocab, separator, final): 222 | yield item 223 | 224 | def check_vocab_and_split(orig, bpe_codes, vocab, separator): 225 | """Check for each segment in word if it is in-vocabulary, 226 | and segment OOV segments into smaller units by reversing the BPE merge operations""" 227 | 228 | out = [] 229 | 230 | for segment in orig[:-1]: 231 | if segment + separator in vocab: 232 | out.append(segment) 233 | else: 234 | #sys.stderr.write('OOV: {0}\n'.format(segment)) 235 | for item in recursive_split(segment, bpe_codes, vocab, separator, False): 236 | out.append(item) 237 | 238 | segment = orig[-1] 239 | if segment in vocab: 240 | out.append(segment) 241 | else: 242 | #sys.stderr.write('OOV: {0}\n'.format(segment)) 243 | for item in recursive_split(segment, bpe_codes, vocab, separator, True): 244 | out.append(item) 245 | 246 | return out 247 | 248 | 249 | def read_vocabulary(vocab_file, threshold): 250 | """read vocabulary file produced by get_vocab.py, and filter according to frequency threshold. 
251 | """ 252 | 253 | vocabulary = set() 254 | 255 | for line in vocab_file: 256 | word, freq = line.split() 257 | freq = int(freq) 258 | if threshold == None or freq >= threshold: 259 | vocabulary.add(word) 260 | 261 | return vocabulary 262 | 263 | def isolate_glossary(word, glossary): 264 | """ 265 | Isolate a glossary present inside a word. 266 | 267 | Returns a list of subwords. In which all 'glossary' glossaries are isolated 268 | 269 | For example, if 'USA' is the glossary and '1934USABUSA' the word, the return value is: 270 | ['1934', 'USA', 'B', 'USA'] 271 | """ 272 | if word == glossary or glossary not in word: 273 | return [word] 274 | else: 275 | splits = word.split(glossary) 276 | segments = [segment.strip() for split in splits[:-1] for segment in [split, glossary] if segment != ''] 277 | return segments + [splits[-1].strip()] if splits[-1] != '' else segments 278 | 279 | if __name__ == '__main__': 280 | 281 | # python 2/3 compatibility 282 | if sys.version_info < (3, 0): 283 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) 284 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) 285 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin) 286 | else: 287 | sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') 288 | sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8') 289 | sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', write_through=True, line_buffering=True) 290 | 291 | parser = create_parser() 292 | args = parser.parse_args() 293 | 294 | # read/write files as UTF-8 295 | args.codes = codecs.open(args.codes.name, encoding='utf-8') 296 | if args.input.name != '': 297 | args.input = codecs.open(args.input.name, encoding='utf-8') 298 | if args.output.name != '': 299 | args.output = codecs.open(args.output.name, 'w', encoding='utf-8') 300 | if args.vocabulary: 301 | args.vocabulary = codecs.open(args.vocabulary.name, encoding='utf-8') 302 | 303 | if args.vocabulary: 304 | vocabulary = 
read_vocabulary(args.vocabulary, args.vocabulary_threshold) 305 | else: 306 | vocabulary = None 307 | 308 | bpe = BPE(args.codes, args.merges, args.separator, vocabulary, args.glossaries) 309 | 310 | for line in args.input: 311 | args.output.write(bpe.segment(line).strip()) 312 | args.output.write('\n') 313 | -------------------------------------------------------------------------------- /scripts/tokenizer.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 5 | 6 | use warnings; 7 | 8 | # Sample Tokenizer 9 | ### Version 1.1 10 | # written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn 11 | # Version 1.1 updates: 12 | # (1) add multithreading option "-threads NUM_THREADS" (default is 1); 13 | # (2) add a timing option "-time" to calculate the average speed of this tokenizer; 14 | # (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed); 15 | ### Version 1.0 16 | # $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $ 17 | # written by Josh Schroeder, based on code by Philipp Koehn 18 | 19 | binmode(STDIN, ":utf8"); 20 | binmode(STDOUT, ":utf8"); 21 | 22 | use warnings; 23 | use FindBin qw($RealBin); 24 | use strict; 25 | use Time::HiRes; 26 | 27 | if (eval {require Thread;1;}) { 28 | #module loaded 29 | Thread->import(); 30 | } 31 | 32 | my $mydir = "$RealBin//nonbreaking_prefixes"; 33 | 34 | my %NONBREAKING_PREFIX = (); 35 | my @protected_patterns = (); 36 | my $protected_patterns_file = ""; 37 | my $language = "en"; 38 | my $QUIET = 0; 39 | my $HELP = 0; 40 | my $AGGRESSIVE = 0; 41 | my $SKIP_XML = 0; 42 | my $TIMING = 
0; 43 | my $NUM_THREADS = 1; 44 | my $NUM_SENTENCES_PER_THREAD = 2000; 45 | my $PENN = 0; 46 | my $NO_ESCAPING = 0; 47 | while (@ARGV) 48 | { 49 | $_ = shift; 50 | /^-b$/ && ($| = 1, next); 51 | /^-l$/ && ($language = shift, next); 52 | /^-q$/ && ($QUIET = 1, next); 53 | /^-h$/ && ($HELP = 1, next); 54 | /^-x$/ && ($SKIP_XML = 1, next); 55 | /^-a$/ && ($AGGRESSIVE = 1, next); 56 | /^-time$/ && ($TIMING = 1, next); 57 | # Option to add list of regexps to be protected 58 | /^-protected/ && ($protected_patterns_file = shift, next); 59 | /^-threads$/ && ($NUM_THREADS = int(shift), next); 60 | /^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next); 61 | /^-penn$/ && ($PENN = 1, next); 62 | /^-no-escape/ && ($NO_ESCAPING = 1, next); 63 | } 64 | 65 | # for time calculation 66 | my $start_time; 67 | if ($TIMING) 68 | { 69 | $start_time = [ Time::HiRes::gettimeofday( ) ]; 70 | } 71 | 72 | # print help message 73 | if ($HELP) 74 | { 75 | print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n"; 76 | print "Options:\n"; 77 | print " -q ... quiet.\n"; 78 | print " -a ... aggressive hyphen splitting.\n"; 79 | print " -b ... disable Perl buffering.\n"; 80 | print " -time ... enable processing time calculation.\n"; 81 | print " -penn ... use Penn treebank-like tokenization.\n"; 82 | print " -protected FILE ... specify file with patters to be protected in tokenisation.\n"; 83 | print " -no-escape ... 
don't perform HTML escaping on apostrophy, quotes, etc.\n"; 84 | exit; 85 | } 86 | 87 | if (!$QUIET) 88 | { 89 | print STDERR "Tokenizer Version 1.1\n"; 90 | print STDERR "Language: $language\n"; 91 | print STDERR "Number of threads: $NUM_THREADS\n"; 92 | } 93 | 94 | # load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes 95 | load_prefixes($language,\%NONBREAKING_PREFIX); 96 | 97 | if (scalar(%NONBREAKING_PREFIX) eq 0) 98 | { 99 | print STDERR "Warning: No known abbreviations for language '$language'\n"; 100 | } 101 | 102 | # Load protected patterns 103 | if ($protected_patterns_file) 104 | { 105 | open(PP,$protected_patterns_file) || die "Unable to open $protected_patterns_file"; 106 | while() { 107 | chomp; 108 | push @protected_patterns, $_; 109 | } 110 | } 111 | 112 | my @batch_sentences = (); 113 | my @thread_list = (); 114 | my $count_sentences = 0; 115 | 116 | if ($NUM_THREADS > 1) 117 | {# multi-threading tokenization 118 | while() 119 | { 120 | $count_sentences = $count_sentences + 1; 121 | push(@batch_sentences, $_); 122 | if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS)) 123 | { 124 | # assign each thread work 125 | for (my $i=0; $i<$NUM_THREADS; $i++) 126 | { 127 | my $start_index = $i*$NUM_SENTENCES_PER_THREAD; 128 | my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1; 129 | my @subbatch_sentences = @batch_sentences[$start_index..$end_index]; 130 | my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences; 131 | push(@thread_list, $new_thread); 132 | } 133 | foreach (@thread_list) 134 | { 135 | my $tokenized_list = $_->join; 136 | foreach (@$tokenized_list) 137 | { 138 | print $_; 139 | } 140 | } 141 | # reset for the new run 142 | @thread_list = (); 143 | @batch_sentences = (); 144 | } 145 | } 146 | # the last batch 147 | if (scalar(@batch_sentences)>0) 148 | { 149 | # assign each thread work 150 | for (my $i=0; $i<$NUM_THREADS; $i++) 151 | { 152 | my $start_index = 
$i*$NUM_SENTENCES_PER_THREAD; 153 | if ($start_index >= scalar(@batch_sentences)) 154 | { 155 | last; 156 | } 157 | my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1; 158 | if ($end_index >= scalar(@batch_sentences)) 159 | { 160 | $end_index = scalar(@batch_sentences)-1; 161 | } 162 | my @subbatch_sentences = @batch_sentences[$start_index..$end_index]; 163 | my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences; 164 | push(@thread_list, $new_thread); 165 | } 166 | foreach (@thread_list) 167 | { 168 | my $tokenized_list = $_->join; 169 | foreach (@$tokenized_list) 170 | { 171 | print $_; 172 | } 173 | } 174 | } 175 | } 176 | else 177 | {# single thread only 178 | while() 179 | { 180 | if (($SKIP_XML && /^<.+>$/) || /^\s*$/) 181 | { 182 | #don't try to tokenize XML/HTML tag lines 183 | print $_; 184 | } 185 | else 186 | { 187 | print &tokenize($_); 188 | } 189 | } 190 | } 191 | 192 | if ($TIMING) 193 | { 194 | my $duration = Time::HiRes::tv_interval( $start_time ); 195 | print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n"); 196 | print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." 
milliseconds/line\n"); 197 | } 198 | 199 | ##################################################################################### 200 | # subroutines afterward 201 | 202 | # tokenize a batch of texts saved in an array 203 | # input: an array containing a batch of texts 204 | # return: another array containing a batch of tokenized texts for the input array 205 | sub tokenize_batch 206 | { 207 | my(@text_list) = @_; 208 | my(@tokenized_list) = (); 209 | foreach (@text_list) 210 | { 211 | if (($SKIP_XML && /^<.+>$/) || /^\s*$/) 212 | { 213 | #don't try to tokenize XML/HTML tag lines 214 | push(@tokenized_list, $_); 215 | } 216 | else 217 | { 218 | push(@tokenized_list, &tokenize($_)); 219 | } 220 | } 221 | return \@tokenized_list; 222 | } 223 | 224 | # the actual tokenize function which tokenizes one input string 225 | # input: one string 226 | # return: the tokenized string for the input string 227 | sub tokenize 228 | { 229 | my($text) = @_; 230 | 231 | if ($PENN) { 232 | return tokenize_penn($text); 233 | } 234 | 235 | chomp($text); 236 | $text = " $text "; 237 | 238 | # remove ASCII junk 239 | $text =~ s/\s+/ /g; 240 | $text =~ s/[\000-\037]//g; 241 | 242 | # Find protected patterns 243 | my @protected = (); 244 | foreach my $protected_pattern (@protected_patterns) { 245 | my $t = $text; 246 | while ($t =~ /(?$protected_pattern)(?.*)$/) { 247 | push @protected, $+{PATTERN}; 248 | $t = $+{TAIL}; 249 | } 250 | } 251 | 252 | for (my $i = 0; $i < scalar(@protected); ++$i) { 253 | my $subst = sprintf("THISISPROTECTED%.3d", $i); 254 | $text =~ s,\Q$protected[$i], $subst ,g; 255 | } 256 | $text =~ s/ +/ /g; 257 | $text =~ s/^ //g; 258 | $text =~ s/ $//g; 259 | 260 | # seperate out all "other" special characters 261 | $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g; 262 | 263 | # aggressive hyphen splitting 264 | if ($AGGRESSIVE) 265 | { 266 | $text =~ s/([\p{IsAlnum}])\-(?=[\p{IsAlnum}])/$1 \@-\@ /g; 267 | } 268 | 269 | #multi-dots stay together 270 | $text =~ s/\.([\.]+)/ 
DOTMULTI$1/g; 271 | while($text =~ /DOTMULTI\./) 272 | { 273 | $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g; 274 | $text =~ s/DOTMULTI\./DOTDOTMULTI/g; 275 | } 276 | 277 | # seperate out "," except if within numbers (5,300) 278 | #$text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; 279 | 280 | # separate out "," except if within numbers (5,300) 281 | # previous "global" application skips some: A,B,C,D,E > A , B,C , D,E 282 | # first application uses up B so rule can't see B,C 283 | # two-step version here may create extra spaces but these are removed later 284 | # will also space digit,letter or letter,digit forms (redundant with next section) 285 | $text =~ s/([^\p{IsN}])[,]/$1 , /g; 286 | $text =~ s/[,]([^\p{IsN}])/ , $1/g; 287 | 288 | # separate "," after a number if it's the end of a sentence 289 | $text =~ s/([\p{IsN}])[,]$/$1 ,/g; 290 | 291 | # separate , pre and post number 292 | #$text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; 293 | #$text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g; 294 | 295 | # turn `into ' 296 | #$text =~ s/\`/\'/g; 297 | 298 | #turn '' into " 299 | #$text =~ s/\'\'/ \" /g; 300 | 301 | if ($language eq "en") 302 | { 303 | #split contractions right 304 | $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 305 | $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g; 306 | $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 307 | $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g; 308 | #special case for "1990's" 309 | $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g; 310 | } 311 | elsif (($language eq "fr") or ($language eq "it") or ($language eq "ga")) 312 | { 313 | #split contractions left 314 | $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 315 | $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g; 316 | $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 317 | $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g; 318 | } 319 | else 320 | { 321 | $text =~ s/\'/ \' /g; 322 | } 323 | 324 | #word token 
method 325 | my @words = split(/\s/,$text); 326 | $text = ""; 327 | for (my $i=0;$i<(scalar(@words));$i++) 328 | { 329 | my $word = $words[$i]; 330 | if ( $word =~ /^(\S+)\.$/) 331 | { 332 | my $pre = $1; 333 | if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i/\>/g; # xml 377 | $text =~ s/\'/\'/g; # xml 378 | $text =~ s/\"/\"/g; # xml 379 | $text =~ s/\[/\[/g; # syntax non-terminal 380 | $text =~ s/\]/\]/g; # syntax non-terminal 381 | } 382 | 383 | #ensure final line break 384 | $text .= "\n" unless $text =~ /\n$/; 385 | 386 | return $text; 387 | } 388 | 389 | sub tokenize_penn 390 | { 391 | # Improved compatibility with Penn Treebank tokenization. Useful if 392 | # the text is to later be parsed with a PTB-trained parser. 393 | # 394 | # Adapted from Robert MacIntyre's sed script: 395 | # http://www.cis.upenn.edu/~treebank/tokenizer.sed 396 | 397 | my($text) = @_; 398 | chomp($text); 399 | 400 | # remove ASCII junk 401 | $text =~ s/\s+/ /g; 402 | $text =~ s/[\000-\037]//g; 403 | 404 | # attempt to get correct directional quotes 405 | $text =~ s/^``/`` /g; 406 | $text =~ s/^"/`` /g; 407 | $text =~ s/^`([^`])/` $1/g; 408 | $text =~ s/^'/` /g; 409 | $text =~ s/([ ([{<])"/$1 `` /g; 410 | $text =~ s/([ ([{<])``/$1 `` /g; 411 | $text =~ s/([ ([{<])`([^`])/$1 ` $2/g; 412 | $text =~ s/([ ([{<])'/$1 ` /g; 413 | # close quotes handled at end 414 | 415 | $text =~ s=\.\.\.= _ELLIPSIS_ =g; 416 | 417 | # separate out "," except if within numbers (5,300) 418 | $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; 419 | # separate , pre and post number 420 | $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; 421 | $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g; 422 | 423 | #$text =~ s=([;:@#\$%&\p{IsSc}])= $1 =g; 424 | $text =~ s=([;:@#\$%&\p{IsSc}\p{IsSo}])= $1 =g; 425 | 426 | # Separate out intra-token slashes. 
PTB tokenization doesn't do this, so 427 | # the tokens should be merged prior to parsing with a PTB-trained parser 428 | # (see syntax-hyphen-splitting.perl). 429 | $text =~ s/([\p{IsAlnum}])\/([\p{IsAlnum}])/$1 \@\/\@ $2/g; 430 | 431 | # Assume sentence tokenization has been done first, so split FINAL periods 432 | # only. 433 | $text =~ s=([^.])([.])([\]\)}>"']*) ?$=$1 $2$3 =g; 434 | # however, we may as well split ALL question marks and exclamation points, 435 | # since they shouldn't have the abbrev.-marker ambiguity problem 436 | $text =~ s=([?!])= $1 =g; 437 | 438 | # parentheses, brackets, etc. 439 | $text =~ s=([\]\[\(\){}<>])= $1 =g; 440 | $text =~ s/\(/-LRB-/g; 441 | $text =~ s/\)/-RRB-/g; 442 | $text =~ s/\[/-LSB-/g; 443 | $text =~ s/\]/-RSB-/g; 444 | $text =~ s/{/-LCB-/g; 445 | $text =~ s/}/-RCB-/g; 446 | 447 | $text =~ s=--= -- =g; 448 | 449 | # First off, add a space to the beginning and end of each line, to reduce 450 | # necessary number of regexps. 451 | $text =~ s=$= =; 452 | $text =~ s=^= =; 453 | 454 | $text =~ s="= '' =g; 455 | # possessive or close-single-quote 456 | $text =~ s=([^'])' =$1 ' =g; 457 | # as in it's, I'm, we'd 458 | $text =~ s='([sSmMdD]) = '$1 =g; 459 | $text =~ s='ll = 'll =g; 460 | $text =~ s='re = 're =g; 461 | $text =~ s='ve = 've =g; 462 | $text =~ s=n't = n't =g; 463 | $text =~ s='LL = 'LL =g; 464 | $text =~ s='RE = 'RE =g; 465 | $text =~ s='VE = 'VE =g; 466 | $text =~ s=N'T = N'T =g; 467 | 468 | $text =~ s= ([Cc])annot = $1an not =g; 469 | $text =~ s= ([Dd])'ye = $1' ye =g; 470 | $text =~ s= ([Gg])imme = $1im me =g; 471 | $text =~ s= ([Gg])onna = $1on na =g; 472 | $text =~ s= ([Gg])otta = $1ot ta =g; 473 | $text =~ s= ([Ll])emme = $1em me =g; 474 | $text =~ s= ([Mm])ore'n = $1ore 'n =g; 475 | $text =~ s= '([Tt])is = '$1 is =g; 476 | $text =~ s= '([Tt])was = '$1 was =g; 477 | $text =~ s= ([Ww])anna = $1an na =g; 478 | 479 | #word token method 480 | my @words = split(/\s/,$text); 481 | $text = ""; 482 | for (my 
$i=0;$i<(scalar(@words));$i++) 483 | { 484 | my $word = $words[$i]; 485 | if ( $word =~ /^(\S+)\.$/) 486 | { 487 | my $pre = $1; 488 | if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i/\>/g; # xml 517 | $text =~ s/\'/\'/g; # xml 518 | $text =~ s/\"/\"/g; # xml 519 | $text =~ s/\[/\[/g; # syntax non-terminal 520 | $text =~ s/\]/\]/g; # syntax non-terminal 521 | 522 | #ensure final line break 523 | $text .= "\n" unless $text =~ /\n$/; 524 | 525 | return $text; 526 | } 527 | 528 | sub load_prefixes 529 | { 530 | my ($language, $PREFIX_REF) = @_; 531 | 532 | my $prefixfile = "$mydir/nonbreaking_prefix.$language"; 533 | 534 | #default back to English if we don't have a language-specific prefix file 535 | if (!(-e $prefixfile)) 536 | { 537 | $prefixfile = "$mydir/nonbreaking_prefix.en"; 538 | print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n"; 539 | die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile); 540 | } 541 | 542 | if (-e "$prefixfile") 543 | { 544 | open(PREFIX, "<:utf8", "$prefixfile"); 545 | while () 546 | { 547 | my $item = $_; 548 | chomp($item); 549 | if (($item) && (substr($item,0,1) ne "#")) 550 | { 551 | if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) 552 | { 553 | $PREFIX_REF->{$1} = 2; 554 | } 555 | else 556 | { 557 | $PREFIX_REF->{$item} = 1; 558 | } 559 | } 560 | } 561 | close(PREFIX); 562 | } 563 | } 564 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.el: -------------------------------------------------------------------------------- 1 | # Sigle letters in upper-case are usually abbreviations of names 2 | Α 3 | Β 4 | Γ 5 | Δ 6 | Ε 7 | Ζ 8 | Η 9 | Θ 10 | Ι 11 | Κ 12 | Λ 13 | Μ 14 | Ν 15 | Ξ 16 | Ο 17 | Π 18 | Ρ 19 | Σ 20 | Τ 21 | Υ 22 | Φ 23 | Χ 24 | Ψ 25 | Ω 26 | 27 | # Includes abbreviations for the Greek language 
compiled from various sources (Greek grammar books, Greek language related web content). 28 | Άθαν 29 | Έγχρ 30 | Έκθ 31 | Έσδ 32 | Έφ 33 | Όμ 34 | Α΄Έσδρ 35 | Α΄Έσδ 36 | Α΄Βασ 37 | Α΄Θεσ 38 | Α΄Ιω 39 | Α΄Κορινθ 40 | Α΄Κορ 41 | Α΄Μακκ 42 | Α΄Μακ 43 | Α΄Πέτρ 44 | Α΄Πέτ 45 | Α΄Παραλ 46 | Α΄Πε 47 | Α΄Σαμ 48 | Α΄Τιμ 49 | Α΄Χρον 50 | Α΄Χρ 51 | Α.Β.Α 52 | Α.Β 53 | Α.Ε 54 | Α.Κ.Τ.Ο 55 | Αέθλ 56 | Αέτ 57 | Αίλ.Δ 58 | Αίλ.Τακτ 59 | Αίσ 60 | Αββακ 61 | Αβυδ 62 | Αβ 63 | Αγάκλ 64 | Αγάπ 65 | Αγάπ.Αμαρτ.Σ 66 | Αγάπ.Γεωπ 67 | Αγαθάγγ 68 | Αγαθήμ 69 | Αγαθιν 70 | Αγαθοκλ 71 | Αγαθρχ 72 | Αγαθ 73 | Αγαθ.Ιστ 74 | Αγαλλ 75 | Αγαπητ 76 | Αγγ 77 | Αγησ 78 | Αγλ 79 | Αγορ.Κ 80 | Αγρο.Κωδ 81 | Αγρ.Εξ 82 | Αγρ.Κ 83 | Αγ.Γρ 84 | Αδριαν 85 | Αδρ 86 | Αετ 87 | Αθάν 88 | Αθήν 89 | Αθήν.Επιγρ 90 | Αθήν.Επιτ 91 | Αθήν.Ιατρ 92 | Αθήν.Μηχ 93 | Αθανάσ 94 | Αθαν 95 | Αθηνί 96 | Αθηναγ 97 | Αθηνόδ 98 | Αθ 99 | Αθ.Αρχ 100 | Αιλ 101 | Αιλ.Επιστ 102 | Αιλ.ΖΙ 103 | Αιλ.ΠΙ 104 | Αιλ.απ 105 | Αιμιλ 106 | Αιν.Γαζ 107 | Αιν.Τακτ 108 | Αισχίν 109 | Αισχίν.Επιστ 110 | Αισχ 111 | Αισχ.Αγαμ 112 | Αισχ.Αγ 113 | Αισχ.Αλ 114 | Αισχ.Ελεγ 115 | Αισχ.Επτ.Θ 116 | Αισχ.Ευμ 117 | Αισχ.Ικέτ 118 | Αισχ.Ικ 119 | Αισχ.Περσ 120 | Αισχ.Προμ.Δεσμ 121 | Αισχ.Πρ 122 | Αισχ.Χοηφ 123 | Αισχ.Χο 124 | Αισχ.απ 125 | ΑιτΕ 126 | Αιτ 127 | Αλκ 128 | Αλχιας 129 | Αμ.Π.Ο 130 | Αμβ 131 | Αμμών 132 | Αμ. 
133 | Αν.Πειθ.Συμβ.Δικ 134 | Ανακρ 135 | Ανακ 136 | Αναμν.Τόμ 137 | Αναπλ 138 | Ανδ 139 | Ανθλγος 140 | Ανθστης 141 | Αντισθ 142 | Ανχης 143 | Αν 144 | Αποκ 145 | Απρ 146 | Απόδ 147 | Απόφ 148 | Απόφ.Νομ 149 | Απ 150 | Απ.Δαπ 151 | Απ.Διατ 152 | Απ.Επιστ 153 | Αριθ 154 | Αριστοτ 155 | Αριστοφ 156 | Αριστοφ.Όρν 157 | Αριστοφ.Αχ 158 | Αριστοφ.Βάτρ 159 | Αριστοφ.Ειρ 160 | Αριστοφ.Εκκλ 161 | Αριστοφ.Θεσμ 162 | Αριστοφ.Ιππ 163 | Αριστοφ.Λυσ 164 | Αριστοφ.Νεφ 165 | Αριστοφ.Πλ 166 | Αριστοφ.Σφ 167 | Αριστ 168 | Αριστ.Αθ.Πολ 169 | Αριστ.Αισθ 170 | Αριστ.Αν.Πρ 171 | Αριστ.Ζ.Ι 172 | Αριστ.Ηθ.Ευδ 173 | Αριστ.Ηθ.Νικ 174 | Αριστ.Κατ 175 | Αριστ.Μετ 176 | Αριστ.Πολ 177 | Αριστ.Φυσιογν 178 | Αριστ.Φυσ 179 | Αριστ.Ψυχ 180 | Αριστ.Ρητ 181 | Αρμεν 182 | Αρμ 183 | Αρχ.Εκ.Καν.Δ 184 | Αρχ.Ευβ.Μελ 185 | Αρχ.Ιδ.Δ 186 | Αρχ.Νομ 187 | Αρχ.Ν 188 | Αρχ.Π.Ε 189 | Αρ 190 | Αρ.Φορ.Μητρ 191 | Ασμ 192 | Ασμ.ασμ 193 | Αστ.Δ 194 | Αστ.Χρον 195 | Ασ 196 | Ατομ.Γνωμ 197 | Αυγ 198 | Αφρ 199 | Αχ.Νομ 200 | Α 201 | Α.Εγχ.Π 202 | Α.Κ.΄Υδρας 203 | Β΄Έσδρ 204 | Β΄Έσδ 205 | Β΄Βασ 206 | Β΄Θεσ 207 | Β΄Ιω 208 | Β΄Κορινθ 209 | Β΄Κορ 210 | Β΄Μακκ 211 | Β΄Μακ 212 | Β΄Πέτρ 213 | Β΄Πέτ 214 | Β΄Πέ 215 | Β΄Παραλ 216 | Β΄Σαμ 217 | Β΄Τιμ 218 | Β΄Χρον 219 | Β΄Χρ 220 | Β.Ι.Π.Ε 221 | Β.Κ.Τ 222 | Β.Κ.Ψ.Β 223 | Β.Μ 224 | Β.Ο.Α.Κ 225 | Β.Ο.Α 226 | Β.Ο.Δ 227 | Βίβλ 228 | Βαρ 229 | ΒεΘ 230 | Βι.Περ 231 | Βιπερ 232 | Βιργ 233 | Βλγ 234 | Βούλ 235 | Βρ 236 | Γ΄Βασ 237 | Γ΄Μακκ 238 | ΓΕΝμλ 239 | Γέν 240 | Γαλ 241 | Γεν 242 | Γλ 243 | Γν.Ν.Σ.Κρ 244 | Γνωμ 245 | Γν 246 | Γράμμ 247 | Γρηγ.Ναζ 248 | Γρηγ.Νύσ 249 | Γ Νοσ 250 | Γ' Ογκολ 251 | Γ.Ν 252 | Δ΄Βασ 253 | Δ.Β 254 | Δ.Δίκη 255 | Δ.Δίκ 256 | Δ.Ε.Σ 257 | Δ.Ε.Φ.Α 258 | Δ.Ε.Φ 259 | Δ.Εργ.Ν 260 | Δαμ 261 | Δαμ.μνημ.έργ 262 | Δαν 263 | Δασ.Κ 264 | Δεκ 265 | Δελτ.Δικ.Ε.Τ.Ε 266 | Δελτ.Νομ 267 | Δελτ.Συνδ.Α.Ε 268 | Δερμ 269 | Δευτ 270 | Δεύτ 271 | Δημοσθ 272 | Δημόκρ 273 | Δι.Δικ 274 | Διάτ 275 | Διαιτ.Απ 276 | Διαιτ 277 | Διαρκ.Στρατ 278 | Δικ 279 | Διοίκ.Πρωτ 280 | ΔιοικΔνη 281 | 
Διοικ.Εφ 282 | Διον.Αρ 283 | Διόρθ.Λαθ 284 | Δ.κ.Π 285 | Δνη 286 | Δν 287 | Δογμ.Όρος 288 | Δρ 289 | Δ.τ.Α 290 | Δτ 291 | ΔωδΝομ 292 | Δ.Περ 293 | Δ.Στρ 294 | ΕΔΠολ 295 | ΕΕυρΚ 296 | ΕΙΣ 297 | ΕΝαυτΔ 298 | ΕΣΑμΕΑ 299 | ΕΣΘ 300 | ΕΣυγκΔ 301 | ΕΤρΑξΧρΔ 302 | Ε.Φ.Ε.Τ 303 | Ε.Φ.Ι 304 | Ε.Φ.Ο.Επ.Α 305 | Εβδ 306 | Εβρ 307 | Εγκύκλ.Επιστ 308 | Εγκ 309 | Εε.Αιγ 310 | Εθν.Κ.Τ 311 | Εθν 312 | Ειδ.Δικ.Αγ.Κακ 313 | Εικ 314 | Ειρ.Αθ 315 | Ειρην.Αθ 316 | Ειρην 317 | Έλεγχ 318 | Ειρ 319 | Εισ.Α.Π 320 | Εισ.Ε 321 | Εισ.Ν.Α.Κ 322 | Εισ.Ν.Κ.Πολ.Δ 323 | Εισ.Πρωτ 324 | Εισηγ.Έκθ 325 | Εισ 326 | Εκκλ 327 | Εκκ 328 | Εκ 329 | Ελλ.Δνη 330 | Εν.Ε 331 | Εξ 332 | Επ.Αν 333 | Επ.Εργ.Δ 334 | Επ.Εφ 335 | Επ.Κυπ.Δ 336 | Επ.Μεσ.Αρχ 337 | Επ.Νομ 338 | Επίκτ 339 | Επίκ 340 | Επι.Δ.Ε 341 | Επιθ.Ναυτ.Δικ 342 | Επικ 343 | Επισκ.Ε.Δ 344 | Επισκ.Εμπ.Δικ 345 | Επιστ.Επετ.Αρμ 346 | Επιστ.Επετ 347 | Επιστ.Ιερ 348 | Επιτρ.Προστ.Συνδ.Στελ 349 | Επιφάν 350 | Επτ.Εφ 351 | Επ.Ιρ 352 | Επ.Ι 353 | Εργ.Ασφ.Νομ 354 | Ερμ.Α.Κ 355 | Ερμη.Σ 356 | Εσθ 357 | Εσπερ 358 | Ετρ.Δ 359 | Ευκλ 360 | Ευρ.Δ.Δ.Α 361 | Ευρ.Σ.Δ.Α 362 | Ευρ.ΣτΕ 363 | Ευρατόμ 364 | Ευρ.Άλκ 365 | Ευρ.Ανδρομ 366 | Ευρ.Βάκχ 367 | Ευρ.Εκ 368 | Ευρ.Ελ 369 | Ευρ.Ηλ 370 | Ευρ.Ηρακ 371 | Ευρ.Ηρ 372 | Ευρ.Ηρ.Μαιν 373 | Ευρ.Ικέτ 374 | Ευρ.Ιππόλ 375 | Ευρ.Ιφ.Α 376 | Ευρ.Ιφ.Τ 377 | Ευρ.Ι.Τ 378 | Ευρ.Κύκλ 379 | Ευρ.Μήδ 380 | Ευρ.Ορ 381 | Ευρ.Ρήσ 382 | Ευρ.Τρωάδ 383 | Ευρ.Φοίν 384 | Εφ.Αθ 385 | Εφ.Εν 386 | Εφ.Επ 387 | Εφ.Θρ 388 | Εφ.Θ 389 | Εφ.Ι 390 | Εφ.Κερ 391 | Εφ.Κρ 392 | Εφ.Λ 393 | Εφ.Ν 394 | Εφ.Πατ 395 | Εφ.Πειρ 396 | Εφαρμ.Δ.Δ 397 | Εφαρμ 398 | Εφεσ 399 | Εφημ 400 | Εφ 401 | Ζαχ 402 | Ζιγ 403 | Ζυ 404 | Ζχ 405 | ΗΕ.Δ 406 | Ημερ 407 | Ηράκλ 408 | Ηροδ 409 | Ησίοδ 410 | Ησ 411 | Η.Ε.Γ 412 | ΘΗΣ 413 | ΘΡ 414 | Θαλ 415 | Θεοδ 416 | Θεοφ 417 | Θεσ 418 | Θεόδ.Μοψ 419 | Θεόκρ 420 | Θεόφιλ 421 | Θουκ 422 | Θρ 423 | Θρ.Ε 424 | Θρ.Ιερ 425 | Θρ.Ιρ 426 | Ιακ 427 | Ιαν 428 | Ιβ 429 | Ιδθ 430 | Ιδ 431 | Ιεζ 432 | Ιερ 433 | Ιζ 434 | Ιησ 435 | Ιησ.Ν 436 | Ικ 437 | Ιλ 438 
| Ιν 439 | Ιουδ 440 | Ιουστ 441 | Ιούδα 442 | Ιούλ 443 | Ιούν 444 | Ιπποκρ 445 | Ιππόλ 446 | Ιρ 447 | Ισίδ.Πηλ 448 | Ισοκρ 449 | Ισ.Ν 450 | Ιωβ 451 | Ιωλ 452 | Ιων 453 | Ιω 454 | ΚΟΣ 455 | ΚΟ.ΜΕ.ΚΟΝ 456 | ΚΠοινΔ 457 | ΚΠολΔ 458 | ΚαΒ 459 | Καλ 460 | Καλ.Τέχν 461 | ΚανΒ 462 | Καν.Διαδ 463 | Κατάργ 464 | Κλ 465 | ΚοινΔ 466 | Κολσ 467 | Κολ 468 | Κον 469 | Κορ 470 | Κος 471 | ΚριτΕπιθ 472 | ΚριτΕ 473 | Κριτ 474 | Κρ 475 | ΚτΒ 476 | ΚτΕ 477 | ΚτΠ 478 | Κυβ 479 | Κυπρ 480 | Κύριλ.Αλεξ 481 | Κύριλ.Ιερ 482 | Λεβ 483 | Λεξ.Σουίδα 484 | Λευϊτ 485 | Λευ 486 | Λκ 487 | Λογ 488 | ΛουκΑμ 489 | Λουκιαν 490 | Λουκ.Έρωτ 491 | Λουκ.Ενάλ.Διάλ 492 | Λουκ.Ερμ 493 | Λουκ.Εταιρ.Διάλ 494 | Λουκ.Ε.Δ 495 | Λουκ.Θε.Δ 496 | Λουκ.Ικ. 497 | Λουκ.Ιππ 498 | Λουκ.Λεξιφ 499 | Λουκ.Μεν 500 | Λουκ.Μισθ.Συν 501 | Λουκ.Ορχ 502 | Λουκ.Περ 503 | Λουκ.Συρ 504 | Λουκ.Τοξ 505 | Λουκ.Τυρ 506 | Λουκ.Φιλοψ 507 | Λουκ.Φιλ 508 | Λουκ.Χάρ 509 | Λουκ. 510 | Λουκ.Αλ 511 | Λοχ 512 | Λυδ 513 | Λυκ 514 | Λυσ 515 | Λωζ 516 | Λ1 517 | Λ2 518 | ΜΟΕφ 519 | Μάρκ 520 | Μέν 521 | Μαλ 522 | Ματθ 523 | Μα 524 | Μιχ 525 | Μκ 526 | Μλ 527 | Μμ 528 | Μον.Δ.Π 529 | Μον.Πρωτ 530 | Μον 531 | Μρ 532 | Μτ 533 | Μχ 534 | Μ.Βασ 535 | Μ.Πλ 536 | ΝΑ 537 | Ναυτ.Χρον 538 | Να 539 | Νδικ 540 | Νεεμ 541 | Νε 542 | Νικ 543 | ΝκΦ 544 | Νμ 545 | ΝοΒ 546 | Νομ.Δελτ.Τρ.Ελ 547 | Νομ.Δελτ 548 | Νομ.Σ.Κ 549 | Νομ.Χρ 550 | Νομ 551 | Νομ.Διεύθ 552 | Νοσ 553 | Ντ 554 | Νόσων 555 | Ν1 556 | Ν2 557 | Ν3 558 | Ν4 559 | Νtot 560 | Ξενοφ 561 | Ξεν 562 | Ξεν.Ανάβ 563 | Ξεν.Απολ 564 | Ξεν.Απομν 565 | Ξεν.Απομ 566 | Ξεν.Ελλ 567 | Ξεν.Ιέρ 568 | Ξεν.Ιππαρχ 569 | Ξεν.Ιππ 570 | Ξεν.Κυρ.Αν 571 | Ξεν.Κύρ.Παιδ 572 | Ξεν.Κ.Π 573 | Ξεν.Λακ.Πολ 574 | Ξεν.Οικ 575 | Ξεν.Προσ 576 | Ξεν.Συμπόσ 577 | Ξεν.Συμπ 578 | Ο΄ 579 | Οβδ 580 | Οβ 581 | ΟικΕ 582 | Οικ 583 | Οικ.Πατρ 584 | Οικ.Σύν.Βατ 585 | Ολομ 586 | Ολ 587 | Ολ.Α.Π 588 | Ομ.Ιλ 589 | Ομ.Οδ 590 | ΟπΤοιχ 591 | Οράτ 592 | Ορθ 593 | ΠΡΟ.ΠΟ 594 | Πίνδ 595 | Πίνδ.Ι 596 | Πίνδ.Νεμ 597 | Πίνδ.Ν 598 | Πίνδ.Ολ 599 | Πίνδ.Παθ 600 
| Πίνδ.Πυθ 601 | Πίνδ.Π 602 | ΠαγΝμλγ 603 | Παν 604 | Παρμ 605 | Παροιμ 606 | Παρ 607 | Παυσ 608 | Πειθ.Συμβ 609 | ΠειρΝ 610 | Πελ 611 | ΠεντΣτρ 612 | Πεντ 613 | Πεντ.Εφ 614 | ΠερΔικ 615 | Περ.Γεν.Νοσ 616 | Πετ 617 | Πλάτ 618 | Πλάτ.Αλκ 619 | Πλάτ.Αντ 620 | Πλάτ.Αξίοχ 621 | Πλάτ.Απόλ 622 | Πλάτ.Γοργ 623 | Πλάτ.Ευθ 624 | Πλάτ.Θεαίτ 625 | Πλάτ.Κρατ 626 | Πλάτ.Κριτ 627 | Πλάτ.Λύσ 628 | Πλάτ.Μεν 629 | Πλάτ.Νόμ 630 | Πλάτ.Πολιτ 631 | Πλάτ.Πολ 632 | Πλάτ.Πρωτ 633 | Πλάτ.Σοφ. 634 | Πλάτ.Συμπ 635 | Πλάτ.Τίμ 636 | Πλάτ.Φαίδρ 637 | Πλάτ.Φιλ 638 | Πλημ 639 | Πλούτ 640 | Πλούτ.Άρατ 641 | Πλούτ.Αιμ 642 | Πλούτ.Αλέξ 643 | Πλούτ.Αλκ 644 | Πλούτ.Αντ 645 | Πλούτ.Αρτ 646 | Πλούτ.Ηθ 647 | Πλούτ.Θεμ 648 | Πλούτ.Κάμ 649 | Πλούτ.Καίσ 650 | Πλούτ.Κικ 651 | Πλούτ.Κράσ 652 | Πλούτ.Κ 653 | Πλούτ.Λυκ 654 | Πλούτ.Μάρκ 655 | Πλούτ.Μάρ 656 | Πλούτ.Περ 657 | Πλούτ.Ρωμ 658 | Πλούτ.Σύλλ 659 | Πλούτ.Φλαμ 660 | Πλ 661 | Ποιν.Δικ 662 | Ποιν.Δ 663 | Ποιν.Ν 664 | Ποιν.Χρον 665 | Ποιν.Χρ 666 | Πολ.Δ 667 | Πολ.Πρωτ 668 | Πολ 669 | Πολ.Μηχ 670 | Πολ.Μ 671 | Πρακτ.Αναθ 672 | Πρακτ.Ολ 673 | Πραξ 674 | Πρμ 675 | Πρξ 676 | Πρωτ 677 | Πρ 678 | Πρ.Αν 679 | Πρ.Λογ 680 | Πταισμ 681 | Πυρ.Καλ 682 | Πόλη 683 | Π.Δ 684 | Π.Δ.Άσμ 685 | ΡΜ.Ε 686 | Ρθ 687 | Ρμ 688 | Ρωμ 689 | ΣΠλημ 690 | Σαπφ 691 | Σειρ 692 | Σολ 693 | Σοφ 694 | Σοφ.Αντιγ 695 | Σοφ.Αντ 696 | Σοφ.Αποσ 697 | Σοφ.Απ 698 | Σοφ.Ηλέκ 699 | Σοφ.Ηλ 700 | Σοφ.Οιδ.Κολ 701 | Σοφ.Οιδ.Τύρ 702 | Σοφ.Ο.Τ 703 | Σοφ.Σειρ 704 | Σοφ.Σολ 705 | Σοφ.Τραχ 706 | Σοφ.Φιλοκτ 707 | Σρ 708 | Σ.τ.Ε 709 | Σ.τ.Π 710 | Στρ.Π.Κ 711 | Στ.Ευρ 712 | Συζήτ 713 | Συλλ.Νομολ 714 | Συλ.Νομ 715 | ΣυμβΕπιθ 716 | Συμπ.Ν 717 | Συνθ.Αμ 718 | Συνθ.Ε.Ε 719 | Συνθ.Ε.Κ 720 | Συνθ.Ν 721 | Σφν 722 | Σφ 723 | Σφ.Σλ 724 | Σχ.Πολ.Δ 725 | Σχ.Συντ.Ε 726 | Σωσ 727 | Σύντ 728 | Σ.Πληρ 729 | ΤΘ 730 | ΤΣ.Δ 731 | Τίτ 732 | Τβ 733 | Τελ.Ενημ 734 | Τελ.Κ 735 | Τερτυλ 736 | Τιμ 737 | Τοπ.Α 738 | Τρ.Ο 739 | Τριμ 740 | Τριμ.Πλ 741 | Τρ.Πλημ 742 | Τρ.Π.Δ 743 | Τ.τ.Ε 744 | Ττ 745 | Τωβ 746 | Υγ 747 | Υπερ 748 | Υπ 749 
| Υ.Γ 750 | Φιλήμ 751 | Φιλιπ 752 | Φιλ 753 | Φλμ 754 | Φλ 755 | Φορ.Β 756 | Φορ.Δ.Ε 757 | Φορ.Δνη 758 | Φορ.Δ 759 | Φορ.Επ 760 | Φώτ 761 | Χρ.Ι.Δ 762 | Χρ.Ιδ.Δ 763 | Χρ.Ο 764 | Χρυσ 765 | Ψήφ 766 | Ψαλμ 767 | Ψαλ 768 | Ψλ 769 | Ωριγ 770 | Ωσ 771 | Ω.Ρ.Λ 772 | άγν 773 | άγν.ετυμολ 774 | άγ 775 | άκλ 776 | άνθρ 777 | άπ 778 | άρθρ 779 | άρν 780 | άρ 781 | άτ 782 | άψ 783 | ά 784 | έκδ 785 | έκφρ 786 | έμψ 787 | ένθ.αν 788 | έτ 789 | έ.α 790 | ίδ 791 | αβεστ 792 | αβησσ 793 | αγγλ 794 | αγγ 795 | αδημ 796 | αεροναυτ 797 | αερον 798 | αεροπ 799 | αθλητ 800 | αθλ 801 | αθροιστ 802 | αιγυπτ 803 | αιγ 804 | αιτιολ 805 | αιτ 806 | αι 807 | ακαδ 808 | ακκαδ 809 | αλβ 810 | αλλ 811 | αλφαβητ 812 | αμα 813 | αμερικ 814 | αμερ 815 | αμετάβ 816 | αμτβ 817 | αμφιβ 818 | αμφισβ 819 | αμφ 820 | αμ 821 | ανάλ 822 | ανάπτ 823 | ανάτ 824 | αναβ 825 | αναδαν 826 | αναδιπλασ 827 | αναδιπλ 828 | αναδρ 829 | αναλ 830 | αναν 831 | ανασυλλ 832 | ανατολ 833 | ανατομ 834 | ανατυπ 835 | ανατ 836 | αναφορ 837 | αναφ 838 | ανα.ε 839 | ανδρων 840 | ανθρωπολ 841 | ανθρωπ 842 | ανθ 843 | ανομ 844 | αντίτ 845 | αντδ 846 | αντιγρ 847 | αντιθ 848 | αντικ 849 | αντιμετάθ 850 | αντων 851 | αντ 852 | ανωτ 853 | ανόργ 854 | ανών 855 | αορ 856 | απαρέμφ 857 | απαρφ 858 | απαρχ 859 | απαρ 860 | απλολ 861 | απλοπ 862 | αποβ 863 | αποηχηροπ 864 | αποθ 865 | αποκρυφ 866 | αποφ 867 | απρμφ 868 | απρφ 869 | απρόσ 870 | απόδ 871 | απόλ 872 | απόσπ 873 | απόφ 874 | αραβοτουρκ 875 | αραβ 876 | αραμ 877 | αρβαν 878 | αργκ 879 | αριθμτ 880 | αριθμ 881 | αριθ 882 | αρκτικόλ 883 | αρκ 884 | αρμεν 885 | αρμ 886 | αρνητ 887 | αρσ 888 | αρχαιολ 889 | αρχιτεκτ 890 | αρχιτ 891 | αρχκ 892 | αρχ 893 | αρωμουν 894 | αρωμ 895 | αρ 896 | αρ.μετρ 897 | αρ.φ 898 | ασσυρ 899 | αστρολ 900 | αστροναυτ 901 | αστρον 902 | αττ 903 | αυστραλ 904 | αυτοπ 905 | αυτ 906 | αφγαν 907 | αφηρ 908 | αφομ 909 | αφρικ 910 | αχώρ 911 | αόρ 912 | α.α 913 | α/α 914 | α0 915 | βαθμ 916 | βαθ 917 | βαπτ 918 | βασκ 919 | βεβαιωτ 920 | βεβ 921 | βεδ 
922 | βενετ 923 | βεν 924 | βερβερ 925 | βιβλγρ 926 | βιολ 927 | βιομ 928 | βιοχημ 929 | βιοχ 930 | βλάχ 931 | βλ 932 | βλ.λ 933 | βοταν 934 | βοτ 935 | βουλγαρ 936 | βουλγ 937 | βούλ 938 | βραζιλ 939 | βρετον 940 | βόρ 941 | γαλλ 942 | γενικότ 943 | γενοβ 944 | γεν 945 | γερμαν 946 | γερμ 947 | γεωγρ 948 | γεωλ 949 | γεωμετρ 950 | γεωμ 951 | γεωπ 952 | γεωργ 953 | γλυπτ 954 | γλωσσολ 955 | γλωσσ 956 | γλ 957 | γνμδ 958 | γνμ 959 | γνωμ 960 | γοτθ 961 | γραμμ 962 | γραμ 963 | γρμ 964 | γρ 965 | γυμν 966 | δίδες 967 | δίκ 968 | δίφθ 969 | δαν 970 | δεικτ 971 | δεκατ 972 | δηλ 973 | δημογρ 974 | δημοτ 975 | δημώδ 976 | δημ 977 | διάγρ 978 | διάκρ 979 | διάλεξ 980 | διάλ 981 | διάσπ 982 | διαλεκτ 983 | διατρ 984 | διαφ 985 | διαχ 986 | διδα 987 | διεθν 988 | διεθ 989 | δικον 990 | διστ 991 | δισύλλ 992 | δισ 993 | διφθογγοπ 994 | δογμ 995 | δολ 996 | δοτ 997 | δρμ 998 | δρχ 999 | δρ(α) 1000 | δωρ 1001 | δ 1002 | εβρ 1003 | εγκλπ 1004 | εδ 1005 | εθνολ 1006 | εθν 1007 | ειδικότ 1008 | ειδ 1009 | ειδ.β 1010 | εικ 1011 | ειρ 1012 | εισ 1013 | εκατοστμ 1014 | εκατοστ 1015 | εκατστ.2 1016 | εκατστ.3 1017 | εκατ 1018 | εκδ 1019 | εκκλησ 1020 | εκκλ 1021 | εκ 1022 | ελλην 1023 | ελλ 1024 | ελνστ 1025 | ελπ 1026 | εμβ 1027 | εμφ 1028 | εναλλ 1029 | ενδ 1030 | ενεργ 1031 | ενεστ 1032 | ενικ 1033 | ενν 1034 | εν 1035 | εξέλ 1036 | εξακολ 1037 | εξομάλ 1038 | εξ 1039 | εο 1040 | επέκτ 1041 | επίδρ 1042 | επίθ 1043 | επίρρ 1044 | επίσ 1045 | επαγγελμ 1046 | επανάλ 1047 | επανέκδ 1048 | επιθ 1049 | επικ 1050 | επιμ 1051 | επιρρ 1052 | επιστ 1053 | επιτατ 1054 | επιφ 1055 | επών 1056 | επ 1057 | εργ 1058 | ερμ 1059 | ερρινοπ 1060 | ερωτ 1061 | ετρουσκ 1062 | ετυμ 1063 | ετ 1064 | ευφ 1065 | ευχετ 1066 | εφ 1067 | εύχρ 1068 | ε.α 1069 | ε/υ 1070 | ε0 1071 | ζωγρ 1072 | ζωολ 1073 | ηθικ 1074 | ηθ 1075 | ηλεκτρολ 1076 | ηλεκτρον 1077 | ηλεκτρ 1078 | ημίτ 1079 | ημίφ 1080 | ημιφ 1081 | ηχηροπ 1082 | ηχηρ 1083 | ηχομιμ 1084 | ηχ 1085 | η 1086 | θέατρ 1087 | θεολ 1088 | θετ 1089 | θηλ 
1090 | θρακ 1091 | θρησκειολ 1092 | θρησκ 1093 | θ 1094 | ιαπων 1095 | ιατρ 1096 | ιδιωμ 1097 | ιδ 1098 | ινδ 1099 | ιραν 1100 | ισπαν 1101 | ιστορ 1102 | ιστ 1103 | ισχυροπ 1104 | ιταλ 1105 | ιχθυολ 1106 | ιων 1107 | κάτ 1108 | καθ 1109 | κακοσ 1110 | καν 1111 | καρ 1112 | κατάλ 1113 | κατατ 1114 | κατωτ 1115 | κατ 1116 | κα 1117 | κελτ 1118 | κεφ 1119 | κινεζ 1120 | κινημ 1121 | κλητ 1122 | κλιτ 1123 | κλπ 1124 | κλ 1125 | κν 1126 | κοινωνιολ 1127 | κοινων 1128 | κοπτ 1129 | κουτσοβλαχ 1130 | κουτσοβλ 1131 | κπ 1132 | κρ.γν 1133 | κτγ 1134 | κτην 1135 | κτητ 1136 | κτλ 1137 | κτ 1138 | κυριολ 1139 | κυρ 1140 | κύρ 1141 | κ 1142 | κ.ά 1143 | κ.ά.π 1144 | κ.α 1145 | κ.εξ 1146 | κ.επ 1147 | κ.ε 1148 | κ.λπ 1149 | κ.λ.π 1150 | κ.ού.κ 1151 | κ.ο.κ 1152 | κ.τ.λ 1153 | κ.τ.τ 1154 | κ.τ.ό 1155 | λέξ 1156 | λαογρ 1157 | λαπ 1158 | λατιν 1159 | λατ 1160 | λαϊκότρ 1161 | λαϊκ 1162 | λετ 1163 | λιθ 1164 | λογιστ 1165 | λογοτ 1166 | λογ 1167 | λουβ 1168 | λυδ 1169 | λόγ 1170 | λ 1171 | λ.χ 1172 | μέλλ 1173 | μέσ 1174 | μαθημ 1175 | μαθ 1176 | μαιευτ 1177 | μαλαισ 1178 | μαλτ 1179 | μαμμων 1180 | μεγεθ 1181 | μεε 1182 | μειωτ 1183 | μελ 1184 | μεξ 1185 | μεσν 1186 | μεσογ 1187 | μεσοπαθ 1188 | μεσοφ 1189 | μετάθ 1190 | μεταβτ 1191 | μεταβ 1192 | μετακ 1193 | μεταπλ 1194 | μεταπτωτ 1195 | μεταρ 1196 | μεταφορ 1197 | μετβ 1198 | μετεπιθ 1199 | μετεπιρρ 1200 | μετεωρολ 1201 | μετεωρ 1202 | μετον 1203 | μετουσ 1204 | μετοχ 1205 | μετρ 1206 | μετ 1207 | μητρων 1208 | μηχανολ 1209 | μηχ 1210 | μικροβιολ 1211 | μογγολ 1212 | μορφολ 1213 | μουσ 1214 | μπενελούξ 1215 | μσνλατ 1216 | μσν 1217 | μτβ 1218 | μτγν 1219 | μτγ 1220 | μτφρδ 1221 | μτφρ 1222 | μτφ 1223 | μτχ 1224 | μυθ 1225 | μυκην 1226 | μυκ 1227 | μφ 1228 | μ 1229 | μ.ε 1230 | μ.μ 1231 | μ.π.ε 1232 | μ.π.π 1233 | μ0 1234 | ναυτ 1235 | νεοελλ 1236 | νεολατιν 1237 | νεολατ 1238 | νεολ 1239 | νεότ 1240 | νλατ 1241 | νομ 1242 | νορβ 1243 | νοσ 1244 | νότ 1245 | ν 1246 | ξ.λ 1247 | οικοδ 1248 | οικολ 1249 | οικον 1250 | οικ 1251 | 
ολλανδ 1252 | ολλ 1253 | ομηρ 1254 | ομόρρ 1255 | ονομ 1256 | ον 1257 | οπτ 1258 | ορθογρ 1259 | ορθ 1260 | οριστ 1261 | ορυκτολ 1262 | ορυκτ 1263 | ορ 1264 | οσετ 1265 | οσκ 1266 | ουαλ 1267 | ουγγρ 1268 | ουδ 1269 | ουσιαστικοπ 1270 | ουσιαστ 1271 | ουσ 1272 | πίν 1273 | παθητ 1274 | παθολ 1275 | παθ 1276 | παιδ 1277 | παλαιοντ 1278 | παλαιότ 1279 | παλ 1280 | παππων 1281 | παράγρ 1282 | παράγ 1283 | παράλλ 1284 | παράλ 1285 | παραγ 1286 | παρακ 1287 | παραλ 1288 | παραπ 1289 | παρατ 1290 | παρβ 1291 | παρετυμ 1292 | παροξ 1293 | παρων 1294 | παρωχ 1295 | παρ 1296 | παρ.φρ 1297 | πατριδων 1298 | πατρων 1299 | πβ 1300 | περιθ 1301 | περιλ 1302 | περιφρ 1303 | περσ 1304 | περ 1305 | πιθ 1306 | πληθ 1307 | πληροφ 1308 | ποδ 1309 | ποιητ 1310 | πολιτ 1311 | πολλαπλ 1312 | πολ 1313 | πορτογαλ 1314 | πορτ 1315 | ποσ 1316 | πρακριτ 1317 | πρβλ 1318 | πρβ 1319 | πργ 1320 | πρκμ 1321 | πρκ 1322 | πρλ 1323 | προέλ 1324 | προβηγκ 1325 | προελλ 1326 | προηγ 1327 | προθεμ 1328 | προπαραλ 1329 | προπαροξ 1330 | προπερισπ 1331 | προσαρμ 1332 | προσηγορ 1333 | προσταχτ 1334 | προστ 1335 | προσφών 1336 | προσ 1337 | προτακτ 1338 | προτ.Εισ 1339 | προφ 1340 | προχωρ 1341 | πρτ 1342 | πρόθ 1343 | πρόσθ 1344 | πρόσ 1345 | πρότ 1346 | πρ 1347 | πρ.Εφ 1348 | πτ 1349 | πυ 1350 | π 1351 | π.Χ 1352 | π.μ 1353 | π.χ 1354 | ρήμ 1355 | ρίζ 1356 | ρηματ 1357 | ρητορ 1358 | ριν 1359 | ρουμ 1360 | ρωμ 1361 | ρωσ 1362 | ρ 1363 | σανσκρ 1364 | σαξ 1365 | σελ 1366 | σερβοκρ 1367 | σερβ 1368 | σημασιολ 1369 | σημδ 1370 | σημειολ 1371 | σημερ 1372 | σημιτ 1373 | σημ 1374 | σκανδ 1375 | σκυθ 1376 | σκωπτ 1377 | σλαβ 1378 | σλοβ 1379 | σουηδ 1380 | σουμερ 1381 | σουπ 1382 | σπάν 1383 | σπανιότ 1384 | σπ 1385 | σσ 1386 | στατ 1387 | στερ 1388 | στιγμ 1389 | στιχ 1390 | στρέμ 1391 | στρατιωτ 1392 | στρατ 1393 | στ 1394 | συγγ 1395 | συγκρ 1396 | συγκ 1397 | συμπερ 1398 | συμπλεκτ 1399 | συμπλ 1400 | συμπροφ 1401 | συμφυρ 1402 | συμφ 1403 | συνήθ 1404 | συνίζ 1405 | συναίρ 1406 | συναισθ 1407 | συνδετ 
1408 | συνδ 1409 | συνεκδ 1410 | συνηρ 1411 | συνθετ 1412 | συνθ 1413 | συνοπτ 1414 | συντελ 1415 | συντομογρ 1416 | συντ 1417 | συν 1418 | συρ 1419 | σχημ 1420 | σχ 1421 | σύγκρ 1422 | σύμπλ 1423 | σύμφ 1424 | σύνδ 1425 | σύνθ 1426 | σύντμ 1427 | σύντ 1428 | σ 1429 | σ.π 1430 | σ/β 1431 | τακτ 1432 | τελ 1433 | τετρ 1434 | τετρ.μ 1435 | τεχνλ 1436 | τεχνολ 1437 | τεχν 1438 | τεύχ 1439 | τηλεπικ 1440 | τηλεόρ 1441 | τιμ 1442 | τιμ.τομ 1443 | τοΣ 1444 | τον 1445 | τοπογρ 1446 | τοπων 1447 | τοπ 1448 | τοσκ 1449 | τουρκ 1450 | τοχ 1451 | τριτοπρόσ 1452 | τροποπ 1453 | τροπ 1454 | τσεχ 1455 | τσιγγ 1456 | ττ 1457 | τυπ 1458 | τόμ 1459 | τόνν 1460 | τ 1461 | τ.μ 1462 | τ.χλμ 1463 | υβρ 1464 | υπερθ 1465 | υπερσ 1466 | υπερ 1467 | υπεύθ 1468 | υποθ 1469 | υποκορ 1470 | υποκ 1471 | υποσημ 1472 | υποτ 1473 | υποφ 1474 | υποχωρ 1475 | υπόλ 1476 | υπόχρ 1477 | υπ 1478 | υστλατ 1479 | υψόμ 1480 | υψ 1481 | φάκ 1482 | φαρμακολ 1483 | φαρμ 1484 | φιλολ 1485 | φιλοσ 1486 | φιλοτ 1487 | φινλ 1488 | φοινικ 1489 | φράγκ 1490 | φρανκον 1491 | φριζ 1492 | φρ 1493 | φυλλ 1494 | φυσιολ 1495 | φυσ 1496 | φωνηεντ 1497 | φωνητ 1498 | φωνολ 1499 | φων 1500 | φωτογρ 1501 | φ 1502 | φ.τ.μ 1503 | χαμιτ 1504 | χαρτόσ 1505 | χαρτ 1506 | χασμ 1507 | χαϊδ 1508 | χγφ 1509 | χειλ 1510 | χεττ 1511 | χημ 1512 | χιλ 1513 | χλγρ 1514 | χλγ 1515 | χλμ 1516 | χλμ.2 1517 | χλμ.3 1518 | χλσγρ 1519 | χλστγρ 1520 | χλστμ 1521 | χλστμ.2 1522 | χλστμ.3 1523 | χλ 1524 | χργρ 1525 | χρημ 1526 | χρον 1527 | χρ 1528 | χφ 1529 | χ.ε 1530 | χ.κ 1531 | χ.ο 1532 | χ.σ 1533 | χ.τ 1534 | χ.χ 1535 | ψευδ 1536 | ψυχαν 1537 | ψυχιατρ 1538 | ψυχολ 1539 | ψυχ 1540 | ωκεαν 1541 | όμ 1542 | όν 1543 | όπ.παρ 1544 | όπ.π 1545 | ό.π 1546 | ύψ 1547 | 1Βσ 1548 | 1Εσ 1549 | 1Θσ 1550 | 1Ιν 1551 | 1Κρ 1552 | 1Μκ 1553 | 1Πρ 1554 | 1Πτ 1555 | 1Τμ 1556 | 2Βσ 1557 | 2Εσ 1558 | 2Θσ 1559 | 2Ιν 1560 | 2Κρ 1561 | 2Μκ 1562 | 2Πρ 1563 | 2Πτ 1564 | 2Τμ 1565 | 3Βσ 1566 | 3Ιν 1567 | 3Μκ 1568 | 4Βσ 1569 | 
--------------------------------------------------------------------------------