├── programs ├── MergeAndSplit.class ├── CleanChineseFile.class ├── LenRatioRemover.class ├── SplitChineseFile.class ├── ChineseSpecialRemover.class ├── SpecialSentRemoverENDE.class ├── CleanChineseFile.java ├── SplitChineseFile.java ├── LenRatioRemover.java ├── MergeAndSplit.java ├── SpecialSentRemoverENDE.java └── ChineseSpecialRemover.java ├── scripts ├── nonbreaking_prefixes │ ├── README.txt │ ├── nonbreaking_prefix.ro │ ├── nonbreaking_prefix.ga │ ├── nonbreaking_prefix.sv │ ├── nonbreaking_prefix.ca │ ├── nonbreaking_prefix.sl │ ├── nonbreaking_prefix.yue │ ├── nonbreaking_prefix.zh │ ├── nonbreaking_prefix.es │ ├── nonbreaking_prefix.lv │ ├── nonbreaking_prefix.fr │ ├── nonbreaking_prefix.en │ ├── nonbreaking_prefix.fi │ ├── nonbreaking_prefix.hu │ ├── nonbreaking_prefix.nl │ ├── nonbreaking_prefix.is │ ├── nonbreaking_prefix.it │ ├── nonbreaking_prefix.ru │ ├── nonbreaking_prefix.pl │ ├── nonbreaking_prefix.pt │ ├── nonbreaking_prefix.ta │ ├── nonbreaking_prefix.de │ ├── nonbreaking_prefix.cs │ ├── nonbreaking_prefix.sk │ ├── nonbreaking_prefix.lt │ └── nonbreaking_prefix.el ├── deescape-special-chars.perl ├── input-from-sgm.perl ├── shuffle.py ├── normalize-punctuation.perl ├── truecase.perl ├── tokenizeChinese.py └── tokenizer.perl ├── README.md ├── .gitignore ├── bpe ├── generate_vocab.py ├── learn_joint_bpe_and_vocab.py ├── learn_bpe.py └── apply_bpe.py ├── fetch_wmt2018_zhen.sh ├── fetch_wmt2017_ende.sh └── LICENSE /programs/MergeAndSplit.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaocq-nlp/MT-data-processing/HEAD/programs/MergeAndSplit.class -------------------------------------------------------------------------------- /programs/CleanChineseFile.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaocq-nlp/MT-data-processing/HEAD/programs/CleanChineseFile.class 
-------------------------------------------------------------------------------- /programs/LenRatioRemover.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaocq-nlp/MT-data-processing/HEAD/programs/LenRatioRemover.class -------------------------------------------------------------------------------- /programs/SplitChineseFile.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaocq-nlp/MT-data-processing/HEAD/programs/SplitChineseFile.class -------------------------------------------------------------------------------- /programs/ChineseSpecialRemover.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaocq-nlp/MT-data-processing/HEAD/programs/ChineseSpecialRemover.class -------------------------------------------------------------------------------- /programs/SpecialSentRemoverENDE.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhaocq-nlp/MT-data-processing/HEAD/programs/SpecialSentRemoverENDE.class -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/README.txt: -------------------------------------------------------------------------------- 1 | The language suffix can be found here: 2 | 3 | http://www.loc.gov/standards/iso639-2/php/code_list.php 4 | 5 | This code includes data from Daniel Naber's Language Tools (czech abbreviations). 6 | This code includes data from czech wiktionary (also czech abbreviations). 
7 | 8 | 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MT-data-processing 2 | This repository contains scripts for the shared translation task at the Statistical Machine Translation. 3 | Now, it deals with: 4 | - WMT2017 EN<->DE 5 | 6 | ## References 7 | - [BPE](https://github.com/rsennrich/subword-nmt) 8 | - [mosesdecoder](https://github.com/moses-smt/mosesdecoder) 9 | 10 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.ro: -------------------------------------------------------------------------------- 1 | A 2 | B 3 | C 4 | D 5 | E 6 | F 7 | G 8 | H 9 | I 10 | J 11 | K 12 | L 13 | M 14 | N 15 | O 16 | P 17 | Q 18 | R 19 | S 20 | T 21 | U 22 | V 23 | W 24 | X 25 | Y 26 | Z 27 | dpdv 28 | etc 29 | șamd 30 | M.Ap.N 31 | dl 32 | Dl 33 | d-na 34 | D-na 35 | dvs 36 | Dvs 37 | pt 38 | Pt 39 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.ga: -------------------------------------------------------------------------------- 1 | 2 | A 3 | B 4 | C 5 | D 6 | E 7 | F 8 | G 9 | H 10 | I 11 | J 12 | K 13 | L 14 | M 15 | N 16 | O 17 | P 18 | Q 19 | R 20 | S 21 | T 22 | U 23 | V 24 | W 25 | X 26 | Y 27 | Z 28 | Á 29 | É 30 | Í 31 | Ó 32 | Ú 33 | 34 | Uacht 35 | Dr 36 | B.Arch 37 | 38 | m.sh 39 | .i 40 | Co 41 | Cf 42 | cf 43 | i.e 44 | r 45 | Chr 46 | lch #NUMERIC_ONLY# 47 | lgh #NUMERIC_ONLY# 48 | uimh #NUMERIC_ONLY# 49 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.sv: -------------------------------------------------------------------------------- 1 | #single upper case letter are usually initials 2 | A 3 | B 4 | C 5 | D 6 | E 7 | F 8 | G 9 | H 10 | I 11 | J 12 | K 13 | L 14 | M 15 | N 16 | O 17 
| P 18 | Q 19 | R 20 | S 21 | T 22 | U 23 | V 24 | W 25 | X 26 | Y 27 | Z 28 | #misc abbreviations 29 | AB 30 | G 31 | VG 32 | dvs 33 | etc 34 | from 35 | iaf 36 | jfr 37 | kl 38 | kr 39 | mao 40 | mfl 41 | mm 42 | osv 43 | pga 44 | tex 45 | tom 46 | vs 47 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.ca: -------------------------------------------------------------------------------- 1 | Dr 2 | Dra 3 | pàg 4 | p 5 | c 6 | av 7 | Sr 8 | Sra 9 | adm 10 | esq 11 | Prof 12 | S.A 13 | S.L 14 | p.e 15 | ptes 16 | Sta 17 | St 18 | pl 19 | màx 20 | cast 21 | dir 22 | nre 23 | fra 24 | admdora 25 | Emm 26 | Excma 27 | espf 28 | dc 29 | admdor 30 | tel 31 | angl 32 | aprox 33 | ca 34 | dept 35 | dj 36 | dl 37 | dt 38 | ds 39 | dg 40 | dv 41 | ed 42 | entl 43 | al 44 | i.e 45 | maj 46 | smin 47 | n 48 | núm 49 | pta 50 | A 51 | B 52 | C 53 | D 54 | E 55 | F 56 | G 57 | H 58 | I 59 | J 60 | K 61 | L 62 | M 63 | N 64 | O 65 | P 66 | Q 67 | R 68 | S 69 | T 70 | U 71 | V 72 | W 73 | X 74 | Y 75 | Z 76 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.sl: -------------------------------------------------------------------------------- 1 | dr 2 | Dr 3 | itd 4 | itn 5 | št #NUMERIC_ONLY# 6 | Št #NUMERIC_ONLY# 7 | d 8 | jan 9 | Jan 10 | feb 11 | Feb 12 | mar 13 | Mar 14 | apr 15 | Apr 16 | jun 17 | Jun 18 | jul 19 | Jul 20 | avg 21 | Avg 22 | sept 23 | Sept 24 | sep 25 | Sep 26 | okt 27 | Okt 28 | nov 29 | Nov 30 | dec 31 | Dec 32 | tj 33 | Tj 34 | npr 35 | Npr 36 | sl 37 | Sl 38 | op 39 | Op 40 | gl 41 | Gl 42 | oz 43 | Oz 44 | prev 45 | dipl 46 | ing 47 | prim 48 | Prim 49 | cf 50 | Cf 51 | gl 52 | Gl 53 | A 54 | B 55 | C 56 | D 57 | E 58 | F 59 | G 60 | H 61 | I 62 | J 63 | K 64 | L 65 | M 66 | N 67 | O 68 | P 69 | Q 70 | R 71 | S 72 | T 73 | U 74 | V 75 | W 76 | X 77 | Y 78 | Z 79 | 
-------------------------------------------------------------------------------- /scripts/deescape-special-chars.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 5 | 6 | use warnings; 7 | use strict; 8 | 9 | while() { 10 | s/\&bar;/\|/g; # factor separator (legacy) 11 | s/\|/\|/g; # factor separator 12 | s/\</\/g; # xml 14 | s/\&bra;/\[/g; # syntax non-terminal (legacy) 15 | s/\&ket;/\]/g; # syntax non-terminal (legacy) 16 | s/\"/\"/g; # xml 17 | s/\'/\'/g; # xml 18 | s/\[/\[/g; # syntax non-terminal 19 | s/\]/\]/g; # syntax non-terminal 20 | s/\&/\&/g; # escape escape 21 | print $_; 22 | } 23 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.yue: -------------------------------------------------------------------------------- 1 | # 2 | # Cantonese (Chinese) 3 | # 4 | # Anything in this file, followed by a period, 5 | # does NOT indicate an end-of-sentence marker. 6 | # 7 | # English/Euro-language given-name initials (appearing in 8 | # news, periodicals, etc.) 9 | A 10 | Ā 11 | B 12 | C 13 | Č 14 | D 15 | E 16 | Ē 17 | F 18 | G 19 | Ģ 20 | H 21 | I 22 | Ī 23 | J 24 | K 25 | Ķ 26 | L 27 | Ļ 28 | M 29 | N 30 | Ņ 31 | O 32 | P 33 | Q 34 | R 35 | S 36 | Š 37 | T 38 | U 39 | Ū 40 | V 41 | W 42 | X 43 | Y 44 | Z 45 | Ž 46 | 47 | # Numbers only. These should only induce breaks when followed by 48 | # a numeric sequence. 49 | # Add NUMERIC_ONLY after the word for this function. This case is 50 | # mostly for the english "No." which can either be a sentence of its 51 | # own, or if followed by a number, a non-breaking prefix. 
52 | No #NUMERIC_ONLY# 53 | Nr #NUMERIC_ONLY# 54 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.zh: -------------------------------------------------------------------------------- 1 | # 2 | # Mandarin (Chinese) 3 | # 4 | # Anything in this file, followed by a period, 5 | # does NOT indicate an end-of-sentence marker. 6 | # 7 | # English/Euro-language given-name initials (appearing in 8 | # news, periodicals, etc.) 9 | A 10 | Ā 11 | B 12 | C 13 | Č 14 | D 15 | E 16 | Ē 17 | F 18 | G 19 | Ģ 20 | H 21 | I 22 | Ī 23 | J 24 | K 25 | Ķ 26 | L 27 | Ļ 28 | M 29 | N 30 | Ņ 31 | O 32 | P 33 | Q 34 | R 35 | S 36 | Š 37 | T 38 | U 39 | Ū 40 | V 41 | W 42 | X 43 | Y 44 | Z 45 | Ž 46 | 47 | # Numbers only. These should only induce breaks when followed by 48 | # a numeric sequence. 49 | # Add NUMERIC_ONLY after the word for this function. This case is 50 | # mostly for the english "No." which can either be a sentence of its 51 | # own, or if followed by a number, a non-breaking prefix. 52 | No #NUMERIC_ONLY# 53 | Nr #NUMERIC_ONLY# 54 | -------------------------------------------------------------------------------- /scripts/input-from-sgm.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 
5 | 6 | use warnings; 7 | use strict; 8 | 9 | die("ERROR syntax: input-from-sgm.perl < in.sgm > in.txt") 10 | unless scalar @ARGV == 0; 11 | 12 | while(my $line = ) { 13 | chop($line); 14 | while ($line =~ /]+>\s*$/i) { 15 | my $next_line = ; 16 | $line .= $next_line; 17 | chop($line); 18 | } 19 | while ($line =~ /]+>\s*(.*)\s*$/i && 20 | $line !~ /]+>\s*(.*)\s*<\/seg>/i) { 21 | my $next_line = ; 22 | $line .= $next_line; 23 | chop($line); 24 | } 25 | if ($line =~ /]+>\s*(.*)\s*<\/seg>/i) { 26 | my $input = $1; 27 | $input =~ s/\s+/ /g; 28 | $input =~ s/^ //g; 29 | $input =~ s/ $//g; 30 | print $input."\n"; 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /scripts/shuffle.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | import numpy 4 | 5 | 6 | def shuffle_data(from_binding, to_binding): 7 | lines_list = [] 8 | fps = [] 9 | fws = [] 10 | for idx in range(len(from_binding)): 11 | lines_list.append([]) 12 | fps.append(open(from_binding[idx], "r")) 13 | 14 | for zip_lines in zip(*fps): 15 | for idx in range(len(zip_lines)): 16 | lines_list[idx].append(zip_lines[idx].strip()) 17 | for fp in fps: 18 | fp.close() 19 | for idx in range(len(to_binding)): 20 | fws.append(open(to_binding[idx], "w")) 21 | rands = numpy.arange(len(lines_list[0])) 22 | numpy.random.shuffle(rands) 23 | for i in rands: 24 | for idx in range(len(lines_list)): 25 | fws[idx].write(lines_list[idx][i] + "\n") 26 | for fw in fws: 27 | fw.close() 28 | 29 | 30 | froms = sys.argv[1] 31 | tos = sys.argv[2] 32 | 33 | shuffle_data(froms.strip().split(","), tos.strip().split(",")) 34 | -------------------------------------------------------------------------------- /programs/CleanChineseFile.java: -------------------------------------------------------------------------------- 1 | import java.io.*; 2 | 3 | 4 | public class CleanChineseFile { 5 | public static void 
main(String[] args) throws Exception { 6 | if (args.length < 2) { 7 | System.out.println("Usage: java SplitChineseFile in out"); 8 | return; 9 | } 10 | String inFile = args[0]; 11 | String outFile = args[1]; 12 | 13 | 14 | BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(inFile), "utf-8")); 15 | BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outFile), "utf-8")); 16 | String line; 17 | while ((line = br.readLine()) != null) { 18 | String[] tokens = line.trim().split(" +"); 19 | StringBuffer sb = new StringBuffer(); 20 | for(String tok: tokens){ 21 | sb.append(tok +" "); 22 | } 23 | 24 | bw.write(sb.toString().trim() + "\n"); 25 | } 26 | br.close(); 27 | bw.close(); 28 | 29 | 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /programs/SplitChineseFile.java: -------------------------------------------------------------------------------- 1 | import java.io.*; 2 | 3 | public class SplitChineseFile { 4 | public static void main(String[] args) throws Exception { 5 | if (args.length < 3) { 6 | System.out.println("Usage: java SplitChineseFile mergedfile src trg"); 7 | return; 8 | } 9 | String mergedFile = args[0]; 10 | String srcFile = args[1]; 11 | String trgFile = args[2]; 12 | 13 | 14 | BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(mergedFile), "utf-8")); 15 | BufferedWriter bwSrc = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(srcFile), "utf-8")); 16 | BufferedWriter bwTrg = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(trgFile), "utf-8")); 17 | String line; 18 | while ((line = br.readLine()) != null) { 19 | String[] tokens = line.trim().split("\t"); 20 | bwSrc.write(tokens[0].trim() + "\n"); 21 | bwTrg.write(tokens[1].trim() + "\n"); 22 | } 23 | 24 | br.close(); 25 | bwSrc.close(); 26 | bwTrg.close(); 27 | 28 | 29 | } 30 | } 31 | 
-------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.es: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | # Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm 34 | 35 | A.C 36 | Apdo 37 | Av 38 | Bco 39 | CC.AA 40 | Da 41 | Dep 42 | Dn 43 | Dr 44 | Dra 45 | EE.UU 46 | Excmo 47 | FF.CC 48 | Fil 49 | Gral 50 | J.C 51 | Let 52 | Lic 53 | N.B 54 | P.D 55 | P.V.P 56 | Prof 57 | Pts 58 | Rte 59 | S.A 60 | S.A.R 61 | S.E 62 | S.L 63 | S.R.C 64 | Sr 65 | Sra 66 | Srta 67 | Sta 68 | Sto 69 | T.V.E 70 | Tel 71 | Ud 72 | Uds 73 | V.B 74 | V.E 75 | Vd 76 | Vds 77 | a/c 78 | adj 79 | admón 80 | afmo 81 | apdo 82 | av 83 | c 84 | c.f 85 | c.g 86 | cap 87 | cm 88 | cta 89 | dcha 90 | doc 91 | ej 92 | entlo 93 | esq 94 | etc 95 | f.c 96 | gr 97 | grs 98 | izq 99 | kg 100 | km 101 | mg 102 | mm 103 | núm 104 | núm 105 | p 106 | p.a 107 | p.ej 108 | ptas 109 | pág 110 | págs 111 | pág 112 | págs 113 | q.e.g.e 114 | q.e.s.m 115 | s 116 | s.s.s 117 | vid 118 | vol 119 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | 
build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.lv: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | Ā 8 | B 9 | C 10 | Č 11 | D 12 | E 13 | Ē 14 | F 15 | G 16 | Ģ 17 | H 18 | I 19 | Ī 20 | J 21 | K 22 | Ķ 23 | L 24 | Ļ 25 | M 26 | N 27 | Ņ 28 | O 29 | P 30 | Q 31 | R 32 | S 33 | Š 34 | T 35 | U 36 | Ū 37 | V 38 | W 39 | X 40 | Y 41 | Z 42 | Ž 43 | 44 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 45 | dr 46 | Dr 47 | med 48 | prof 49 | Prof 50 | inž 51 | Inž 52 | ist.loc 53 | Ist.loc 54 | kor.loc 55 | Kor.loc 56 | v.i 57 | vietn 58 | Vietn 59 | 60 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 61 | a.l 62 | t.p 63 | pārb 64 | Pārb 65 | vec 66 | Vec 67 | inv 68 | Inv 69 | sk 70 | Sk 71 | spec 72 | Spec 73 | vienk 74 | Vienk 75 | virz 76 | Virz 77 | māksl 78 | Māksl 79 | mūz 80 | Mūz 81 | akad 82 | Akad 83 | soc 84 | Soc 85 | galv 86 | Galv 87 | vad 88 | Vad 89 | sertif 90 | Sertif 91 | folkl 92 | Folkl 93 | hum 94 | Hum 95 | 96 | #Numbers only. These should only induce breaks when followed by a numeric sequence 97 | # add NUMERIC_ONLY after the word for this function 98 | #This case is mostly for the english "No." which can either be a sentence of its own, or 99 | #if followed by a number, a non-breaking prefix 100 | Nr #NUMERIC_ONLY# 101 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.fr: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | # 4 | #any single upper case letter followed by a period is not a sentence ender 5 | #usually upper case letters are initials in a name 6 | #no French words end in single lower-case letters, so we throw those in too? 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | #a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | # Period-final abbreviation list for French 61 | A.C.N 62 | A.M 63 | art 64 | ann 65 | apr 66 | av 67 | auj 68 | lib 69 | B.P 70 | boul 71 | ca 72 | c.-à-d 73 | cf 74 | ch.-l 75 | chap 76 | contr 77 | C.P.I 78 | C.Q.F.D 79 | C.N 80 | C.N.S 81 | C.S 82 | dir 83 | éd 84 | e.g 85 | env 86 | al 87 | etc 88 | E.V 89 | ex 90 | fasc 91 | fém 92 | fig 93 | fr 94 | hab 95 | ibid 96 | id 97 | i.e 98 | inf 99 | LL.AA 100 | LL.AA.II 101 | LL.AA.RR 102 | LL.AA.SS 103 | L.D 104 | LL.EE 105 | LL.MM 106 | LL.MM.II.RR 107 | loc.cit 108 | masc 109 | MM 110 | ms 111 | N.B 112 | N.D.A 113 | N.D.L.R 114 | N.D.T 115 | n/réf 116 | NN.SS 117 | N.S 118 | N.D 119 | N.P.A.I 120 | p.c.c 121 | pl 122 | pp 123 | p.ex 124 | p.j 125 | P.S 126 | R.A.S 127 | R.-V 128 | R.P 129 | R.I.P 130 | SS 131 | S.S 132 | S.A 133 | S.A.I 134 | S.A.R 135 | S.A.S 136 | S.E 137 | sec 138 | sect 139 | sing 140 | S.M 141 | S.M.I.R 142 | sq 143 | sqq 144 | suiv 145 | sup 146 | suppl 147 | tél 148 | T.S.V.P 149 | vb 150 | vol 151 | vs 152 | X.O 153 | Z.I 154 | -------------------------------------------------------------------------------- /programs/LenRatioRemover.java: -------------------------------------------------------------------------------- 1 | import java.io.*; 2 | 3 | public class LenRatioRemover { 4 | 5 | public static void main(String[] args) throws Exception { 6 | if (args.length < 7) { 7 | System.out.println("Usage: java 
LenRatioRemover src trg src_div_trg_max src_div_trg_min src_output trg_output removed_to_file"); 8 | return; 9 | } 10 | String srcFile = args[0]; 11 | String trgFile = args[1]; 12 | double maxRatio = Double.parseDouble(args[2]); 13 | double minRatio = Double.parseDouble(args[3]); 14 | String srcOutputFile = args[4]; 15 | String trgOutputFile = args[5]; 16 | String removedOutputFile = args[6]; 17 | 18 | BufferedReader brSrc = new BufferedReader(new InputStreamReader(new FileInputStream(srcFile), "utf-8")); 19 | BufferedReader brTrg = new BufferedReader(new InputStreamReader(new FileInputStream(trgFile), "utf-8")); 20 | BufferedWriter bwSrc = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(srcOutputFile), "utf-8")); 21 | BufferedWriter bwTrg = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(trgOutputFile), "utf-8")); 22 | BufferedWriter bwRem = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(removedOutputFile), "utf-8")); 23 | 24 | String src, trg; 25 | 26 | while ((src = brSrc.readLine()) != null) { 27 | trg = brTrg.readLine(); 28 | double val = (double) src.trim().split(" ").length / (double) trg.trim().split(" ").length; 29 | if (val < minRatio || val > maxRatio) { 30 | bwRem.write(src + " ||| " + trg + "\n"); 31 | } else { 32 | bwSrc.write(src + "\n"); 33 | bwTrg.write(trg + "\n"); 34 | } 35 | } 36 | 37 | 38 | brSrc.close(); 39 | brTrg.close(); 40 | bwSrc.close(); 41 | bwTrg.close(); 42 | bwRem.close(); 43 | } 44 | } 45 | 46 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.en: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 34 | Adj 35 | Adm 36 | Adv 37 | Asst 38 | Bart 39 | Bldg 40 | Brig 41 | Bros 42 | Capt 43 | Cmdr 44 | Col 45 | Comdr 46 | Con 47 | Corp 48 | Cpl 49 | DR 50 | Dr 51 | Drs 52 | Ens 53 | Gen 54 | Gov 55 | Hon 56 | Hr 57 | Hosp 58 | Insp 59 | Lt 60 | MM 61 | MR 62 | MRS 63 | MS 64 | Maj 65 | Messrs 66 | Mlle 67 | Mme 68 | Mr 69 | Mrs 70 | Ms 71 | Msgr 72 | Op 73 | Ord 74 | Pfc 75 | Ph 76 | Prof 77 | Pvt 78 | Rep 79 | Reps 80 | Res 81 | Rev 82 | Rt 83 | Sen 84 | Sens 85 | Sfc 86 | Sgt 87 | Sr 88 | St 89 | Supt 90 | Surg 91 | 92 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 93 | v 94 | vs 95 | i.e 96 | rev 97 | e.g 98 | 99 | #Numbers only. These should only induce breaks when followed by a numeric sequence 100 | # add NUMERIC_ONLY after the word for this function 101 | #This case is mostly for the english "No." 
which can either be a sentence of its own, or 102 | #if followed by a number, a non-breaking prefix 103 | No #NUMERIC_ONLY# 104 | Nos 105 | Art #NUMERIC_ONLY# 106 | Nr 107 | pp #NUMERIC_ONLY# 108 | 109 | #month abbreviations 110 | Jan 111 | Feb 112 | Mar 113 | Apr 114 | #May is a full word 115 | Jun 116 | Jul 117 | Aug 118 | Sep 119 | Oct 120 | Nov 121 | Dec 122 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.fi: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT 2 | #indicate an end-of-sentence marker. Special cases are included for prefixes 3 | #that ONLY appear before 0-9 numbers. 4 | 5 | #This list is compiled from omorfi database 6 | #by Tommi A Pirinen. 7 | 8 | 9 | #any single upper case letter followed by a period is not a sentence ender 10 | A 11 | B 12 | C 13 | D 14 | E 15 | F 16 | G 17 | H 18 | I 19 | J 20 | K 21 | L 22 | M 23 | N 24 | O 25 | P 26 | Q 27 | R 28 | S 29 | T 30 | U 31 | V 32 | W 33 | X 34 | Y 35 | Z 36 | Å 37 | Ä 38 | Ö 39 | 40 | #List of titles. 
These are often followed by upper-case names, but do not indicate sentence breaks 41 | alik 42 | alil 43 | amir 44 | apul 45 | apul.prof 46 | arkkit 47 | ass 48 | assist 49 | dipl 50 | dipl.arkkit 51 | dipl.ekon 52 | dipl.ins 53 | dipl.kielenk 54 | dipl.kirjeenv 55 | dipl.kosm 56 | dipl.urk 57 | dos 58 | erikoiseläinl 59 | erikoishammasl 60 | erikoisl 61 | erikoist 62 | ev.luutn 63 | evp 64 | fil 65 | ft 66 | hallinton 67 | hallintot 68 | hammaslääket 69 | jatk 70 | jääk 71 | kansaned 72 | kapt 73 | kapt.luutn 74 | kenr 75 | kenr.luutn 76 | kenr.maj 77 | kers 78 | kirjeenv 79 | kom 80 | kom.kapt 81 | komm 82 | konst 83 | korpr 84 | luutn 85 | maist 86 | maj 87 | Mr 88 | Mrs 89 | Ms 90 | M.Sc 91 | neuv 92 | nimim 93 | Ph.D 94 | prof 95 | puh.joht 96 | pääll 97 | res 98 | san 99 | siht 100 | suom 101 | sähköp 102 | säv 103 | toht 104 | toim 105 | toim.apul 106 | toim.joht 107 | toim.siht 108 | tuom 109 | ups 110 | vänr 111 | vääp 112 | ye.ups 113 | ylik 114 | ylil 115 | ylim 116 | ylimatr 117 | yliop 118 | yliopp 119 | ylip 120 | yliv 121 | 122 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall 123 | #into this category - it sometimes ends a sentence) 124 | e.g 125 | ent 126 | esim 127 | huom 128 | i.e 129 | ilm 130 | l 131 | mm 132 | myöh 133 | nk 134 | nyk 135 | par 136 | po 137 | t 138 | v 139 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.hu: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | Á 33 | É 34 | Í 35 | Ó 36 | Ö 37 | Ő 38 | Ú 39 | Ü 40 | Ű 41 | 42 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 43 | Dr 44 | dr 45 | kb 46 | Kb 47 | vö 48 | Vö 49 | pl 50 | Pl 51 | ca 52 | Ca 53 | min 54 | Min 55 | max 56 | Max 57 | ún 58 | Ún 59 | prof 60 | Prof 61 | de 62 | De 63 | du 64 | Du 65 | Szt 66 | St 67 | 68 | #Numbers only. These should only induce breaks when followed by a numeric sequence 69 | # add NUMERIC_ONLY after the word for this function 70 | #This case is mostly for the english "No." which can either be a sentence of its own, or 71 | #if followed by a number, a non-breaking prefix 72 | 73 | # Month name abbreviations 74 | jan #NUMERIC_ONLY# 75 | Jan #NUMERIC_ONLY# 76 | Feb #NUMERIC_ONLY# 77 | feb #NUMERIC_ONLY# 78 | márc #NUMERIC_ONLY# 79 | Márc #NUMERIC_ONLY# 80 | ápr #NUMERIC_ONLY# 81 | Ápr #NUMERIC_ONLY# 82 | máj #NUMERIC_ONLY# 83 | Máj #NUMERIC_ONLY# 84 | jún #NUMERIC_ONLY# 85 | Jún #NUMERIC_ONLY# 86 | Júl #NUMERIC_ONLY# 87 | júl #NUMERIC_ONLY# 88 | aug #NUMERIC_ONLY# 89 | Aug #NUMERIC_ONLY# 90 | Szept #NUMERIC_ONLY# 91 | szept #NUMERIC_ONLY# 92 | okt #NUMERIC_ONLY# 93 | Okt #NUMERIC_ONLY# 94 | nov #NUMERIC_ONLY# 95 | Nov #NUMERIC_ONLY# 96 | dec #NUMERIC_ONLY# 97 | Dec #NUMERIC_ONLY# 98 | 99 | # Other abbreviations 100 | tel #NUMERIC_ONLY# 101 | Tel #NUMERIC_ONLY# 102 | Fax #NUMERIC_ONLY# 103 | fax #NUMERIC_ONLY# 104 | -------------------------------------------------------------------------------- /programs/MergeAndSplit.java: -------------------------------------------------------------------------------- 1 | import 
java.io.*; 2 | 3 | public class MergeAndSplit { 4 | 5 | public static void main(String[] args) throws Exception { 6 | if (args.length < 4) { 7 | System.out.println("Usage: java MergeAndSplit type src trg merged"); 8 | return; 9 | } 10 | String type = args[0]; 11 | String srcFile = args[1]; 12 | String trgFile = args[2]; 13 | String mergedFile = args[3]; 14 | 15 | if (type.equals("merge")) { 16 | BufferedReader brSrc = new BufferedReader(new InputStreamReader(new FileInputStream(srcFile), "utf-8")); 17 | BufferedReader brTrg = new BufferedReader(new InputStreamReader(new FileInputStream(trgFile), "utf-8")); 18 | BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(mergedFile), "utf-8")); 19 | 20 | String src, trg; 21 | 22 | while ((src = brSrc.readLine()) != null) { 23 | trg = brTrg.readLine(); 24 | bw.write(src + " ||| " + trg + "\n"); 25 | } 26 | 27 | 28 | brSrc.close(); 29 | brTrg.close(); 30 | bw.close(); 31 | } else if (type.equals("split")) { 32 | BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(mergedFile), "utf-8")); 33 | BufferedWriter bwSrc = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(srcFile), "utf-8")); 34 | BufferedWriter bwTrg = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(trgFile), "utf-8")); 35 | String line; 36 | while ((line = br.readLine()) != null) { 37 | String[] tokens = line.trim().split(" \\|\\|\\| "); 38 | bwSrc.write(tokens[0] + "\n"); 39 | bwTrg.write(tokens[1] + "\n"); 40 | } 41 | 42 | br.close(); 43 | bwSrc.close(); 44 | bwTrg.close(); 45 | } else { 46 | System.out.println("Unrecognized type, which should be merge or split."); 47 | } 48 | 49 | } 50 | } 51 | 52 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.nl: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case 
word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | #Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen 4 | # http://nl.wikipedia.org/wiki/Aanspreekvorm 5 | # http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs 6 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 7 | #usually upper case letters are initials in a name 8 | A 9 | B 10 | C 11 | D 12 | E 13 | F 14 | G 15 | H 16 | I 17 | J 18 | K 19 | L 20 | M 21 | N 22 | O 23 | P 24 | Q 25 | R 26 | S 27 | T 28 | U 29 | V 30 | W 31 | X 32 | Y 33 | Z 34 | 35 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 36 | bacc 37 | bc 38 | bgen 39 | c.i 40 | dhr 41 | dr 42 | dr.h.c 43 | drs 44 | drs 45 | ds 46 | eint 47 | fa 48 | Fa 49 | fam 50 | gen 51 | genm 52 | ing 53 | ir 54 | jhr 55 | jkvr 56 | jr 57 | kand 58 | kol 59 | lgen 60 | lkol 61 | Lt 62 | maj 63 | Mej 64 | mevr 65 | Mme 66 | mr 67 | mr 68 | Mw 69 | o.b.s 70 | plv 71 | prof 72 | ritm 73 | tint 74 | Vz 75 | Z.D 76 | Z.D.H 77 | Z.E 78 | Z.Em 79 | Z.H 80 | Z.K.H 81 | Z.K.M 82 | Z.M 83 | z.v 84 | 85 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 86 | #we seem to have a lot of these in dutch i.e.: i.p.v - in plaats van (in stead of) never ends a sentence 87 | a.g.v 88 | bijv 89 | bijz 90 | bv 91 | d.w.z 92 | e.c 93 | e.g 94 | e.k 95 | ev 96 | i.p.v 97 | i.s.m 98 | i.t.t 99 | i.v.m 100 | m.a.w 101 | m.b.t 102 | m.b.v 103 | m.h.o 104 | m.i 105 | m.i.v 106 | v.w.t 107 | 108 | #Numbers only. These should only induce breaks when followed by a numeric sequence 109 | # add NUMERIC_ONLY after the word for this function 110 | #This case is mostly for the english "No." 
which can either be a sentence of its own, or 111 | #if followed by a number, a non-breaking prefix 112 | Nr #NUMERIC_ONLY# 113 | Nrs 114 | nrs 115 | nr #NUMERIC_ONLY# 116 | -------------------------------------------------------------------------------- /scripts/normalize-punctuation.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 5 | 6 | use warnings; 7 | use strict; 8 | 9 | my $language = "en"; 10 | my $PENN = 0; 11 | 12 | while (@ARGV) { 13 | $_ = shift; 14 | /^-b$/ && ($| = 1, next); # not buffered (flush each line) 15 | /^-l$/ && ($language = shift, next); 16 | /^[^\-]/ && ($language = $_, next); 17 | /^-penn$/ && ($PENN = 1, next); 18 | } 19 | 20 | while() { 21 | s/\r//g; 22 | # remove extra spaces 23 | s/\(/ \(/g; 24 | s/\)/\) /g; s/ +/ /g; 25 | s/\) ([\.\!\:\?\;\,])/\)$1/g; 26 | s/\( /\(/g; 27 | s/ \)/\)/g; 28 | s/(\d) \%/$1\%/g; 29 | s/ :/:/g; 30 | s/ ;/;/g; 31 | # normalize unicode punctuation 32 | if ($PENN == 0) { 33 | s/\`/\'/g; 34 | s/\'\'/ \" /g; 35 | } 36 | 37 | s/„/\"/g; 38 | s/“/\"/g; 39 | s/”/\"/g; 40 | s/–/-/g; 41 | s/—/ - /g; s/ +/ /g; 42 | s/´/\'/g; 43 | s/([a-z])‘([a-z])/$1\'$2/gi; 44 | s/([a-z])’([a-z])/$1\'$2/gi; 45 | s/‘/\"/g; 46 | s/‚/\"/g; 47 | s/’/\"/g; 48 | s/''/\"/g; 49 | s/´´/\"/g; 50 | s/…/.../g; 51 | # French quotes 52 | s/ « / \"/g; 53 | s/« /\"/g; 54 | s/«/\"/g; 55 | s/ » /\" /g; 56 | s/ »/\"/g; 57 | s/»/\"/g; 58 | # handle pseudo-spaces 59 | s/ \%/\%/g; 60 | s/nº /nº /g; 61 | s/ :/:/g; 62 | s/ ºC/ ºC/g; 63 | s/ cm/ cm/g; 64 | s/ \?/\?/g; 65 | s/ \!/\!/g; 66 | s/ ;/;/g; 67 | s/, /, /g; s/ +/ /g; 68 | 69 | # English "quotation," followed by comma, style 70 | if ($language eq "en") { 71 | s/\"([,\.]+)/$1\"/g; 72 | } 73 | # Czech is confused 74 | elsif ($language eq "cs" || $language eq "cz") { 75 | } 76 | # 
German/Spanish/French "quotation", followed by comma, style 77 | else { 78 | s/,\"/\",/g; 79 | s/(\.+)\"(\s*[^<])/\"$1$2/g; # don't fix period at end of sentence 80 | } 81 | 82 | 83 | if ($language eq "de" || $language eq "es" || $language eq "cz" || $language eq "cs" || $language eq "fr") { 84 | s/(\d) (\d)/$1,$2/g; 85 | } 86 | else { 87 | s/(\d) (\d)/$1.$2/g; 88 | } 89 | print $_; 90 | } 91 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.is: -------------------------------------------------------------------------------- 1 | no #NUMERIC_ONLY# 2 | No #NUMERIC_ONLY# 3 | nr #NUMERIC_ONLY# 4 | Nr #NUMERIC_ONLY# 5 | nR #NUMERIC_ONLY# 6 | NR #NUMERIC_ONLY# 7 | a 8 | b 9 | c 10 | d 11 | e 12 | f 13 | g 14 | h 15 | i 16 | j 17 | k 18 | l 19 | m 20 | n 21 | o 22 | p 23 | q 24 | r 25 | s 26 | t 27 | u 28 | v 29 | w 30 | x 31 | y 32 | z 33 | ^ 34 | í 35 | á 36 | ó 37 | æ 38 | A 39 | B 40 | C 41 | D 42 | E 43 | F 44 | G 45 | H 46 | I 47 | J 48 | K 49 | L 50 | M 51 | N 52 | O 53 | P 54 | Q 55 | R 56 | S 57 | T 58 | U 59 | V 60 | W 61 | X 62 | Y 63 | Z 64 | ab.fn 65 | a.fn 66 | afs 67 | al 68 | alm 69 | alg 70 | andh 71 | ath 72 | aths 73 | atr 74 | ao 75 | au 76 | aukaf 77 | áfn 78 | áhrl.s 79 | áhrs 80 | ákv.gr 81 | ákv 82 | bh 83 | bls 84 | dr 85 | e.Kr 86 | et 87 | ef 88 | efn 89 | ennfr 90 | eink 91 | end 92 | e.st 93 | erl 94 | fél 95 | fskj 96 | fh 97 | f.hl 98 | físl 99 | fl 100 | fn 101 | fo 102 | forl 103 | frb 104 | frl 105 | frh 106 | frt 107 | fsl 108 | fsh 109 | fs 110 | fsk 111 | fst 112 | f.Kr 113 | ft 114 | fv 115 | fyrrn 116 | fyrrv 117 | germ 118 | gm 119 | gr 120 | hdl 121 | hdr 122 | hf 123 | hl 124 | hlsk 125 | hljsk 126 | hljv 127 | hljóðv 128 | hr 129 | hv 130 | hvk 131 | holl 132 | Hos 133 | höf 134 | hk 135 | hrl 136 | ísl 137 | kaf 138 | kap 139 | Khöfn 140 | kk 141 | kg 142 | kk 143 | km 144 | kl 145 | klst 146 | kr 147 | kt 148 | kgúrsk 149 | kvk 150 | leturbr 151 | 
lh 152 | lh.nt 153 | lh.þt 154 | lo 155 | ltr 156 | mlja 157 | mljó 158 | millj 159 | mm 160 | mms 161 | m.fl 162 | miðm 163 | mgr 164 | mst 165 | mín 166 | nf 167 | nh 168 | nhm 169 | nl 170 | nk 171 | nmgr 172 | no 173 | núv 174 | nt 175 | o.áfr 176 | o.m.fl 177 | ohf 178 | o.fl 179 | o.s.frv 180 | ófn 181 | ób 182 | óákv.gr 183 | óákv 184 | pfn 185 | PR 186 | pr 187 | Ritstj 188 | Rvík 189 | Rvk 190 | samb 191 | samhlj 192 | samn 193 | samn 194 | sbr 195 | sek 196 | sérn 197 | sf 198 | sfn 199 | sh 200 | sfn 201 | sh 202 | s.hl 203 | sk 204 | skv 205 | sl 206 | sn 207 | so 208 | ss.us 209 | s.st 210 | samþ 211 | sbr 212 | shlj 213 | sign 214 | skál 215 | st 216 | st.s 217 | stk 218 | sþ 219 | teg 220 | tbl 221 | tfn 222 | tl 223 | tvíhlj 224 | tvt 225 | till 226 | to 227 | umr 228 | uh 229 | us 230 | uppl 231 | útg 232 | vb 233 | Vf 234 | vh 235 | vkf 236 | Vl 237 | vl 238 | vlf 239 | vmf 240 | 8vo 241 | vsk 242 | vth 243 | þt 244 | þf 245 | þjs 246 | þgf 247 | þlt 248 | þolm 249 | þm 250 | þml 251 | þýð 252 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.it: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | #List of titles. 
These are often followed by upper-case names, but do not indicate sentence breaks 34 | Adj 35 | Adm 36 | Adv 37 | Amn 38 | Arch 39 | Asst 40 | Avv 41 | Bart 42 | Bcc 43 | Bldg 44 | Brig 45 | Bros 46 | C.A.P 47 | C.P 48 | Capt 49 | Cc 50 | Cmdr 51 | Co 52 | Col 53 | Comdr 54 | Con 55 | Corp 56 | Cpl 57 | DR 58 | Dott 59 | Dr 60 | Drs 61 | Egr 62 | Ens 63 | Gen 64 | Geom 65 | Gov 66 | Hon 67 | Hosp 68 | Hr 69 | Id 70 | Ing 71 | Insp 72 | Lt 73 | MM 74 | MR 75 | MRS 76 | MS 77 | Maj 78 | Messrs 79 | Mlle 80 | Mme 81 | Mo 82 | Mons 83 | Mr 84 | Mrs 85 | Ms 86 | Msgr 87 | N.B 88 | Op 89 | Ord 90 | P.S 91 | P.T 92 | Pfc 93 | Ph 94 | Prof 95 | Pvt 96 | RP 97 | RSVP 98 | Rag 99 | Rep 100 | Reps 101 | Res 102 | Rev 103 | Rif 104 | Rt 105 | S.A 106 | S.B.F 107 | S.P.M 108 | S.p.A 109 | S.r.l 110 | Sen 111 | Sens 112 | Sfc 113 | Sgt 114 | Sig 115 | Sigg 116 | Soc 117 | Spett 118 | Sr 119 | St 120 | Supt 121 | Surg 122 | V.P 123 | 124 | # other 125 | a.c 126 | acc 127 | all 128 | banc 129 | c.a 130 | c.c.p 131 | c.m 132 | c.p 133 | c.s 134 | c.v 135 | corr 136 | dott 137 | e.p.c 138 | ecc 139 | es 140 | fatt 141 | gg 142 | int 143 | lett 144 | ogg 145 | on 146 | p.c 147 | p.c.c 148 | p.es 149 | p.f 150 | p.r 151 | p.v 152 | post 153 | pp 154 | racc 155 | ric 156 | s.n.c 157 | seg 158 | sgg 159 | ss 160 | tel 161 | u.s 162 | v.r 163 | v.s 164 | 165 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 166 | v 167 | vs 168 | i.e 169 | rev 170 | e.g 171 | 172 | #Numbers only. These should only induce breaks when followed by a numeric sequence 173 | # add NUMERIC_ONLY after the word for this function 174 | #This case is mostly for the english "No." 
which can either be a sentence of its own, or 175 | #if followed by a number, a non-breaking prefix 176 | No #NUMERIC_ONLY# 177 | Nos 178 | Art #NUMERIC_ONLY# 179 | Nr 180 | pp #NUMERIC_ONLY# 181 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.ru: -------------------------------------------------------------------------------- 1 | # added Cyrillic uppercase letters [А-Я] 2 | # removed 000D carriage return (this is not removed by chomp in tokenizer.perl, and prevents recognition of the prefixes) 3 | # edited by Kate Young (nspaceanalysis@earthlink.net) 21 May 2013 4 | А 5 | Б 6 | В 7 | Г 8 | Д 9 | Е 10 | Ж 11 | З 12 | И 13 | Й 14 | К 15 | Л 16 | М 17 | Н 18 | О 19 | П 20 | Р 21 | С 22 | Т 23 | У 24 | Ф 25 | Х 26 | Ц 27 | Ч 28 | Ш 29 | Щ 30 | Ъ 31 | Ы 32 | Ь 33 | Э 34 | Ю 35 | Я 36 | A 37 | B 38 | C 39 | D 40 | E 41 | F 42 | G 43 | H 44 | I 45 | J 46 | K 47 | L 48 | M 49 | N 50 | O 51 | P 52 | Q 53 | R 54 | S 55 | T 56 | U 57 | V 58 | W 59 | X 60 | Y 61 | Z 62 | 0гг 63 | 1гг 64 | 2гг 65 | 3гг 66 | 4гг 67 | 5гг 68 | 6гг 69 | 7гг 70 | 8гг 71 | 9гг 72 | 0г 73 | 1г 74 | 2г 75 | 3г 76 | 4г 77 | 5г 78 | 6г 79 | 7г 80 | 8г 81 | 9г 82 | Xвв 83 | Vвв 84 | Iвв 85 | Lвв 86 | Mвв 87 | Cвв 88 | Xв 89 | Vв 90 | Iв 91 | Lв 92 | Mв 93 | Cв 94 | 0м 95 | 1м 96 | 2м 97 | 3м 98 | 4м 99 | 5м 100 | 6м 101 | 7м 102 | 8м 103 | 9м 104 | 0мм 105 | 1мм 106 | 2мм 107 | 3мм 108 | 4мм 109 | 5мм 110 | 6мм 111 | 7мм 112 | 8мм 113 | 9мм 114 | 0см 115 | 1см 116 | 2см 117 | 3см 118 | 4см 119 | 5см 120 | 6см 121 | 7см 122 | 8см 123 | 9см 124 | 0дм 125 | 1дм 126 | 2дм 127 | 3дм 128 | 4дм 129 | 5дм 130 | 6дм 131 | 7дм 132 | 8дм 133 | 9дм 134 | 0л 135 | 1л 136 | 2л 137 | 3л 138 | 4л 139 | 5л 140 | 6л 141 | 7л 142 | 8л 143 | 9л 144 | 0км 145 | 1км 146 | 2км 147 | 3км 148 | 4км 149 | 5км 150 | 6км 151 | 7км 152 | 8км 153 | 9км 154 | 0га 155 | 1га 156 | 2га 157 | 3га 158 | 4га 159 | 5га 160 | 6га 161 | 7га 162 | 8га 163 | 9га 
164 | 0кг 165 | 1кг 166 | 2кг 167 | 3кг 168 | 4кг 169 | 5кг 170 | 6кг 171 | 7кг 172 | 8кг 173 | 9кг 174 | 0т 175 | 1т 176 | 2т 177 | 3т 178 | 4т 179 | 5т 180 | 6т 181 | 7т 182 | 8т 183 | 9т 184 | 0г 185 | 1г 186 | 2г 187 | 3г 188 | 4г 189 | 5г 190 | 6г 191 | 7г 192 | 8г 193 | 9г 194 | 0мг 195 | 1мг 196 | 2мг 197 | 3мг 198 | 4мг 199 | 5мг 200 | 6мг 201 | 7мг 202 | 8мг 203 | 9мг 204 | бульв 205 | в 206 | вв 207 | г 208 | га 209 | гг 210 | гл 211 | гос 212 | д 213 | дм 214 | доп 215 | др 216 | е 217 | ед 218 | ед 219 | зам 220 | и 221 | инд 222 | исп 223 | Исп 224 | к 225 | кап 226 | кг 227 | кв 228 | кл 229 | км 230 | кол 231 | комн 232 | коп 233 | куб 234 | л 235 | лиц 236 | лл 237 | м 238 | макс 239 | мг 240 | мин 241 | мл 242 | млн 243 | млрд 244 | мм 245 | н 246 | наб 247 | нач 248 | неуд 249 | ном 250 | о 251 | обл 252 | обр 253 | общ 254 | ок 255 | ост 256 | отл 257 | п 258 | пер 259 | перераб 260 | пл 261 | пос 262 | пр 263 | просп 264 | проф 265 | р 266 | ред 267 | руб 268 | с 269 | сб 270 | св 271 | см 272 | соч 273 | ср 274 | ст 275 | стр 276 | т 277 | тел 278 | Тел 279 | тех 280 | тт 281 | туп 282 | тыс 283 | уд 284 | ул 285 | уч 286 | физ 287 | х 288 | хор 289 | ч 290 | чел 291 | шт 292 | экз 293 | э 294 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.pl: -------------------------------------------------------------------------------- 1 | adw 2 | afr 3 | akad 4 | al 5 | Al 6 | am 7 | amer 8 | arch 9 | art 10 | Art 11 | artyst 12 | astr 13 | austr 14 | bałt 15 | bdb 16 | bł 17 | bm 18 | br 19 | bryg 20 | bryt 21 | centr 22 | ces 23 | chem 24 | chiń 25 | chir 26 | c.k 27 | c.o 28 | cyg 29 | cyw 30 | cyt 31 | czes 32 | czw 33 | cd 34 | Cd 35 | czyt 36 | ćw 37 | ćwicz 38 | daw 39 | dcn 40 | dekl 41 | demokr 42 | det 43 | diec 44 | dł 45 | dn 46 | dot 47 | dol 48 | dop 49 | dost 50 | dosł 51 | h.c 52 | ds 53 | dst 54 | duszp 55 | dypl 56 | egz 57 | ekol 58 | ekon 59 | elektr 60 
| em 61 | ew 62 | fab 63 | farm 64 | fot 65 | fr 66 | gat 67 | gastr 68 | geogr 69 | geol 70 | gimn 71 | głęb 72 | gm 73 | godz 74 | górn 75 | gosp 76 | gr 77 | gram 78 | hist 79 | hiszp 80 | hr 81 | Hr 82 | hot 83 | id 84 | in 85 | im 86 | iron 87 | jn 88 | kard 89 | kat 90 | katol 91 | k.k 92 | kk 93 | kol 94 | kl 95 | k.p.a 96 | kpc 97 | k.p.c 98 | kpt 99 | kr 100 | k.r 101 | krak 102 | k.r.o 103 | kryt 104 | kult 105 | laic 106 | łac 107 | niem 108 | woj 109 | nb 110 | np 111 | Nb 112 | Np 113 | pol 114 | pow 115 | m.in 116 | pt 117 | ps 118 | Pt 119 | Ps 120 | cdn 121 | jw 122 | ryc 123 | rys 124 | Ryc 125 | Rys 126 | tj 127 | tzw 128 | Tzw 129 | tzn 130 | zob 131 | ang 132 | ub 133 | ul 134 | pw 135 | pn 136 | pl 137 | al 138 | k 139 | n 140 | nr #NUMERIC_ONLY# 141 | Nr #NUMERIC_ONLY# 142 | ww 143 | wł 144 | ur 145 | zm 146 | żyd 147 | żarg 148 | żyw 149 | wył 150 | bp 151 | bp 152 | wyst 153 | tow 154 | Tow 155 | o 156 | sp 157 | Sp 158 | st 159 | spółdz 160 | Spółdz 161 | społ 162 | spółgł 163 | stoł 164 | stow 165 | Stoł 166 | Stow 167 | zn 168 | zew 169 | zewn 170 | zdr 171 | zazw 172 | zast 173 | zaw 174 | zał 175 | zal 176 | zam 177 | zak 178 | zakł 179 | zagr 180 | zach 181 | adw 182 | Adw 183 | lek 184 | Lek 185 | med 186 | mec 187 | Mec 188 | doc 189 | Doc 190 | dyw 191 | dyr 192 | Dyw 193 | Dyr 194 | inż 195 | Inż 196 | mgr 197 | Mgr 198 | dh 199 | dr 200 | Dh 201 | Dr 202 | p 203 | P 204 | red 205 | Red 206 | prof 207 | prok 208 | Prof 209 | Prok 210 | hab 211 | płk 212 | Płk 213 | nadkom 214 | Nadkom 215 | podkom 216 | Podkom 217 | ks 218 | Ks 219 | gen 220 | Gen 221 | por 222 | Por 223 | reż 224 | Reż 225 | przyp 226 | Przyp 227 | śp 228 | św 229 | śW 230 | Śp 231 | Św 232 | ŚW 233 | szer 234 | Szer 235 | pkt #NUMERIC_ONLY# 236 | str #NUMERIC_ONLY# 237 | tab #NUMERIC_ONLY# 238 | Tab #NUMERIC_ONLY# 239 | tel 240 | ust #NUMERIC_ONLY# 241 | par #NUMERIC_ONLY# 242 | poz 243 | pok 244 | oo 245 | oO 246 | Oo 247 | OO 248 | r #NUMERIC_ONLY# 249 | l 
#NUMERIC_ONLY# 250 | s #NUMERIC_ONLY# 251 | najśw 252 | Najśw 253 | A 254 | B 255 | C 256 | D 257 | E 258 | F 259 | G 260 | H 261 | I 262 | J 263 | K 264 | L 265 | M 266 | N 267 | O 268 | P 269 | Q 270 | R 271 | S 272 | T 273 | U 274 | V 275 | W 276 | X 277 | Y 278 | Z 279 | Ś 280 | Ć 281 | Ż 282 | Ź 283 | Dz 284 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.pt: -------------------------------------------------------------------------------- 1 | #File adapted for PT by H. Leal Fontes from the EN & DE versions published with moses-2009-04-13. Last update: 10.11.2009. 2 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 3 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 4 | 5 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 6 | #usually upper case letters are initials in a name 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | 61 | #Roman Numerals. A dot after one of these is not a sentence break in Portuguese. 62 | I 63 | II 64 | III 65 | IV 66 | V 67 | VI 68 | VII 69 | VIII 70 | IX 71 | X 72 | XI 73 | XII 74 | XIII 75 | XIV 76 | XV 77 | XVI 78 | XVII 79 | XVIII 80 | XIX 81 | XX 82 | i 83 | ii 84 | iii 85 | iv 86 | v 87 | vi 88 | vii 89 | viii 90 | ix 91 | x 92 | xi 93 | xii 94 | xiii 95 | xiv 96 | xv 97 | xvi 98 | xvii 99 | xviii 100 | xix 101 | xx 102 | 103 | #List of titles. 
These are often followed by upper-case names, but do not indicate sentence breaks 104 | Adj 105 | Adm 106 | Adv 107 | Art 108 | Ca 109 | Capt 110 | Cmdr 111 | Col 112 | Comdr 113 | Con 114 | Corp 115 | Cpl 116 | DR 117 | DRA 118 | Dr 119 | Dra 120 | Dras 121 | Drs 122 | Eng 123 | Enga 124 | Engas 125 | Engos 126 | Ex 127 | Exo 128 | Exmo 129 | Fig 130 | Gen 131 | Hosp 132 | Insp 133 | Lda 134 | MM 135 | MR 136 | MRS 137 | MS 138 | Maj 139 | Mrs 140 | Ms 141 | Msgr 142 | Op 143 | Ord 144 | Pfc 145 | Ph 146 | Prof 147 | Pvt 148 | Rep 149 | Reps 150 | Res 151 | Rev 152 | Rt 153 | Sen 154 | Sens 155 | Sfc 156 | Sgt 157 | Sr 158 | Sra 159 | Sras 160 | Srs 161 | Sto 162 | Supt 163 | Surg 164 | adj 165 | adm 166 | adv 167 | art 168 | cit 169 | col 170 | con 171 | corp 172 | cpl 173 | dr 174 | dra 175 | dras 176 | drs 177 | eng 178 | enga 179 | engas 180 | engos 181 | ex 182 | exo 183 | exmo 184 | fig 185 | op 186 | prof 187 | sr 188 | sra 189 | sras 190 | srs 191 | sto 192 | 193 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 194 | v 195 | vs 196 | i.e 197 | rev 198 | e.g 199 | 200 | #Numbers only. These should only induce breaks when followed by a numeric sequence 201 | # add NUMERIC_ONLY after the word for this function 202 | #This case is mostly for the english "No." which can either be a sentence of its own, or 203 | #if followed by a number, a non-breaking prefix 204 | No #NUMERIC_ONLY# 205 | Nos 206 | Art #NUMERIC_ONLY# 207 | Nr 208 | p #NUMERIC_ONLY# 209 | pp #NUMERIC_ONLY# 210 | 211 | -------------------------------------------------------------------------------- /bpe/generate_vocab.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # Copyright 2017 Google Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | #pylint: disable=invalid-name 17 | """ 18 | Generate vocabulary for a tokenized text file. 19 | """ 20 | 21 | import sys 22 | import argparse 23 | import collections 24 | import logging 25 | 26 | parser = argparse.ArgumentParser( 27 | description="Generate vocabulary for a tokenized text file.") 28 | parser.add_argument( 29 | "--min_frequency", 30 | dest="min_frequency", 31 | type=int, 32 | default=0, 33 | help="Minimum frequency of a word to be included in the vocabulary.") 34 | parser.add_argument( 35 | "--max_vocab_size", 36 | dest="max_vocab_size", 37 | type=int, 38 | help="Maximum number of tokens in the vocabulary") 39 | parser.add_argument( 40 | "--downcase", 41 | dest="downcase", 42 | type=bool, 43 | help="If set to true, downcase all text before processing.", 44 | default=False) 45 | parser.add_argument( 46 | "infile", 47 | nargs="?", 48 | type=argparse.FileType("r"), 49 | default=sys.stdin, 50 | help="Input tokenized text file to be processed.") 51 | parser.add_argument( 52 | "--delimiter", 53 | dest="delimiter", 54 | type=str, 55 | default=" ", 56 | help="Delimiter character for tokenizing. Use \" \" and \"\" for word and char level respectively." 
57 | ) 58 | args = parser.parse_args() 59 | 60 | # Counter for all tokens in the vocabulary 61 | cnt = collections.Counter() 62 | 63 | for line in args.infile: 64 | if args.downcase: 65 | line = line.lower() 66 | if args.delimiter == "": 67 | tokens = list(line.strip()) 68 | else: 69 | tokens = line.strip().split(args.delimiter) 70 | tokens = [_ for _ in tokens if len(_) > 0] 71 | cnt.update(tokens) 72 | 73 | logging.info("Found %d unique tokens in the vocabulary.", len(cnt)) 74 | 75 | # Filter tokens below the frequency threshold 76 | if args.min_frequency > 0: 77 | filtered_tokens = [(w, c) for w, c in cnt.most_common() 78 | if c >= args.min_frequency] 79 | cnt = collections.Counter(dict(filtered_tokens)) 80 | 81 | logging.info("Found %d unique tokens with frequency > %d.", 82 | len(cnt), args.min_frequency) 83 | 84 | # Sort tokens by 1. frequency 2. lexically to break ties 85 | word_with_counts = cnt.most_common() 86 | word_with_counts = sorted( 87 | word_with_counts, key=lambda x: (x[1], x[0]), reverse=True) 88 | 89 | # Take only max-vocab 90 | if args.max_vocab_size is not None: 91 | word_with_counts = word_with_counts[:args.max_vocab_size] 92 | 93 | for word, count in word_with_counts: 94 | print("{}\t{}".format(word, count)) 95 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.ta: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | அ 7 | ஆ 8 | இ 9 | ஈ 10 | உ 11 | ஊ 12 | எ 13 | ஏ 14 | ஐ 15 | ஒ 16 | ஓ 17 | ஔ 18 | ஃ 19 | க 20 | கா 21 | கி 22 | கீ 23 | கு 24 | கூ 25 | கெ 26 | கே 27 | கை 28 | கொ 29 | கோ 30 | கௌ 31 | க் 32 | ச 33 | சா 34 | சி 35 | சீ 36 | சு 37 | சூ 38 | செ 39 | சே 40 | சை 41 | சொ 42 | சோ 43 | சௌ 44 | ச் 45 | ட 46 | டா 47 | டி 48 | டீ 49 | டு 50 | டூ 51 | டெ 52 | டே 53 | டை 54 | டொ 55 | டோ 56 | டௌ 57 | ட் 58 | த 59 | தா 60 | தி 61 | தீ 62 | து 63 | தூ 64 | தெ 65 | தே 66 | தை 67 | தொ 68 | தோ 69 | தௌ 70 | த் 71 | ப 72 | பா 73 | பி 74 | பீ 75 | பு 76 | பூ 77 | பெ 78 | பே 79 | பை 80 | பொ 81 | போ 82 | பௌ 83 | ப் 84 | ற 85 | றா 86 | றி 87 | றீ 88 | று 89 | றூ 90 | றெ 91 | றே 92 | றை 93 | றொ 94 | றோ 95 | றௌ 96 | ற் 97 | ய 98 | யா 99 | யி 100 | யீ 101 | யு 102 | யூ 103 | யெ 104 | யே 105 | யை 106 | யொ 107 | யோ 108 | யௌ 109 | ய் 110 | ர 111 | ரா 112 | ரி 113 | ரீ 114 | ரு 115 | ரூ 116 | ரெ 117 | ரே 118 | ரை 119 | ரொ 120 | ரோ 121 | ரௌ 122 | ர் 123 | ல 124 | லா 125 | லி 126 | லீ 127 | லு 128 | லூ 129 | லெ 130 | லே 131 | லை 132 | லொ 133 | லோ 134 | லௌ 135 | ல் 136 | வ 137 | வா 138 | வி 139 | வீ 140 | வு 141 | வூ 142 | வெ 143 | வே 144 | வை 145 | வொ 146 | வோ 147 | வௌ 148 | வ் 149 | ள 150 | ளா 151 | ளி 152 | ளீ 153 | ளு 154 | ளூ 155 | ளெ 156 | ளே 157 | ளை 158 | ளொ 159 | ளோ 160 | ளௌ 161 | ள் 162 | ழ 163 | ழா 164 | ழி 165 | ழீ 166 | ழு 167 | ழூ 168 | ழெ 169 | ழே 170 | ழை 171 | ழொ 172 | ழோ 173 | ழௌ 174 | ழ் 175 | ங 176 | ஙா 177 | ஙி 178 | ஙீ 179 | ஙு 180 | ஙூ 181 | ஙெ 182 | ஙே 183 | ஙை 184 | ஙொ 185 | ஙோ 186 | ஙௌ 187 | ங் 188 | ஞ 189 | ஞா 190 | ஞி 191 | ஞீ 192 | ஞு 193 | ஞூ 194 | ஞெ 195 | ஞே 196 | ஞை 197 | ஞொ 198 | ஞோ 199 | ஞௌ 200 | ஞ் 201 | ண 202 | ணா 203 | ணி 204 | ணீ 205 | ணு 206 | ணூ 207 | ணெ 208 | ணே 209 | ணை 210 | ணொ 211 | ணோ 212 | ணௌ 213 | ண் 214 | ந 215 | நா 216 | நி 217 | நீ 218 | நு 219 | நூ 220 | நெ 221 | 
நே 222 | நை 223 | நொ 224 | நோ 225 | நௌ 226 | ந் 227 | ம 228 | மா 229 | மி 230 | மீ 231 | மு 232 | மூ 233 | மெ 234 | மே 235 | மை 236 | மொ 237 | மோ 238 | மௌ 239 | ம் 240 | ன 241 | னா 242 | னி 243 | னீ 244 | னு 245 | னூ 246 | னெ 247 | னே 248 | னை 249 | னொ 250 | னோ 251 | னௌ 252 | ன் 253 | 254 | 255 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 256 | திரு 257 | திருமதி 258 | வண 259 | கௌரவ 260 | 261 | 262 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 263 | உ.ம் 264 | #கா.ம் 265 | #எ.ம் 266 | 267 | 268 | #Numbers only. These should only induce breaks when followed by a numeric sequence 269 | # add NUMERIC_ONLY after the word for this function 270 | #This case is mostly for the english "No." which can either be a sentence of its own, or 271 | #if followed by a number, a non-breaking prefix 272 | No #NUMERIC_ONLY# 273 | Nos 274 | Art #NUMERIC_ONLY# 275 | Nr 276 | pp #NUMERIC_ONLY# 277 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.de: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | #no german words end in single lower-case letters, so we throw those in too. 
7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | 61 | #Roman Numerals. A dot after one of these is not a sentence break in German. 62 | I 63 | II 64 | III 65 | IV 66 | V 67 | VI 68 | VII 69 | VIII 70 | IX 71 | X 72 | XI 73 | XII 74 | XIII 75 | XIV 76 | XV 77 | XVI 78 | XVII 79 | XVIII 80 | XIX 81 | XX 82 | i 83 | ii 84 | iii 85 | iv 86 | v 87 | vi 88 | vii 89 | viii 90 | ix 91 | x 92 | xi 93 | xii 94 | xiii 95 | xiv 96 | xv 97 | xvi 98 | xvii 99 | xviii 100 | xix 101 | xx 102 | 103 | #Titles and Honorifics 104 | Adj 105 | Adm 106 | Adv 107 | Asst 108 | Bart 109 | Bldg 110 | Brig 111 | Bros 112 | Capt 113 | Cmdr 114 | Col 115 | Comdr 116 | Con 117 | Corp 118 | Cpl 119 | DR 120 | Dr 121 | Ens 122 | Gen 123 | Gov 124 | Hon 125 | Hosp 126 | Insp 127 | Lt 128 | MM 129 | MR 130 | MRS 131 | MS 132 | Maj 133 | Messrs 134 | Mlle 135 | Mme 136 | Mr 137 | Mrs 138 | Ms 139 | Msgr 140 | Op 141 | Ord 142 | Pfc 143 | Ph 144 | Prof 145 | Pvt 146 | Rep 147 | Reps 148 | Res 149 | Rev 150 | Rt 151 | Sen 152 | Sens 153 | Sfc 154 | Sgt 155 | Sr 156 | St 157 | Supt 158 | Surg 159 | 160 | #Misc symbols 161 | Mio 162 | Mrd 163 | bzw 164 | v 165 | vs 166 | usw 167 | d.h 168 | z.B 169 | u.a 170 | etc 171 | Mrd 172 | MwSt 173 | ggf 174 | d.J 175 | D.h 176 | m.E 177 | vgl 178 | I.F 179 | z.T 180 | sogen 181 | ff 182 | u.E 183 | g.U 184 | g.g.A 185 | c.-à-d 186 | Buchst 187 | u.s.w 188 | sog 189 | u.ä 190 | Std 191 | evtl 192 | Zt 193 | Chr 194 | u.U 195 | o.ä 196 | Ltd 197 | b.A 198 | z.Zt 199 | spp 200 | sen 201 | SA 202 | k.o 203 | jun 204 | i.H.v 205 | dgl 206 | dergl 207 | Co 208 | zzt 209 | usf 210 | s.p.a 211 | Dkr 212 | Corp 213 | bzgl 214 | BSE 215 | 216 | #Number 
indicators 217 | # add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it 218 | No 219 | Nos 220 | Art 221 | Nr 222 | pp 223 | ca 224 | Ca 225 | 226 | #Ordinals are done with . in German - "1." = "1st" in English 227 | 1 228 | 2 229 | 3 230 | 4 231 | 5 232 | 6 233 | 7 234 | 8 235 | 9 236 | 10 237 | 11 238 | 12 239 | 13 240 | 14 241 | 15 242 | 16 243 | 17 244 | 18 245 | 19 246 | 20 247 | 21 248 | 22 249 | 23 250 | 24 251 | 25 252 | 26 253 | 27 254 | 28 255 | 29 256 | 30 257 | 31 258 | 32 259 | 33 260 | 34 261 | 35 262 | 36 263 | 37 264 | 38 265 | 39 266 | 40 267 | 41 268 | 42 269 | 43 270 | 44 271 | 45 272 | 46 273 | 47 274 | 48 275 | 49 276 | 50 277 | 51 278 | 52 279 | 53 280 | 54 281 | 55 282 | 56 283 | 57 284 | 58 285 | 59 286 | 60 287 | 61 288 | 62 289 | 63 290 | 64 291 | 65 292 | 66 293 | 67 294 | 68 295 | 69 296 | 70 297 | 71 298 | 72 299 | 73 300 | 74 301 | 75 302 | 76 303 | 77 304 | 78 305 | 79 306 | 80 307 | 81 308 | 82 309 | 83 310 | 84 311 | 85 312 | 86 313 | 87 314 | 88 315 | 89 316 | 90 317 | 91 318 | 92 319 | 93 320 | 94 321 | 95 322 | 96 323 | 97 324 | 98 325 | 99 326 | -------------------------------------------------------------------------------- /programs/SpecialSentRemoverENDE.java: -------------------------------------------------------------------------------- 1 | import java.io.*; 2 | 3 | public class SpecialSentRemoverENDE { 4 | 5 | public static boolean isNumber(String word) { 6 | try { 7 | Double.parseDouble(word); 8 | } catch (Exception e) { 9 | return false; 10 | } 11 | return true; 12 | } 13 | 14 | public static boolean isCommonWord(String word) { 15 | for (int i = 0; i < word.length(); ++i) { 16 | char c; 17 | if (i == 0) { 18 | c = word.toLowerCase().charAt(0); 19 | } else { 20 | c = word.charAt(i); 21 | } 22 | if (c <= 'z' && c >= 'a') { 23 | continue; 24 | } else { 25 | return false; 26 | } 27 | } 28 | return true; 29 | } 30 | 31 | public static boolean isCapital(String word) { 32 | for (int i 
= 0; i < word.length(); ++i) { 33 | if (word.charAt(i) >= 'A' && word.charAt(i) <= 'Z') { 34 | continue; 35 | } else { 36 | return false; 37 | } 38 | } 39 | return true; 40 | } 41 | 42 | public static boolean isAlmostNumberCap(String sentence) { 43 | String[] tokens = sentence.trim().split(" "); 44 | int cnt = 0; 45 | for (String word : tokens) { 46 | if (isNumber(word) || isCapital(word)) ++cnt; 47 | } 48 | if ((double) cnt / (double) tokens.length >= 0.5) { 49 | return true; 50 | } 51 | return false; 52 | } 53 | 54 | public static boolean isUglySentence(String sentence) { 55 | String[] tokens = sentence.trim().split(" "); 56 | int cnt = 0; 57 | for (String word : tokens) { 58 | if (!isCommonWord(word)) ++cnt; 59 | } 60 | if ((double) cnt / (double) tokens.length >=0.5) { 61 | return true; 62 | } 63 | return false; 64 | } 65 | 66 | public static void main(String[] args) throws Exception { 67 | if (args.length < 5) { 68 | System.out.println("Usage: java SpecialSentRemoverENDE en de en_output de_output removed_to_file"); 69 | return; 70 | } 71 | String enFile = args[0]; 72 | String deFile = args[1]; 73 | String enOutputFile = args[2]; 74 | String deOutputFile = args[3]; 75 | String removedOutputFile = args[4]; 76 | 77 | BufferedReader brEn = new BufferedReader(new InputStreamReader(new FileInputStream(enFile), "utf-8")); 78 | BufferedReader brDe = new BufferedReader(new InputStreamReader(new FileInputStream(deFile), "utf-8")); 79 | BufferedWriter bwEn = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(enOutputFile), "utf-8")); 80 | BufferedWriter bwDe = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(deOutputFile), "utf-8")); 81 | BufferedWriter bwRem = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(removedOutputFile), "utf-8")); 82 | 83 | String en, de; 84 | 85 | while ((en = brEn.readLine()) != null) { 86 | de = brDe.readLine(); 87 | if (isUglySentence(en) || isAlmostNumberCap(de)) { 88 | bwRem.write(en + " ||| " + de + 
"\n"); 89 | } else { 90 | bwEn.write(en + "\n"); 91 | bwDe.write(de + "\n"); 92 | } 93 | } 94 | 95 | brEn.close(); 96 | brDe.close(); 97 | bwEn.close(); 98 | bwDe.close(); 99 | bwRem.close(); 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /scripts/truecase.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 5 | 6 | # $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $ 7 | 8 | use warnings; 9 | use strict; 10 | use Getopt::Long "GetOptions"; 11 | 12 | binmode(STDIN, ":utf8"); 13 | binmode(STDOUT, ":utf8"); 14 | 15 | # apply switches 16 | # ASR input has no case, make sure it is lowercase, and make sure known are cased eg. 'i' to be uppercased even if i is known 17 | my ($MODEL, $UNBUFFERED, $ASR); 18 | die("truecase.perl --model MODEL [-b] [-a] < in > out") 19 | unless &GetOptions('model=s' => \$MODEL,'b|unbuffered' => \$UNBUFFERED, 'a|asr' => \$ASR) 20 | && defined($MODEL); 21 | if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; } 22 | my $asr = 0; 23 | if (defined($ASR) && $ASR) { $asr = 1; } 24 | 25 | my (%BEST,%KNOWN); 26 | open(MODEL,$MODEL) || die("ERROR: could not open '$MODEL'"); 27 | binmode(MODEL, ":utf8"); 28 | while() { 29 | my ($word,@OPTIONS) = split; 30 | $BEST{ lc($word) } = $word; 31 | if ($asr == 0) { 32 | $KNOWN{ $word } = 1; 33 | for(my $i=1;$i<$#OPTIONS;$i+=2) { 34 | $KNOWN{ $OPTIONS[$i] } = 1; 35 | } 36 | } 37 | } 38 | close(MODEL); 39 | 40 | my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1); 41 | my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"'"=>1,"""=>1,"["=>1,"]"=>1); 42 | 43 | while() { 44 | chop; 45 | my ($WORD,$MARKUP) = split_xml($_); 46 | my $sentence_start = 1; 47 | for(my $i=0;$i<=$#$WORD;$i++) { 48 | print " " if $i && $$MARKUP[$i] eq ''; 49 | 
print $$MARKUP[$i]; 50 | 51 | my ($word,$otherfactors); 52 | if ($$WORD[$i] =~ /^([^\|]+)(.*)/) 53 | { 54 | $word = $1; 55 | $otherfactors = $2; 56 | } 57 | else 58 | { 59 | $word = $$WORD[$i]; 60 | $otherfactors = ""; 61 | } 62 | if ($asr){ 63 | $word = lc($word); #make sure ASR output is not uc 64 | } 65 | 66 | if ($sentence_start && defined($BEST{lc($word)})) { 67 | print $BEST{lc($word)}; # truecase sentence start 68 | } 69 | elsif (defined($KNOWN{$word})) { 70 | print $word; # don't change known words 71 | } 72 | elsif (defined($BEST{lc($word)})) { 73 | print $BEST{lc($word)}; # truecase otherwise unknown words 74 | } 75 | else { 76 | print $word; # unknown, nothing to do 77 | } 78 | print $otherfactors; 79 | 80 | if ( defined($SENTENCE_END{ $word })) { $sentence_start = 1; } 81 | elsif (!defined($DELAYED_SENTENCE_START{ $word })) { $sentence_start = 0; } 82 | } 83 | print $$MARKUP[$#$MARKUP]; 84 | print "\n"; 85 | } 86 | 87 | # store away xml markup 88 | sub split_xml { 89 | my ($line) = @_; 90 | my (@WORD,@MARKUP); 91 | my $i = 0; 92 | $MARKUP[0] = ""; 93 | while($line =~ /\S/) { 94 | # XML tag 95 | if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) { 96 | my $potential_xml = $1; 97 | my $line_next = $2; 98 | # exception for factor that is an XML tag 99 | if ($line =~ /^\S/ && scalar(@WORD)>0 && $WORD[$i-1] =~ /\|$/) { 100 | $WORD[$i-1] .= $potential_xml; 101 | if ($line_next =~ /^(\|+)(.*)$/) { 102 | $WORD[$i-1] .= $1; 103 | $line_next = $2; 104 | } 105 | } 106 | else { 107 | $MARKUP[$i] .= $potential_xml." "; 108 | } 109 | $line = $line_next; 110 | } 111 | # non-XML text 112 | elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) { 113 | $WORD[$i++] = $1; 114 | $MARKUP[$i] = ""; 115 | $line = $2; 116 | } 117 | # '<' or '>' occurs in word, but it's not an XML tag 118 | elsif ($line =~ /^\s*(\S+)(.*)$/) { 119 | $WORD[$i++] = $1; 120 | $MARKUP[$i] = ""; 121 | $line = $2; 122 | } 123 | else { 124 | die("ERROR: huh? 
$line\n"); 125 | } 126 | } 127 | chop($MARKUP[$#MARKUP]); 128 | return (\@WORD,\@MARKUP); 129 | } 130 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.cs: -------------------------------------------------------------------------------- 1 | Bc 2 | BcA 3 | Ing 4 | Ing.arch 5 | MUDr 6 | MVDr 7 | MgA 8 | Mgr 9 | JUDr 10 | PhDr 11 | RNDr 12 | PharmDr 13 | ThLic 14 | ThDr 15 | Ph.D 16 | Th.D 17 | prof 18 | doc 19 | CSc 20 | DrSc 21 | dr. h. c 22 | PaedDr 23 | Dr 24 | PhMr 25 | DiS 26 | abt 27 | ad 28 | a.i 29 | aj 30 | angl 31 | anon 32 | apod 33 | atd 34 | atp 35 | aut 36 | bd 37 | biogr 38 | b.m 39 | b.p 40 | b.r 41 | cca 42 | cit 43 | cizojaz 44 | c.k 45 | col 46 | čes 47 | čín 48 | čj 49 | ed 50 | facs 51 | fasc 52 | fol 53 | fot 54 | franc 55 | h.c 56 | hist 57 | hl 58 | hrsg 59 | ibid 60 | il 61 | ind 62 | inv.č 63 | jap 64 | jhdt 65 | jv 66 | koed 67 | kol 68 | korej 69 | kl 70 | krit 71 | lat 72 | lit 73 | m.a 74 | maď 75 | mj 76 | mp 77 | násl 78 | např 79 | nepubl 80 | něm 81 | no 82 | nr 83 | n.s 84 | okr 85 | odd 86 | odp 87 | obr 88 | opr 89 | orig 90 | phil 91 | pl 92 | pokrač 93 | pol 94 | port 95 | pozn 96 | př.kr 97 | př.n.l 98 | přel 99 | přeprac 100 | příl 101 | pseud 102 | pt 103 | red 104 | repr 105 | resp 106 | revid 107 | rkp 108 | roč 109 | roz 110 | rozš 111 | samost 112 | sect 113 | sest 114 | seš 115 | sign 116 | sl 117 | srv 118 | stol 119 | sv 120 | šk 121 | šk.ro 122 | špan 123 | tab 124 | t.č 125 | tis 126 | tj 127 | tř 128 | tzv 129 | univ 130 | uspoř 131 | vol 132 | vl.jm 133 | vs 134 | vyd 135 | vyobr 136 | zal 137 | zejm 138 | zkr 139 | zprac 140 | zvl 141 | n.p 142 | např 143 | než 144 | MUDr 145 | abl 146 | absol 147 | adj 148 | adv 149 | ak 150 | ak. 
sl 151 | akt 152 | alch 153 | amer 154 | anat 155 | angl 156 | anglosas 157 | arab 158 | arch 159 | archit 160 | arg 161 | astr 162 | astrol 163 | att 164 | bás 165 | belg 166 | bibl 167 | biol 168 | boh 169 | bot 170 | bulh 171 | círk 172 | csl 173 | č 174 | čas 175 | čes 176 | dat 177 | děj 178 | dep 179 | dět 180 | dial 181 | dór 182 | dopr 183 | dosl 184 | ekon 185 | epic 186 | etnonym 187 | eufem 188 | f 189 | fam 190 | fem 191 | fil 192 | film 193 | form 194 | fot 195 | fr 196 | fut 197 | fyz 198 | gen 199 | geogr 200 | geol 201 | geom 202 | germ 203 | gram 204 | hebr 205 | herald 206 | hist 207 | hl 208 | hovor 209 | hud 210 | hut 211 | chcsl 212 | chem 213 | ie 214 | imp 215 | impf 216 | ind 217 | indoevr 218 | inf 219 | instr 220 | interj 221 | ión 222 | iron 223 | it 224 | kanad 225 | katalán 226 | klas 227 | kniž 228 | komp 229 | konj 230 | 231 | konkr 232 | kř 233 | kuch 234 | lat 235 | lék 236 | les 237 | lid 238 | lit 239 | liturg 240 | lok 241 | log 242 | m 243 | mat 244 | meteor 245 | metr 246 | mod 247 | ms 248 | mysl 249 | n 250 | náb 251 | námoř 252 | neklas 253 | něm 254 | nesklon 255 | nom 256 | ob 257 | obch 258 | obyč 259 | ojed 260 | opt 261 | part 262 | pas 263 | pejor 264 | pers 265 | pf 266 | pl 267 | plpf 268 | 269 | práv 270 | prep 271 | předl 272 | přivl 273 | r 274 | rcsl 275 | refl 276 | reg 277 | rkp 278 | ř 279 | řec 280 | s 281 | samohl 282 | sg 283 | sl 284 | souhl 285 | spec 286 | srov 287 | stfr 288 | střv 289 | stsl 290 | subj 291 | subst 292 | superl 293 | sv 294 | sz 295 | táz 296 | tech 297 | telev 298 | teol 299 | trans 300 | typogr 301 | var 302 | vedl 303 | verb 304 | vl. 
jm 305 | voj 306 | vok 307 | vůb 308 | vulg 309 | výtv 310 | vztaž 311 | zahr 312 | zájm 313 | zast 314 | zejm 315 | 316 | zeměd 317 | zkr 318 | zř 319 | mj 320 | dl 321 | atp 322 | sport 323 | Mgr 324 | horn 325 | MVDr 326 | JUDr 327 | RSDr 328 | Bc 329 | PhDr 330 | ThDr 331 | Ing 332 | aj 333 | apod 334 | PharmDr 335 | pomn 336 | ev 337 | slang 338 | nprap 339 | odp 340 | dop 341 | pol 342 | st 343 | stol 344 | p. n. l 345 | před n. l 346 | n. l 347 | př. Kr 348 | po Kr 349 | př. n. l 350 | odd 351 | RNDr 352 | tzv 353 | atd 354 | tzn 355 | resp 356 | tj 357 | p 358 | br 359 | č. j 360 | čj 361 | č. p 362 | čp 363 | a. s 364 | s. r. o 365 | spol. s r. o 366 | p. o 367 | s. p 368 | v. o. s 369 | k. s 370 | o. p. s 371 | o. s 372 | v. r 373 | v z 374 | ml 375 | vč 376 | kr 377 | mld 378 | hod 379 | popř 380 | ap 381 | event 382 | rus 383 | slov 384 | rum 385 | švýc 386 | P. T 387 | zvl 388 | hor 389 | dol 390 | S.O.S -------------------------------------------------------------------------------- /programs/ChineseSpecialRemover.java: -------------------------------------------------------------------------------- 1 | import java.io.*; 2 | 3 | public class ChineseSpecialRemover { 4 | 5 | public static boolean isCommonWord(String word) { 6 | for (int i = 0; i < word.length(); ++i) { 7 | char c; 8 | if (i == 0) { 9 | c = word.toLowerCase().charAt(0); 10 | } else { 11 | c = word.charAt(i); 12 | } 13 | if (c <= 'z' && c >= 'a') { 14 | continue; 15 | } else { 16 | return false; 17 | } 18 | } 19 | return true; 20 | } 21 | 22 | public static boolean isNumber(String word) { 23 | try { 24 | Double.parseDouble(word); 25 | } catch (Exception e) { 26 | return false; 27 | } 28 | return true; 29 | } 30 | 31 | public static boolean isUglySentence(String sentence) { 32 | String[] tokens = sentence.trim().split(" "); 33 | int cnt = 0; 34 | for (String word : tokens) { 35 | if (!isCommonWord(word)) ++cnt; 36 | } 37 | if ((double) cnt / (double) tokens.length >=0.5) { 38 | return 
true; 39 | } 40 | return false; 41 | } 42 | 43 | public static boolean isAlmostAscii(String sentence) { 44 | String[] tokens = sentence.trim().split(" "); 45 | int cnt = 0; 46 | for (String word : tokens) { 47 | if (isNumber(word) || isCommonWord(word)) ++cnt; 48 | } 49 | if ((double) cnt / (double) tokens.length >= 0.4) { 50 | return true; 51 | } 52 | return false; 53 | } 54 | 55 | public static void main(String[] args) throws Exception { 56 | if (args.length < 8) { 57 | System.out.println("Usage: java LenRatioRemover zh zh_char en zhchar_div_en_max zhchar_div_en_min zh_output en_output removed_to_file"); 58 | return; 59 | } 60 | String srcFile = args[0]; 61 | String srcCharFile = args[1]; 62 | String trgFile = args[2]; 63 | double maxRatio = Double.parseDouble(args[3]); 64 | double minRatio = Double.parseDouble(args[4]); 65 | String srcOutputFile = args[5]; 66 | String trgOutputFile = args[6]; 67 | String removedOutputFile = args[7]; 68 | 69 | BufferedReader brSrc = new BufferedReader(new InputStreamReader(new FileInputStream(srcFile), "utf-8")); 70 | BufferedReader brSrcChar = new BufferedReader(new InputStreamReader(new FileInputStream(srcCharFile), "utf-8")); 71 | BufferedReader brTrg = new BufferedReader(new InputStreamReader(new FileInputStream(trgFile), "utf-8")); 72 | BufferedWriter bwSrc = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(srcOutputFile), "utf-8")); 73 | BufferedWriter bwTrg = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(trgOutputFile), "utf-8")); 74 | BufferedWriter bwRem = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(removedOutputFile), "utf-8")); 75 | 76 | String zh, zhChar, en; 77 | 78 | while ((zh = brSrc.readLine()) != null) { 79 | zhChar = brSrcChar.readLine(); 80 | en = brTrg.readLine(); 81 | double val = (double) zhChar.trim().split(" ").length / (double) en.trim().split(" ").length; 82 | if (val <= minRatio || val >= maxRatio) { 83 | bwRem.write("len" + " ||| " + zh + " ||| " + en 
+ "\n"); 84 | } else { 85 | if(isUglySentence(en) || isAlmostAscii(zh)){ 86 | bwRem.write("ascii" + " ||| " + zh + " ||| " + en + "\n"); 87 | }else { 88 | bwSrc.write(zh + "\n"); 89 | bwTrg.write(en + "\n"); 90 | } 91 | } 92 | } 93 | 94 | 95 | brSrc.close(); 96 | brSrcChar.close(); 97 | brTrg.close(); 98 | bwSrc.close(); 99 | bwTrg.close(); 100 | bwRem.close(); 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.sk: -------------------------------------------------------------------------------- 1 | Bc 2 | Mgr 3 | RNDr 4 | PharmDr 5 | PhDr 6 | JUDr 7 | PaedDr 8 | ThDr 9 | Ing 10 | MUDr 11 | MDDr 12 | MVDr 13 | Dr 14 | ThLic 15 | PhD 16 | ArtD 17 | ThDr 18 | Dr 19 | DrSc 20 | CSs 21 | prof 22 | obr 23 | Obr 24 | Č 25 | č 26 | absol 27 | adj 28 | admin 29 | adr 30 | Adr 31 | adv 32 | advok 33 | afr 34 | ak 35 | akad 36 | akc 37 | akuz 38 | et 39 | al 40 | alch 41 | amer 42 | anat 43 | angl 44 | Angl 45 | anglosas 46 | anorg 47 | ap 48 | apod 49 | arch 50 | archeol 51 | archit 52 | arg 53 | art 54 | astr 55 | astrol 56 | astron 57 | atp 58 | atď 59 | austr 60 | Austr 61 | aut 62 | belg 63 | Belg 64 | bibl 65 | Bibl 66 | biol 67 | bot 68 | bud 69 | bás 70 | býv 71 | cest 72 | chem 73 | cirk 74 | csl 75 | čs 76 | Čs 77 | dat 78 | dep 79 | det 80 | dial 81 | diaľ 82 | dipl 83 | distrib 84 | dokl 85 | dosl 86 | dopr 87 | dram 88 | duš 89 | dv 90 | dvojčl 91 | dór 92 | ekol 93 | ekon 94 | el 95 | elektr 96 | elektrotech 97 | energet 98 | epic 99 | est 100 | etc 101 | etonym 102 | eufem 103 | európ 104 | Európ 105 | ev 106 | evid 107 | expr 108 | fa 109 | fam 110 | farm 111 | fem 112 | feud 113 | fil 114 | filat 115 | filoz 116 | fi 117 | fon 118 | form 119 | fot 120 | fr 121 | Fr 122 | franc 123 | Franc 124 | fraz 125 | fut 126 | fyz 127 | fyziol 128 | garb 129 | gen 130 | genet 131 | genpor 132 | geod 133 | geogr 134 | geol 135 | geom 136 | germ 137 | gr 138 | Gr 139 | gréc 
140 | Gréc 141 | gréckokat 142 | hebr 143 | herald 144 | hist 145 | hlav 146 | hosp 147 | hromad 148 | hud 149 | hypok 150 | ident 151 | i.e 152 | ident 153 | imp 154 | impf 155 | indoeur 156 | inf 157 | inform 158 | instr 159 | int 160 | interj 161 | inšt 162 | inštr 163 | iron 164 | jap 165 | Jap 166 | jaz 167 | jedn 168 | juhoamer 169 | juhových 170 | juhozáp 171 | juž 172 | kanad 173 | Kanad 174 | kanc 175 | kapit 176 | kpt 177 | kart 178 | katastr 179 | knih 180 | kniž 181 | komp 182 | konj 183 | konkr 184 | kozmet 185 | krajč 186 | kresť 187 | kt 188 | kuch 189 | lat 190 | latinskoamer 191 | lek 192 | lex 193 | lingv 194 | lit 195 | litur 196 | log 197 | lok 198 | max 199 | Max 200 | maď 201 | Maď 202 | medzinár 203 | mest 204 | metr 205 | mil 206 | Mil 207 | min 208 | Min 209 | miner 210 | ml 211 | mld 212 | mn 213 | mod 214 | mytol 215 | napr 216 | nar 217 | Nar 218 | nasl 219 | nedok 220 | neg 221 | negat 222 | neklas 223 | nem 224 | Nem 225 | neodb 226 | neos 227 | neskl 228 | nesklon 229 | nespis 230 | nespráv 231 | neved 232 | než 233 | niekt 234 | niž 235 | nom 236 | náb 237 | nákl 238 | námor 239 | nár 240 | obch 241 | obj 242 | obv 243 | obyč 244 | obč 245 | občian 246 | odb 247 | odd 248 | ods 249 | ojed 250 | okr 251 | Okr 252 | opt 253 | opyt 254 | org 255 | os 256 | osob 257 | ot 258 | ovoc 259 | par 260 | part 261 | pejor 262 | pers 263 | pf 264 | Pf 265 | P.f 266 | p.f 267 | pl 268 | Plk 269 | pod 270 | podst 271 | pokl 272 | polit 273 | politol 274 | polygr 275 | pomn 276 | popl 277 | por 278 | porad 279 | porov 280 | posch 281 | potrav 282 | použ 283 | poz 284 | pozit 285 | poľ 286 | poľno 287 | poľnohosp 288 | poľov 289 | pošt 290 | pož 291 | prac 292 | predl 293 | pren 294 | prep 295 | preuk 296 | priezv 297 | Priezv 298 | privl 299 | prof 300 | práv 301 | príd 302 | príj 303 | prík 304 | príp 305 | prír 306 | prísl 307 | príslov 308 | príč 309 | psych 310 | publ 311 | pís 312 | písm 313 | pôv 314 | refl 315 | reg 316 | rep 317 | resp 318 | 
rozk 319 | rozlič 320 | rozpráv 321 | roč 322 | Roč 323 | ryb 324 | rádiotech 325 | rím 326 | samohl 327 | semest 328 | sev 329 | severoamer 330 | severových 331 | severozáp 332 | sg 333 | skr 334 | skup 335 | sl 336 | Sloven 337 | soc 338 | soch 339 | sociol 340 | sp 341 | spol 342 | Spol 343 | spoloč 344 | spoluhl 345 | správ 346 | spôs 347 | st 348 | star 349 | starogréc 350 | starorím 351 | s.r.o 352 | stol 353 | stor 354 | str 355 | stredoamer 356 | stredoškol 357 | subj 358 | subst 359 | superl 360 | sv 361 | sz 362 | súkr 363 | súp 364 | súvzť 365 | tal 366 | Tal 367 | tech 368 | tel 369 | Tel 370 | telef 371 | teles 372 | telev 373 | teol 374 | trans 375 | turist 376 | tuzem 377 | typogr 378 | tzn 379 | tzv 380 | ukaz 381 | ul 382 | Ul 383 | umel 384 | univ 385 | ust 386 | ved 387 | vedľ 388 | verb 389 | veter 390 | vin 391 | viď 392 | vl 393 | vod 394 | vodohosp 395 | pnl 396 | vulg 397 | vyj 398 | vys 399 | vysokoškol 400 | vzťaž 401 | vôb 402 | vých 403 | výd 404 | výrob 405 | výsk 406 | výsl 407 | výtv 408 | výtvar 409 | význ 410 | včel 411 | vš 412 | všeob 413 | zahr 414 | zar 415 | zariad 416 | zast 417 | zastar 418 | zastaráv 419 | zb 420 | zdravot 421 | združ 422 | zjemn 423 | zlat 424 | zn 425 | Zn 426 | zool 427 | zr 428 | zried 429 | zv 430 | záhr 431 | zák 432 | zákl 433 | zám 434 | záp 435 | západoeur 436 | zázn 437 | územ 438 | účt 439 | čast 440 | čes 441 | Čes 442 | čl 443 | čísl 444 | živ 445 | pr 446 | fak 447 | Kr 448 | p.n.l 449 | A 450 | B 451 | C 452 | D 453 | E 454 | F 455 | G 456 | H 457 | I 458 | J 459 | K 460 | L 461 | M 462 | N 463 | O 464 | P 465 | Q 466 | R 467 | S 468 | T 469 | U 470 | V 471 | W 472 | X 473 | Y 474 | Z 475 | -------------------------------------------------------------------------------- /bpe/learn_joint_bpe_and_vocab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | """Use byte pair 
encoding (BPE) to learn a variable-length encoding of the vocabulary in a text. 6 | This script learns BPE jointly on a concatenation of a list of texts (typically the source and target side of a parallel corpus, 7 | applies the learned operation to each and (optionally) returns the resulting vocabulary of each text. 8 | The vocabulary can be used in apply_bpe.py to avoid producing symbols that are rare or OOV in a training text. 9 | 10 | Reference: 11 | Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units. 12 | Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. 13 | """ 14 | 15 | from __future__ import unicode_literals 16 | 17 | import sys 18 | import os 19 | import codecs 20 | import argparse 21 | import tempfile 22 | from collections import Counter 23 | 24 | import learn_bpe 25 | import apply_bpe 26 | 27 | # hack for python2/3 compatibility 28 | from io import open 29 | argparse.open = open 30 | 31 | def create_parser(): 32 | parser = argparse.ArgumentParser( 33 | formatter_class=argparse.RawDescriptionHelpFormatter, 34 | description="learn BPE-based word segmentation") 35 | 36 | parser.add_argument( 37 | '--input', '-i', type=argparse.FileType('r'), required=True, nargs = '+', 38 | metavar='PATH', 39 | help="Input texts (multiple allowed).") 40 | parser.add_argument( 41 | '--output', '-o', type=argparse.FileType('w'), required=True, 42 | metavar='PATH', 43 | help="Output file for BPE codes.") 44 | parser.add_argument( 45 | '--symbols', '-s', type=int, default=10000, 46 | help="Create this many new symbols (each representing a character n-gram) (default: %(default)s))") 47 | parser.add_argument( 48 | '--separator', type=str, default='@@', metavar='STR', 49 | help="Separator between non-final subword units (default: '%(default)s'))") 50 | parser.add_argument( 51 | '--write-vocabulary', type=argparse.FileType('w'), nargs = '+', 
default=None, 52 | metavar='PATH', dest='vocab', 53 | help='Write to these vocabulary files after applying BPE. One per input text. Used for filtering in apply_bpe.py') 54 | parser.add_argument( 55 | '--min-frequency', type=int, default=2, metavar='FREQ', 56 | help='Stop if no symbol pair has frequency >= FREQ (default: %(default)s))') 57 | parser.add_argument( 58 | '--verbose', '-v', action="store_true", 59 | help="verbose mode.") 60 | 61 | return parser 62 | 63 | 64 | 65 | if __name__ == '__main__': 66 | 67 | # python 2/3 compatibility 68 | if sys.version_info < (3, 0): 69 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) 70 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) 71 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin) 72 | else: 73 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer) 74 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer) 75 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer) 76 | 77 | parser = create_parser() 78 | args = parser.parse_args() 79 | 80 | if args.vocab and len(args.input) != len(args.vocab): 81 | sys.stderr.write('Error: number of input files and vocabulary files must match\n') 82 | sys.exit(1) 83 | 84 | # read/write files as UTF-8 85 | args.input = [codecs.open(f.name, encoding='UTF-8') for f in args.input] 86 | args.vocab = [codecs.open(f.name, 'w', encoding='UTF-8') for f in args.vocab] 87 | 88 | # get combined vocabulary of all input texts 89 | full_vocab = Counter() 90 | for f in args.input: 91 | full_vocab += learn_bpe.get_vocabulary(f) 92 | f.seek(0) 93 | 94 | vocab_list = ['{0} {1}'.format(key, freq) for (key, freq) in full_vocab.items()] 95 | 96 | # learn BPE on combined vocabulary 97 | with codecs.open(args.output.name, 'w', encoding='UTF-8') as output: 98 | learn_bpe.main(vocab_list, output, args.symbols, args.min_frequency, args.verbose, is_dict=True) 99 | 100 | with codecs.open(args.output.name, encoding='UTF-8') as codes: 101 | bpe = apply_bpe.BPE(codes, separator=args.separator) 102 | 
103 | # apply BPE to each training corpus and get vocabulary 104 | for train_file, vocab_file in zip(args.input, args.vocab): 105 | 106 | tmp = tempfile.NamedTemporaryFile(delete=False) 107 | tmp.close() 108 | 109 | tmpout = codecs.open(tmp.name, 'w', encoding='UTF-8') 110 | 111 | train_file.seek(0) 112 | for line in train_file: 113 | tmpout.write(bpe.segment(line).strip()) 114 | tmpout.write('\n') 115 | 116 | tmpout.close() 117 | tmpin = codecs.open(tmp.name, encoding='UTF-8') 118 | 119 | vocab = learn_bpe.get_vocabulary(tmpin) 120 | tmpin.close() 121 | os.remove(tmp.name) 122 | 123 | for key, freq in sorted(vocab.items(), key=lambda x: x[1], reverse=True): 124 | vocab_file.write("{0} {1}\n".format(key, freq)) 125 | vocab_file.close() 126 | -------------------------------------------------------------------------------- /fetch_wmt2018_zhen.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2017 Natural Language Processing Group, Nanjing University, zhaocq.nlp@gmail.com. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -e 17 | 18 | REPO_DIR=. 19 | 20 | OUTPUT_DIR="${1:-wmt18_zh_en}" 21 | 22 | MERGE_OPS=60000 23 | BPE_THRESHOLD=50 24 | 25 | echo "Writing to ${OUTPUT_DIR}. To change this, set the OUTPUT_DIR environment variable." 
26 | 27 | OUTPUT_DIR_DATA="${OUTPUT_DIR}/data" 28 | mkdir -p $OUTPUT_DIR_DATA 29 | 30 | echo "Downloading preprocessed data. This may take a while..." 31 | curl -o ${OUTPUT_DIR_DATA}/corpus.gz \ 32 | http://data.statmt.org/wmt18/translation-task/preprocessed/zh-en/corpus.gz 33 | 34 | echo "Downloading preprocessed dev data..." 35 | curl -o ${OUTPUT_DIR_DATA}/dev.tgz \ 36 | http://data.statmt.org/wmt18/translation-task/preprocessed/zh-en/dev.tgz 37 | 38 | echo "Downloading test data..." 39 | # curl -o ${OUTPUT_DIR_DATA}/test.tgz \ 40 | # http://data.statmt.org/wmt18/translation-task/test.tgz 41 | 42 | echo "Extracting all files..." 43 | gzip -d ${OUTPUT_DIR_DATA}/corpus.tc.de.gz 44 | gzip -d ${OUTPUT_DIR_DATA}/corpus.tc.en.gz 45 | mkdir -p "${OUTPUT_DIR_DATA}/dev" 46 | tar -zxvf ${OUTPUT_DIR_DATA}/dev.tgz -C "${OUTPUT_DIR_DATA}/dev" 47 | #tar -zxvf ${OUTPUT_DIR_DATA}/test.tgz -C "${OUTPUT_DIR_DATA}/" 48 | 49 | mkdir ${OUTPUT_DIR}/dev 50 | cp ${OUTPUT_DIR_DATA}/dev/newsdev2017-zhen* ${OUTPUT_DIR}/dev/ 51 | cp ${OUTPUT_DIR_DATA}/dev/newsdev2017-enzh* ${OUTPUT_DIR}/dev/ 52 | cp ${OUTPUT_DIR_DATA}/dev/newstest2017-zhen* ${OUTPUT_DIR}/dev/ 53 | cp ${OUTPUT_DIR_DATA}/dev/newstest2017-enzh* ${OUTPUT_DIR}/dev/ 54 | 55 | cp ${REPO_DIR}/programs/SplitChineseFile.class . 
56 | java SplitChineseFile ${OUTPUT_DIR_DATA}/corpus ${OUTPUT_DIR_DATA}/corpus.zh ${OUTPUT_DIR_DATA}/corpus.en 57 | rm ./SplitChineseFile.class 58 | 59 | # recover special fields 60 | perl ${REPO_DIR}/scripts/deescape-special-chars.perl < ${OUTPUT_DIR_DATA}/corpus.en > ${OUTPUT_DIR}/train.tok.tc.en 61 | sed 's/& amp ;/\&/g' ${OUTPUT_DIR}/train.tok.tc.en > ${OUTPUT_DIR}/train.tok.tc.en.tmp 62 | sed 's/& lt ;/\ ${OUTPUT_DIR}/train.tok.tc.en.tmptmp 63 | mv ${OUTPUT_DIR}/train.tok.tc.en.tmptmp ${OUTPUT_DIR}/train.tok.tc.en.tmp 64 | sed 's/& gt ;/\>/g' ${OUTPUT_DIR}/train.tok.tc.en.tmp > ${OUTPUT_DIR}/train.tok.tc.en.tmptmp 65 | mv ${OUTPUT_DIR}/train.tok.tc.en.tmptmp ${OUTPUT_DIR}/train.tok.tc.en.tmp 66 | sed 's/& quot ;/\"/g' ${OUTPUT_DIR}/train.tok.tc.en.tmp > ${OUTPUT_DIR}/train.tok.tc.en.tmptmp 67 | mv ${OUTPUT_DIR}/train.tok.tc.en.tmptmp ${OUTPUT_DIR}/train.tok.tc.en.tmp 68 | sed "s/& apos ; s /\'s /g" ${OUTPUT_DIR}/train.tok.tc.en.tmp > ${OUTPUT_DIR}/train.tok.tc.en.tmptmp 69 | mv ${OUTPUT_DIR}/train.tok.tc.en.tmptmp ${OUTPUT_DIR}/train.tok.tc.en.tmp 70 | sed "s/& apos ;/\'/g" ${OUTPUT_DIR}/train.tok.tc.en.tmp > ${OUTPUT_DIR}/train.tok.tc.en.tmptmp 71 | mv ${OUTPUT_DIR}/train.tok.tc.en.tmptmp ${OUTPUT_DIR}/train.tok.tc.en.tmp 72 | sed 's/& amp ;/\&/g' ${OUTPUT_DIR}/train.tok.tc.en.tmp > ${OUTPUT_DIR}/train.tok.tc.en 73 | rm ${OUTPUT_DIR}/train.tok.tc.en.tmp 74 | 75 | # use newsdev2017 as dev set 76 | perl ${REPO_DIR}/scripts/deescape-special-chars.perl < ${OUTPUT_DIR_DATA}/dev/newsdev2017.tc.en > ${OUTPUT_DIR}/newsdev2017.tok.tc.en 77 | perl ${REPO_DIR}/scripts/deescape-special-chars.perl < ${OUTPUT_DIR_DATA}/dev/newstest2017.tc.en > ${OUTPUT_DIR}/newstest2017.tok.tc.en 78 | 79 | cp ${REPO_DIR}/programs/CleanChineseFile.class . 
80 | java CleanChineseFile ${OUTPUT_DIR_DATA}/corpus.zh ${OUTPUT_DIR}/train.tok.zh 81 | java CleanChineseFile ${OUTPUT_DIR_DATA}/dev/newsdev2017.tc.zh ${OUTPUT_DIR}/newsdev2017.tok.zh 82 | java CleanChineseFile ${OUTPUT_DIR_DATA}/dev/newstest2017.tc.zh ${OUTPUT_DIR}/newstest2017.tok.zh 83 | rm ./CleanChineseFile.class 84 | 85 | rm ${OUTPUT_DIR_DATA}/corpus* 86 | python ${REPO_DIR}/scripts/tokenizeChinese.py ${OUTPUT_DIR}/train.tok.zh ${OUTPUT_DIR}/train.tok.zh.char 87 | 88 | echo "Removing special sentences..." 89 | cp ${REPO_DIR}/programs/ChineseSpecialRemover.class . 90 | java ChineseSpecialRemover ${OUTPUT_DIR}/train.tok.zh ${OUTPUT_DIR}/train.tok.zh.char ${OUTPUT_DIR}/train.tok.tc.en 3.0 0.7 ${OUTPUT_DIR}/train.tok.zh.rm ${OUTPUT_DIR}/train.tok.tc.en.rm ${OUTPUT_DIR_DATA}/train.special.removed 91 | rm ./ChineseSpecialRemover.class ${OUTPUT_DIR}/train.tok.zh.char 92 | mv ${OUTPUT_DIR}/train.tok.zh ${OUTPUT_DIR_DATA}/train.tok.zh 93 | mv ${OUTPUT_DIR}/train.tok.tc.en ${OUTPUT_DIR_DATA}/train.tok.tc.en 94 | mv ${OUTPUT_DIR}/train.tok.zh.rm ${OUTPUT_DIR}/train.tok.zh 95 | mv ${OUTPUT_DIR}/train.tok.tc.en.rm ${OUTPUT_DIR}/train.tok.tc.en 96 | 97 | # merge 98 | cp ${REPO_DIR}/programs/MergeAndSplit.class ./ 99 | java MergeAndSplit merge ${OUTPUT_DIR}/train.tok.zh ${OUTPUT_DIR}/train.tok.tc.en ${OUTPUT_DIR}/merged 100 | echo "Sorting and removing duplicated sentences..." 101 | sort -u ${OUTPUT_DIR}/merged > ${OUTPUT_DIR}/merged.sort 102 | 103 | java MergeAndSplit split ${OUTPUT_DIR}/train.tok.zh ${OUTPUT_DIR}/train.tok.tc.en ${OUTPUT_DIR}/merged.sort 104 | rm ${OUTPUT_DIR}/merged.sort ./MergeAndSplit.class ${OUTPUT_DIR}/merged 105 | 106 | # the files are already cleaned, we only need to learn BPE 107 | echo "Learning BPE with merge_ops=${MERGE_OPS}. This may take a while..." 
108 | ${REPO_DIR}/bpe/learn_joint_bpe_and_vocab.py -i ${OUTPUT_DIR}/train.tok.zh \ 109 | --write-vocabulary ${OUTPUT_DIR}/vocab.zh -s ${MERGE_OPS} -o ${OUTPUT_DIR}/bpe.${MERGE_OPS}.zh 110 | ${REPO_DIR}/bpe/learn_joint_bpe_and_vocab.py -i ${OUTPUT_DIR}/train.tok.tc.en \ 111 | --write-vocabulary ${OUTPUT_DIR}/vocab.en -s ${MERGE_OPS} -o ${OUTPUT_DIR}/bpe.${MERGE_OPS}.en 112 | 113 | echo "Apply bpe..." 114 | python ${REPO_DIR}/bpe/apply_bpe.py -c ${OUTPUT_DIR}/bpe.${MERGE_OPS}.en --vocabulary ${OUTPUT_DIR}/vocab.en --vocabulary-threshold ${BPE_THRESHOLD} \ 115 | --input ${OUTPUT_DIR}/train.tok.tc.en --output ${OUTPUT_DIR}/train.tok.tc.bpe90k.en 116 | 117 | python ${REPO_DIR}/bpe/apply_bpe.py -c ${OUTPUT_DIR}/bpe.${MERGE_OPS}.zh --vocabulary ${OUTPUT_DIR}/vocab.zh --vocabulary-threshold ${BPE_THRESHOLD} \ 118 | --input ${OUTPUT_DIR}/train.tok.zh --output ${OUTPUT_DIR}/train.tok.bpe90k.zh 119 | 120 | echo "Generate vocabulary..." 121 | python ${REPO_DIR}/bpe/generate_vocab.py ${OUTPUT_DIR}/train.tok.bpe60k.zh > ${OUTPUT_DIR}/vocab.bpe60k.all.zh 122 | python ${REPO_DIR}/bpe/generate_vocab.py ${OUTPUT_DIR}/train.tok.tc.bpe60k.en > ${OUTPUT_DIR}/vocab.bpe60k.all.en 123 | 124 | echo "shuffling data..." 125 | python ${REPO_DIR}/scripts/shuffle.py ${OUTPUT_DIR}/train.tok.zh,${OUTPUT_DIR}/train.tok.tc.en ${OUTPUT_DIR}/train.tok.zh.shuf,${OUTPUT_DIR}/train.tok.tc.en.shuf 126 | 127 | rm -r ${OUTPUT_DIR_DATA} 128 | 129 | echo "All done." -------------------------------------------------------------------------------- /scripts/tokenizeChinese.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Natural Language Processing Group, Nanjing University, zhaocq.nlp@gmail.com. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" The tokenization of Chinese text contains two steps: separate each Chinese
character (by utf-8 encoding); tokenize the non-Chinese part (following the
mteval script).
Refer to https://github.com/NJUNLP/ZhTokenizer
"""
import re
import sys
import codecs

# Python 2/3 string-type compatibility; replaces the former third-party `six`
# dependency (six.string_types is (basestring,) on py2 and (str,) on py3).
try:
    _string_types = (str, unicode)  # noqa: F821  -- Python 2
except NameError:
    _string_types = (str,)  # Python 3

# Inclusive Unicode code-point ranges treated as "Chinese" (CJK) characters.
# BUG FIX: the supplementary-plane bounds must be written as code points (or
# \U000XXXXX escapes).  The original used u'\u20000' etc., which is the
# TWO-character string u'\u2000' + u'0' (\u only consumes 4 hex digits), so
# CJK Extension B and the Compatibility Supplement were never matched.
_CJK_RANGES = (
    (0x3400, 0x4DB5),    # CJK Unified Ideographs Extension A, release 3.0
    (0x4E00, 0x9FA5),    # CJK Unified Ideographs, release 1.1
    (0x9FA6, 0x9FBB),    # CJK Unified Ideographs, release 4.1
    (0xF900, 0xFA2D),    # CJK Compatibility Ideographs, release 1.1
    (0xFA30, 0xFA6A),    # CJK Compatibility Ideographs, release 3.2
    (0xFA70, 0xFAD9),    # CJK Compatibility Ideographs, release 4.1
    (0x20000, 0x2A6D6),  # CJK Unified Ideographs Extension B, release 3.1
    (0x2F800, 0x2FA1D),  # CJK Compatibility Supplement, release 3.1
    (0xFF00, 0xFFEF),    # Full width ASCII / punctuation, half width Katakana/kana, Korean alphabet
    (0x2E80, 0x2EFF),    # CJK Radicals Supplement
    (0x3000, 0x303F),    # CJK punctuation marks
    (0x31C0, 0x31EF),    # CJK strokes
    (0x2F00, 0x2FDF),    # Kangxi Radicals
    (0x2FF0, 0x2FFF),    # Chinese character structure (ideographic description)
    (0x3100, 0x312F),    # Phonetic symbols
    (0x31A0, 0x31BF),    # Phonetic symbols (Taiwanese and Hakka expansion)
    (0xFE10, 0xFE1F),    # vertical punctuation forms
    (0xFE30, 0xFE4F),    # CJK compatibility forms
    (0x2600, 0x26FF),    # miscellaneous symbols
    (0x2700, 0x27BF),    # dingbats
    (0x3200, 0x32FF),    # enclosed CJK letters and months
    (0x3300, 0x33FF),    # CJK compatibility block
)


def is_chinese_char(uchar):
    """ Whether `uchar` is a Chinese (CJK) character.

    Args:
        uchar: A single unicode character.

    Returns: True/False.
    """
    code = ord(uchar)
    return any(lo <= code <= hi for lo, hi in _CJK_RANGES)


def to_chinese_char(sentences):
    """ Converts Chinese sentence(s) to character level.

    Each CJK character becomes its own token; the remaining (non-CJK) text is
    tokenized following the mteval punctuation rules.

    Args:
        sentences: A unicode string or a list of unicode strings.

    Returns: A tokenized unicode string, or a list of tokenized strings.

    Raises:
        ValueError: if `sentences` is neither a string nor a list.
    """

    def process(sentence):
        # Surround every CJK character with spaces; copy other chars through.
        pieces = []
        for c in sentence.strip():
            pieces.append(" %s " % c if is_chinese_char(c) else c)
        sentence = "".join(pieces)

        # tokenize punctuation
        sentence = re.sub(r'([\{-\~\[-\` -\&\(-\+\:-\@\/])', r' \1 ', sentence)
        # tokenize period and comma unless preceded by a digit
        sentence = re.sub(r'([^0-9])([\.,])', r'\1 \2 ', sentence)
        # tokenize period and comma unless followed by a digit
        sentence = re.sub(r'([\.,])([^0-9])', r' \1 \2', sentence)
        # tokenize dash when preceded by a digit
        sentence = re.sub(r'([0-9])(-)', r'\1 \2 ', sentence)
        # one space only between words; no leading/trailing space
        return re.sub(r'\s+', ' ', sentence).strip()

    if isinstance(sentences, list):
        return [process(s) for s in sentences]
    if isinstance(sentences, _string_types):
        return process(sentences)
    raise ValueError("`sentences` must be a string or a list of strings, "
                     "got %r" % type(sentences))


def tokenize_sgm_file(input_xml_file, output_xml_file):
    """ Converts Chinese sentences from input file to output file (SGM/XML file).

    Lines that carry a segment are tokenized between the opening and closing
    tags; all other (markup) lines are copied through unchanged.

    NOTE(review): the original tag literals were lost in extraction; the
    "<seg" / ">" / "<" handling below matches the usual WMT SGM segment
    format -- confirm against the original file.

    Args:
        input_xml_file: Path of the input SGM file.
        output_xml_file: Path of the result SGM file.
    """
    # `with` guarantees both handles are closed even if tokenization raises
    # (the original leaked them on error).
    with codecs.open(input_xml_file, 'r', encoding="utf-8") as file_r, \
            codecs.open(output_xml_file, 'w', encoding="utf-8") as file_w:
        for sentence in file_r:
            if sentence.startswith("<seg"):
                start = sentence.find(">") + 1
                end = sentence.rfind("<")
                new_sentence = sentence[:start] + to_chinese_char(sentence[start:end]) + sentence[end:]
            else:
                new_sentence = sentence
            file_w.write(new_sentence)


def tokenize_plain_file(input_file, output_file):
    """ Converts Chinese sentences from input file to output file (plain text file).

    Args:
        input_file: Path of the input text file (one sentence per line).
        output_file: Path of the result text file.
    """
    with codecs.open(input_file, 'r', encoding="utf-8") as file_r, \
            codecs.open(output_file, 'w', encoding="utf-8") as file_w:
        for sentence in file_r:
            file_w.write(to_chinese_char(sentence) + "\n")


if __name__ == '__main__':
    # usage: tokenizeChinese.py <input> <output>; SGM input is detected by suffix
    if sys.argv[1].endswith(".sgm"):
        tokenize_sgm_file(sys.argv[1], sys.argv[2])
    else:
        tokenize_plain_file(sys.argv[1], sys.argv[2])
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Fetches the preprocessed WMT17 en-de data, converts dev/test SGM files to
# plain text, tokenizes and truecases them, filters the training corpus,
# deduplicates it, learns a joint BPE and builds vocabularies.

set -e

REPO_DIR=.

# Output directory comes from the first positional argument (default below).
OUTPUT_DIR="${1:-wmt17_de_en}"

MERGE_OPS=90000
BPE_THRESHOLD=50

# FIX: the old message claimed the directory was taken from an OUTPUT_DIR
# environment variable, but the script actually reads positional argument $1.
echo "Writing to ${OUTPUT_DIR}. To change this, pass the output directory as the first argument."

OUTPUT_DIR_DATA="${OUTPUT_DIR}/data"
# -p: do not abort (under `set -e`) when the directories already exist,
# so the script can be re-run.
mkdir -p ${OUTPUT_DIR_DATA}
mkdir -p ${OUTPUT_DIR}/dev
mkdir -p ${OUTPUT_DIR}/test

echo "Downloading preprocessed data. This may take a while..."

curl -o ${OUTPUT_DIR_DATA}/corpus.tc.de.gz \
    http://data.statmt.org/wmt17/translation-task/preprocessed/de-en/corpus.tc.de.gz

curl -o ${OUTPUT_DIR_DATA}/corpus.tc.en.gz \
    http://data.statmt.org/wmt17/translation-task/preprocessed/de-en/corpus.tc.en.gz

echo "Downloading preprocessed dev data..."
curl -o ${OUTPUT_DIR_DATA}/dev.tgz \
    http://data.statmt.org/wmt17/translation-task/preprocessed/de-en/dev.tgz

echo "Downloading test data..."
curl -o ${OUTPUT_DIR_DATA}/test.tgz \
    http://data.statmt.org/wmt17/translation-task/test.tgz

echo "Downloading truecase model..."
curl -o ${OUTPUT_DIR_DATA}/true.tgz \
    http://data.statmt.org/wmt17/translation-task/preprocessed/de-en/true.tgz

echo "Extracting all files..."
gzip -d ${OUTPUT_DIR_DATA}/corpus.tc.de.gz
gzip -d ${OUTPUT_DIR_DATA}/corpus.tc.en.gz
mkdir -p "${OUTPUT_DIR_DATA}/dev"
tar -zxvf ${OUTPUT_DIR_DATA}/dev.tgz -C "${OUTPUT_DIR_DATA}/dev"
tar -zxvf ${OUTPUT_DIR_DATA}/test.tgz -C "${OUTPUT_DIR_DATA}/"
tar -zxvf ${OUTPUT_DIR_DATA}/true.tgz -C "${OUTPUT_DIR_DATA}/"

# recover special fields (HTML entities escaped by the Moses tokenizer)
perl ${REPO_DIR}/scripts/deescape-special-chars.perl < ${OUTPUT_DIR_DATA}/corpus.tc.de > ${OUTPUT_DIR}/train.tok.tc.de
perl ${REPO_DIR}/scripts/deescape-special-chars.perl < ${OUTPUT_DIR_DATA}/corpus.tc.en > ${OUTPUT_DIR}/train.tok.tc.en

# use newstest2016 as dev set
perl ${REPO_DIR}/scripts/deescape-special-chars.perl < ${OUTPUT_DIR_DATA}/dev/newstest2016.tc.en > ${OUTPUT_DIR}/dev.tok.tc.en
perl ${REPO_DIR}/scripts/deescape-special-chars.perl < ${OUTPUT_DIR_DATA}/dev/newstest2016.tc.de > ${OUTPUT_DIR}/dev.tok.tc.de

# Convert newstest2017 data into raw text format
${REPO_DIR}/scripts/input-from-sgm.perl \
    < ${OUTPUT_DIR_DATA}/test/newstest2017-deen-src.de.sgm \
    > ${OUTPUT_DIR_DATA}/test/newstest2017.deen.de
${REPO_DIR}/scripts/input-from-sgm.perl \
    < ${OUTPUT_DIR_DATA}/test/newstest2017-deen-ref.en.sgm \
    > ${OUTPUT_DIR_DATA}/test/newstest2017.deen.en
${REPO_DIR}/scripts/input-from-sgm.perl \
    < ${OUTPUT_DIR_DATA}/test/newstest2017-ende-src.en.sgm \
    > ${OUTPUT_DIR_DATA}/test/newstest2017.ende.en
${REPO_DIR}/scripts/input-from-sgm.perl \
    < ${OUTPUT_DIR_DATA}/test/newstest2017-ende-ref.de.sgm \
    > ${OUTPUT_DIR_DATA}/test/newstest2017.ende.de

cp ${OUTPUT_DIR_DATA}/dev/newstest2016-deen* ${OUTPUT_DIR}/dev/
cp ${OUTPUT_DIR_DATA}/dev/newstest2016-ende* ${OUTPUT_DIR}/dev/
cp ${OUTPUT_DIR_DATA}/test/newstest2017-deen* ${OUTPUT_DIR}/test/
cp ${OUTPUT_DIR_DATA}/test/newstest2017-ende* ${OUTPUT_DIR}/test/

# tokenize: normalize punctuation, tokenize (aggressive, no-escape), truecase
echo "Tokenize..."
cat ${OUTPUT_DIR_DATA}/test/newstest2017.deen.de | \
    ${REPO_DIR}/scripts/normalize-punctuation.perl -l de | \
    ${REPO_DIR}/scripts/tokenizer.perl -a -q -l de -no-escape | \
    ${REPO_DIR}/scripts/truecase.perl -model ${OUTPUT_DIR_DATA}/truecase-model.de > ${OUTPUT_DIR}/newstest2017.deen.tok.tc.de

cat ${OUTPUT_DIR_DATA}/test/newstest2017.deen.en | \
    ${REPO_DIR}/scripts/normalize-punctuation.perl -l en | \
    ${REPO_DIR}/scripts/tokenizer.perl -a -q -l en -no-escape | \
    ${REPO_DIR}/scripts/truecase.perl -model ${OUTPUT_DIR_DATA}/truecase-model.en > ${OUTPUT_DIR}/newstest2017.deen.tok.tc.en

cat ${OUTPUT_DIR_DATA}/test/newstest2017.ende.de | \
    ${REPO_DIR}/scripts/normalize-punctuation.perl -l de | \
    ${REPO_DIR}/scripts/tokenizer.perl -a -q -l de -no-escape | \
    ${REPO_DIR}/scripts/truecase.perl -model ${OUTPUT_DIR_DATA}/truecase-model.de > ${OUTPUT_DIR}/newstest2017.ende.tok.tc.de

cat ${OUTPUT_DIR_DATA}/test/newstest2017.ende.en | \
    ${REPO_DIR}/scripts/normalize-punctuation.perl -l en | \
    ${REPO_DIR}/scripts/tokenizer.perl -a -q -l en -no-escape | \
    ${REPO_DIR}/scripts/truecase.perl -model ${OUTPUT_DIR_DATA}/truecase-model.en > ${OUTPUT_DIR}/newstest2017.ende.tok.tc.en

# filter by length ratio (en/de length ratio must lie in [0.4, 2.0])
echo "Filtering by sentence length ratio..."
cp ${REPO_DIR}/programs/LenRatioRemover.class .
java LenRatioRemover ${OUTPUT_DIR}/train.tok.tc.en ${OUTPUT_DIR}/train.tok.tc.de 2.0 0.4 ${OUTPUT_DIR}/train.tok.tc.en.rm ${OUTPUT_DIR}/train.tok.tc.de.rm ${OUTPUT_DIR_DATA}/train.lenratio.removed
mv ${OUTPUT_DIR}/train.tok.tc.de ${OUTPUT_DIR_DATA}/train.tok.tc.de
mv ${OUTPUT_DIR}/train.tok.tc.en ${OUTPUT_DIR_DATA}/train.tok.tc.en
mv ${OUTPUT_DIR}/train.tok.tc.de.rm ${OUTPUT_DIR}/train.tok.tc.de
mv ${OUTPUT_DIR}/train.tok.tc.en.rm ${OUTPUT_DIR}/train.tok.tc.en
rm ./LenRatioRemover.class

# filter ugly sentences
echo "Filtering ugly sentences..."
cp ${REPO_DIR}/programs/SpecialSentRemoverENDE.class .
java SpecialSentRemoverENDE ${OUTPUT_DIR}/train.tok.tc.en ${OUTPUT_DIR}/train.tok.tc.de ${OUTPUT_DIR}/train.tok.tc.en.rm ${OUTPUT_DIR}/train.tok.tc.de.rm ${OUTPUT_DIR_DATA}/train.special.removed
mv ${OUTPUT_DIR}/train.tok.tc.de ${OUTPUT_DIR_DATA}/train.tok.tc.de.lenrm
mv ${OUTPUT_DIR}/train.tok.tc.en ${OUTPUT_DIR_DATA}/train.tok.tc.en.lenrm
mv ${OUTPUT_DIR}/train.tok.tc.de.rm ${OUTPUT_DIR}/train.tok.tc.de
mv ${OUTPUT_DIR}/train.tok.tc.en.rm ${OUTPUT_DIR}/train.tok.tc.en
rm ./SpecialSentRemoverENDE.class

# merge the two sides line-by-line, sort -u to deduplicate pairs, then split back
cp ${REPO_DIR}/programs/MergeAndSplit.class ./
java MergeAndSplit merge ${OUTPUT_DIR}/train.tok.tc.en ${OUTPUT_DIR}/train.tok.tc.de ${OUTPUT_DIR}/merged
mv ${OUTPUT_DIR}/train.tok.tc.de ${OUTPUT_DIR_DATA}/train.tok.tc.de.sprm
mv ${OUTPUT_DIR}/train.tok.tc.en ${OUTPUT_DIR_DATA}/train.tok.tc.en.sprm
echo "Sorting and removing duplicated sentences..."
sort -u ${OUTPUT_DIR}/merged > ${OUTPUT_DIR}/merged.sort

java MergeAndSplit split ${OUTPUT_DIR}/train.tok.tc.en ${OUTPUT_DIR}/train.tok.tc.de ${OUTPUT_DIR}/merged.sort
rm ${OUTPUT_DIR}/merged.sort ./MergeAndSplit.class ${OUTPUT_DIR}/merged

# the files are already cleaned, we only need to learn BPE
echo "Learning BPE with merge_ops=${MERGE_OPS}. This may take a while..."
# FIX: invoke via `python` like the other BPE scripts below, instead of
# relying on the script's executable bit / shebang.
python ${REPO_DIR}/bpe/learn_joint_bpe_and_vocab.py -i ${OUTPUT_DIR}/train.tok.tc.de ${OUTPUT_DIR}/train.tok.tc.en \
    --write-vocabulary ${OUTPUT_DIR}/vocab.de ${OUTPUT_DIR}/vocab.en -s ${MERGE_OPS} -o ${OUTPUT_DIR}/bpe.${MERGE_OPS}

echo "Apply bpe..."
python ${REPO_DIR}/bpe/apply_bpe.py -c ${OUTPUT_DIR}/bpe.${MERGE_OPS} --vocabulary ${OUTPUT_DIR}/vocab.de --vocabulary-threshold ${BPE_THRESHOLD} \
    --input ${OUTPUT_DIR}/train.tok.tc.de --output ${OUTPUT_DIR}/train.tok.tc.bpe90k.de

python ${REPO_DIR}/bpe/apply_bpe.py -c ${OUTPUT_DIR}/bpe.${MERGE_OPS} --vocabulary ${OUTPUT_DIR}/vocab.en --vocabulary-threshold ${BPE_THRESHOLD} \
    --input ${OUTPUT_DIR}/train.tok.tc.en --output ${OUTPUT_DIR}/train.tok.tc.bpe90k.en

echo "Generate vocabulary..."
python ${REPO_DIR}/bpe/generate_vocab.py ${OUTPUT_DIR}/train.tok.tc.bpe90k.de > ${OUTPUT_DIR}/vocab.bpe90k.all.de
python ${REPO_DIR}/bpe/generate_vocab.py ${OUTPUT_DIR}/train.tok.tc.bpe90k.en > ${OUTPUT_DIR}/vocab.bpe90k.all.en

echo "shuffling data..."
python ${REPO_DIR}/scripts/shuffle.py ${OUTPUT_DIR}/train.tok.tc.de,${OUTPUT_DIR}/train.tok.tc.en ${OUTPUT_DIR}/train.tok.tc.de.shuf,${OUTPUT_DIR}/train.tok.tc.en.shuf

# remove intermediate files
rm -r ${OUTPUT_DIR_DATA}

echo "All done."
4 | 5 | # Any single upper case letter followed by a period is not a sentence ender 6 | # (excluding I occasionally, but we leave it in) 7 | # usually upper case letters are initials in a name 8 | A 9 | Ā 10 | B 11 | C 12 | Č 13 | D 14 | E 15 | Ē 16 | F 17 | G 18 | Ģ 19 | H 20 | I 21 | Ī 22 | J 23 | K 24 | Ķ 25 | L 26 | Ļ 27 | M 28 | N 29 | Ņ 30 | O 31 | P 32 | Q 33 | R 34 | S 35 | Š 36 | T 37 | U 38 | Ū 39 | V 40 | W 41 | X 42 | Y 43 | Z 44 | Ž 45 | 46 | # Initialis -- Džonas 47 | Dz 48 | Dž 49 | Just 50 | 51 | # Day and month abbreviations 52 | # m. menesis d. diena g. gimes 53 | m 54 | mėn 55 | d 56 | g 57 | gim 58 | # Pirmadienis Penktadienis 59 | Pr 60 | Pn 61 | Pirm 62 | Antr 63 | Treč 64 | Ketv 65 | Penkt 66 | Šešt 67 | Sekm 68 | Saus 69 | Vas 70 | Kov 71 | Bal 72 | Geg 73 | Birž 74 | Liep 75 | Rugpj 76 | Rugs 77 | Spal 78 | Lapkr 79 | Gruod 80 | 81 | # Business, governmental, geographical terms 82 | a 83 | # aikštė 84 | adv 85 | # advokatas 86 | akad 87 | # akademikas 88 | aklg 89 | # akligatvis 90 | akt 91 | # aktorius 92 | al 93 | # alėja 94 | A.V 95 | # antspaudo vieta 96 | aps 97 | apskr 98 | # apskritis 99 | apyg 100 | # apygarda 101 | aps 102 | apskr 103 | # apskritis 104 | asist 105 | # asistentas 106 | asmv 107 | avd 108 | # asmenvardis 109 | a.k 110 | asm 111 | asm.k 112 | # asmens kodas 113 | atsak 114 | # atsakingasis 115 | atsisk 116 | sąsk 117 | # atsiskaitomoji sąskaita 118 | aut 119 | # autorius 120 | b 121 | k 122 | b.k 123 | # banko kodas 124 | bkl 125 | # bakalauras 126 | bt 127 | # butas 128 | buv 129 | # buvęs, -usi 130 | dail 131 | # dailininkas 132 | dek 133 | # dekanas 134 | dėst 135 | # dėstytojas 136 | dir 137 | # direktorius 138 | dirig 139 | # dirigentas 140 | doc 141 | # docentas 142 | drp 143 | # durpynas 144 | dš 145 | # dešinysis 146 | egz 147 | # egzempliorius 148 | eil 149 | # eilutė 150 | ekon 151 | # ekonomika 152 | el 153 | # elektroninis 154 | etc 155 | ež 156 | # ežeras 157 | faks 158 | # faksas 159 | fak 160 | # 
fakultetas 161 | gen 162 | # generolas 163 | gyd 164 | # gydytojas 165 | gv 166 | # gyvenvietė 167 | įl 168 | # įlanka 169 | Įn 170 | # įnagininkas 171 | insp 172 | # inspektorius 173 | pan 174 | # ir panašiai 175 | t.t 176 | # ir taip toliau 177 | k.a 178 | # kaip antai 179 | kand 180 | # kandidatas 181 | kat 182 | # katedra 183 | kyš 184 | # kyšulys 185 | kl 186 | # klasė 187 | kln 188 | # kalnas 189 | kn 190 | # knyga 191 | koresp 192 | # korespondentas 193 | kpt 194 | # kapitonas 195 | kr 196 | # kairysis 197 | kt 198 | # kitas 199 | kun 200 | # kunigas 201 | l 202 | e 203 | p 204 | l.e.p 205 | # laikinai einantis pareigas 206 | ltn 207 | # leitenantas 208 | m 209 | mst 210 | # miestas 211 | m.e 212 | # mūsų eros 213 | m.m 214 | # mokslo metai 215 | mot 216 | # moteris 217 | mstl 218 | # miestelis 219 | mgr 220 | # magistras 221 | mgnt 222 | # magistrantas 223 | mjr 224 | # majoras 225 | mln 226 | # milijonas 227 | mlrd 228 | # milijardas 229 | mok 230 | # mokinys 231 | mokyt 232 | # mokytojas 233 | moksl 234 | # mokslinis 235 | nkt 236 | # nekaitomas 237 | ntk 238 | # neteiktinas 239 | Nr 240 | nr 241 | # numeris 242 | p 243 | # ponas 244 | p.d 245 | a.d 246 | # pašto dėžutė, abonentinė dėžutė 247 | p.m.e 248 | # prieš mūsų erą 249 | pan 250 | # ir panašiai 251 | pav 252 | # paveikslas 253 | pavad 254 | # pavaduotojas 255 | pirm 256 | # pirmininkas 257 | pl 258 | # plentas 259 | plg 260 | # palygink 261 | plk 262 | # pulkininkas; pelkė 263 | pr 264 | # prospektas 265 | Kr 266 | pr.Kr 267 | # prieš Kristų 268 | prok 269 | # prokuroras 270 | prot 271 | # protokolas 272 | pss 273 | # pusiasalis 274 | pšt 275 | # paštas 276 | pvz 277 | # pavyzdžiui 278 | r 279 | # rajonas 280 | red 281 | # redaktorius 282 | rš 283 | # raštų kalbos 284 | sąs 285 | # sąsiuvinis 286 | saviv 287 | sav 288 | # savivaldybė 289 | sekr 290 | # sekretorius 291 | sen 292 | # seniūnija, seniūnas 293 | sk 294 | # skaityk; skyrius 295 | skg 296 | # skersgatvis 297 | skyr 298 | sk 299 | # 
skyrius 300 | skv 301 | # skveras 302 | sp 303 | # spauda; spaustuvė 304 | spec 305 | # specialistas 306 | sr 307 | # sritis 308 | st 309 | # stotis 310 | str 311 | # straipsnis 312 | stud 313 | # studentas 314 | š 315 | š.m 316 | # šių metų 317 | šnek 318 | # šnekamosios 319 | tir 320 | # tiražas 321 | tūkst 322 | # tūkstantis 323 | up 324 | # upė 325 | upl 326 | # upelis 327 | vad 328 | # vadinamasis, -oji 329 | vlsč 330 | # valsčius 331 | ved 332 | # vedėjas 333 | vet 334 | # veterinarija 335 | virš 336 | # viršininkas, viršaitis 337 | vyr 338 | # vyriausiasis, -ioji; vyras 339 | vyresn 340 | # vyresnysis 341 | vlsč 342 | # valsčius 343 | vs 344 | # viensėdis 345 | Vt 346 | vt 347 | # vietininkas 348 | vtv 349 | vv 350 | # vietovardis 351 | žml 352 | # žemėlapis 353 | 354 | # Technical terms, abbreviations used in guidebooks, advertisments, etc. 355 | # Generally lower-case. 356 | air 357 | # airiškai 358 | amer 359 | # amerikanizmas 360 | anat 361 | # anatomija 362 | angl 363 | # angl. angliskai 364 | arab 365 | # arabų 366 | archeol 367 | archit 368 | asm 369 | # asmuo 370 | astr 371 | # astronomija 372 | austral 373 | # australiškai 374 | aut 375 | # automobilis 376 | av 377 | # aviacija 378 | bažn 379 | bdv 380 | # būdvardis 381 | bibl 382 | # Biblija 383 | biol 384 | # biologija 385 | bot 386 | # botanika 387 | brt 388 | # burtai, burtažodis. 389 | brus 390 | # baltarusių 391 | buh 392 | # buhalterija 393 | chem 394 | # chemija 395 | col 396 | # collectivum 397 | con 398 | conj 399 | # conjunctivus, jungtukas 400 | dab 401 | # dab. 
dabartine 402 | dgs 403 | # daugiskaita 404 | dial 405 | # dialektizmas 406 | dipl 407 | dktv 408 | # daiktavardis 409 | džn 410 | # dažnai 411 | ekon 412 | el 413 | # elektra 414 | esam 415 | # esamasis laikas 416 | euf 417 | # eufemizmas 418 | fam 419 | # familiariai 420 | farm 421 | # farmacija 422 | filol 423 | # filologija 424 | filos 425 | # filosofija 426 | fin 427 | # finansai 428 | fiz 429 | # fizika 430 | fiziol 431 | # fiziologija 432 | flk 433 | # folkloras 434 | fon 435 | # fonetika 436 | fot 437 | # fotografija 438 | geod 439 | # geodezija 440 | geogr 441 | geol 442 | # geologija 443 | geom 444 | # geometrija 445 | glžk 446 | gr 447 | # graikų 448 | gram 449 | her 450 | # heraldika 451 | hidr 452 | # hidrotechnika 453 | ind 454 | # Indų 455 | iron 456 | # ironiškai 457 | isp 458 | # ispanų 459 | ist 460 | istor 461 | # istorija 462 | it 463 | # italų 464 | įv 465 | reikšm 466 | įv.reikšm 467 | # įvairiomis reikšmėmis 468 | jap 469 | # japonų 470 | juok 471 | # juokaujamai 472 | jūr 473 | # jūrininkystė 474 | kalb 475 | # kalbotyra 476 | kar 477 | # karyba 478 | kas 479 | # kasyba 480 | kin 481 | # kinematografija 482 | klaus 483 | # klausiamasis 484 | knyg 485 | # knyginis 486 | kom 487 | # komercija 488 | komp 489 | # kompiuteris 490 | kosm 491 | # kosmonautika 492 | kt 493 | # kitas 494 | kul 495 | # kulinarija 496 | kuop 497 | # kuopine 498 | l 499 | # laikas 500 | lit 501 | # literatūrinis 502 | lingv 503 | # lingvistika 504 | log 505 | # logika 506 | lot 507 | # lotynų 508 | mat 509 | # matematika 510 | maž 511 | # mažybinis 512 | med 513 | # medicina 514 | medž 515 | # medžioklė 516 | men 517 | # menas 518 | menk 519 | # menkinamai 520 | metal 521 | # metalurgija 522 | meteor 523 | min 524 | # mineralogija 525 | mit 526 | # mitologija 527 | mok 528 | # mokyklinis 529 | ms 530 | # mįslė 531 | muz 532 | # muzikinis 533 | n 534 | # naujasis 535 | neig 536 | # neigiamasis 537 | neol 538 | # neologizmas 539 | niek 540 | # niekinamai 541 | ofic 542 | 
# oficialus 543 | opt 544 | # optika 545 | orig 546 | # original 547 | p 548 | # pietūs 549 | pan 550 | # panašiai 551 | parl 552 | # parlamentas 553 | pat 554 | # patarlė 555 | paž 556 | # pažodžiui 557 | plg 558 | # palygink 559 | poet 560 | # poetizmas 561 | poez 562 | # poezija 563 | poligr 564 | # poligrafija 565 | polit 566 | # politika 567 | ppr 568 | # paprastai 569 | pranc 570 | pr 571 | # prancūzų, prūsų 572 | priet 573 | # prietaras 574 | prek 575 | # prekyba 576 | prk 577 | # perkeltine 578 | prs 579 | # persona, asmuo 580 | psn 581 | # pasenęs žodis 582 | psich 583 | # psichologija 584 | pvz 585 | # pavyzdžiui 586 | r 587 | # rytai 588 | rad 589 | # radiotechnika 590 | rel 591 | # religija 592 | ret 593 | # retai 594 | rus 595 | # rusų 596 | sen 597 | # senasis 598 | sl 599 | # slengas, slavų 600 | sov 601 | # sovietinis 602 | spec 603 | # specialus 604 | sport 605 | stat 606 | # statyba 607 | sudurt 608 | # sudurtinis 609 | sutr 610 | # sutrumpintas 611 | suv 612 | # suvalkiečių 613 | š 614 | # šiaurė 615 | šach 616 | # šachmatai 617 | šiaur 618 | škot 619 | # škotiškai 620 | šnek 621 | # šnekamoji 622 | teatr 623 | tech 624 | techn 625 | # technika 626 | teig 627 | # teigiamas 628 | teis 629 | # teisė 630 | tekst 631 | # tekstilė 632 | tel 633 | # telefonas 634 | teol 635 | # teologija 636 | v 637 | # tik vyriškosios, vakarai 638 | t.p 639 | t 640 | p 641 | # ir taip pat 642 | t.t 643 | # ir taip toliau 644 | t.y 645 | # tai yra 646 | vaik 647 | # vaikų 648 | vart 649 | # vartojama 650 | vet 651 | # veterinarija 652 | vid 653 | # vidurinis 654 | vksm 655 | # veiksmažodis 656 | vns 657 | # vienaskaita 658 | vok 659 | # vokiečių 660 | vulg 661 | # vulgariai 662 | zool 663 | # zoologija 664 | žr 665 | # žiūrėk 666 | ž.ū 667 | ž 668 | ū 669 | # žemės ūkis 670 | 671 | # List of titles. These are often followed by upper-case names, but do 672 | # not indicate sentence breaks 673 | # 674 | # Jo Eminencija 675 | Em. 
676 | # Gerbiamasis 677 | Gerb 678 | gerb 679 | # malonus 680 | malon 681 | # profesorius 682 | Prof 683 | prof 684 | # daktaras (mokslų) 685 | Dr 686 | dr 687 | habil 688 | med 689 | # inž inžinierius 690 | inž 691 | Inž 692 | 693 | 694 | #Numbers only. These should only induce breaks when followed by a numeric sequence 695 | # add NUMERIC_ONLY after the word for this function 696 | #This case is mostly for the english "No." which can either be a sentence of its own, or 697 | #if followed by a number, a non-breaking prefix 698 | No #NUMERIC_ONLY# 699 | -------------------------------------------------------------------------------- /bpe/learn_bpe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | """Use byte pair encoding (BPE) to learn a variable-length encoding of the vocabulary in a text. 6 | Unlike the original BPE, it does not compress the plain text, but can be used to reduce the vocabulary 7 | of a text to a configurable number of symbols, with only a small increase in the number of tokens. 8 | 9 | Reference: 10 | Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units. 11 | Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. 
"""
# (module docstring closed above; the reference text begins on the previous line)

from __future__ import unicode_literals

import sys
import codecs
import re
import copy
import argparse
from collections import defaultdict, Counter

# hack for python2/3 compatibility
from io import open
argparse.open = open

def create_parser():
    """Build the command-line argument parser for BPE learning.

    Returns:
        argparse.ArgumentParser with options: --input/-i, --output/-o,
        --symbols/-s (number of merge operations), --min-frequency,
        --dict-input (input is "word count" pairs), --verbose/-v.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="learn BPE-based word segmentation")

    parser.add_argument(
        '--input', '-i', type=argparse.FileType('r'), default=sys.stdin,
        metavar='PATH',
        help="Input text (default: standard input).")

    parser.add_argument(
        '--output', '-o', type=argparse.FileType('w'), default=sys.stdout,
        metavar='PATH',
        help="Output file for BPE codes (default: standard output)")
    parser.add_argument(
        '--symbols', '-s', type=int, default=10000,
        help="Create this many new symbols (each representing a character n-gram) (default: %(default)s))")
    parser.add_argument(
        '--min-frequency', type=int, default=2, metavar='FREQ',
        help='Stop if no symbol pair has frequency >= FREQ (default: %(default)s))')
    parser.add_argument('--dict-input', action="store_true",
        help="If set, input file is interpreted as a dictionary where each line contains a word-count pair")
    parser.add_argument(
        '--verbose', '-v', action="store_true",
        help="verbose mode.")

    return parser

def get_vocabulary(fobj, is_dict=False):
    """Read text and return dictionary that encodes vocabulary

    Args:
        fobj: open file object (or iterable of lines).
        is_dict: if True, each line is a "word count" pair; otherwise lines
            are whitespace-tokenized and token occurrences are counted.

    Returns:
        collections.Counter mapping word -> frequency.
    """
    vocab = Counter()
    for line in fobj:
        if is_dict:
            # each line is "word count"; a malformed line (not exactly two
            # fields) raises ValueError here
            word, count = line.strip().split()
            vocab[word] = int(count)
        else:
            for word in line.split():
                vocab[word] += 1
    return vocab

def update_pair_statistics(pair, changed, stats, indices):
    """Minimally update the indices and frequency of symbol pairs

    if we merge a pair of symbols, only pairs that overlap with occurrences
    of this pair are affected, and need to be updated.

    Args:
        pair: the (first, second) symbol pair that was just merged.
        changed: list of (word_index, new_word, old_word, freq) tuples for
            every vocabulary entry the merge touched.
        stats: dict mapping symbol pair -> frequency; updated in place.
        indices: dict mapping symbol pair -> {word_index: count}; updated
            in place.
    """
    # the merged pair no longer exists as a pair; reset its statistics
    stats[pair] = 0
    indices[pair] = defaultdict(int)
    first, second = pair
    new_pair = first+second
    for j, word, old_word, freq in changed:

        # find all instances of pair, and update frequency/indices around it
        i = 0
        while True:
            # find first symbol
            try:
                i = old_word.index(first, i)
            except ValueError:
                break
            # if first symbol is followed by second symbol, we've found an occurrence of pair (old_word[i:i+2])
            if i < len(old_word)-1 and old_word[i+1] == second:
                # assuming a symbol sequence "A B C", if "B C" is merged, reduce the frequency of "A B"
                if i:
                    prev = old_word[i-1:i+1]
                    stats[prev] -= freq
                    indices[prev][j] -= 1
                if i < len(old_word)-2:
                    # assuming a symbol sequence "A B C B", if "B C" is merged, reduce the frequency of "C B".
                    # however, skip this if the sequence is A B C B C, because the frequency of "C B" will be reduced by the previous code block
                    if old_word[i+2] != first or i >= len(old_word)-3 or old_word[i+3] != second:
                        nex = old_word[i+1:i+3]
                        stats[nex] -= freq
                        indices[nex][j] -= 1
                # skip past both symbols of the matched pair
                i += 2
            else:
                i += 1

        i = 0
        while True:
            try:
                # find new pair
                i = word.index(new_pair, i)
            except ValueError:
                break
            # assuming a symbol sequence "A BC D", if "B C" is merged, increase the frequency of "A BC"
            if i:
                prev = word[i-1:i+1]
                stats[prev] += freq
                indices[prev][j] += 1
            # assuming a symbol sequence "A BC B", if "B C" is merged, increase the frequency of "BC B"
            # however, if the sequence is A BC BC, skip this step because the count of "BC BC" will be incremented by the previous code block
            if i < len(word)-1 and word[i+1] != new_pair:
                nex = word[i:i+2]
                stats[nex] += freq
                indices[nex][j] += 1
            i += 1

127 | def get_pair_statistics(vocab): 128 | """Count frequency of all symbol pairs, and create index""" 129 | 130 | # data structure of pair frequencies 131 | stats = defaultdict(int) 132 | 133 | #index from pairs to words 134 | indices = defaultdict(lambda: defaultdict(int)) 135 | 136 | for i, (word, freq) in enumerate(vocab): 137 | prev_char = word[0] 138 | for char in word[1:]: 139 | stats[prev_char, char] += freq 140 | indices[prev_char, char][i] += 1 141 | prev_char = char 142 | 143 | return stats, indices 144 | 145 | 146 | def replace_pair(pair, vocab, indices): 147 | """Replace all occurrences of a symbol pair ('A', 'B') with a new symbol 'AB'""" 148 | first, second = pair 149 | pair_str = ''.join(pair) 150 | pair_str = pair_str.replace('\\','\\\\') 151 | changes = [] 152 | pattern = re.compile(r'(?'); 191 | # version numbering allows bckward compatibility 192 | outfile.write('#version: 0.2\n') 193 | 194 | vocab = get_vocabulary(infile, is_dict) 195 | vocab = dict([(tuple(x[:-1])+(x[-1]+'',) ,y) for (x,y) in vocab.items()]) 196 | sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True) 197 | 198 | stats, indices = get_pair_statistics(sorted_vocab) 199 | big_stats = copy.deepcopy(stats) 200 | # threshold is inspired by Zipfian assumption, but should only affect speed 201 | threshold = max(stats.values()) / 10 202 | for i in range(num_symbols): 203 | if stats: 204 | most_frequent = max(stats, key=lambda x: (stats[x], x)) 205 | 206 | # we probably missed the best pair because of pruning; go back to full statistics 207 | if not stats or (i and stats[most_frequent] < threshold): 208 | prune_stats(stats, big_stats, threshold) 209 | stats = copy.deepcopy(big_stats) 210 | most_frequent = max(stats, key=lambda x: (stats[x], x)) 211 | # threshold is inspired by Zipfian assumption, but should only affect speed 212 | threshold = stats[most_frequent] * i/(i+10000.0) 213 | prune_stats(stats, big_stats, threshold) 214 | 215 | if stats[most_frequent] < 
min_frequency: 216 | sys.stderr.write('no pair has frequency >= {0}. Stopping\n'.format(min_frequency)) 217 | break 218 | 219 | if verbose: 220 | sys.stderr.write('pair {0}: {1} {2} -> {1}{2} (frequency {3})\n'.format(i, most_frequent[0], most_frequent[1], stats[most_frequent])) 221 | outfile.write('{0} {1}\n'.format(*most_frequent)) 222 | changes = replace_pair(most_frequent, sorted_vocab, indices) 223 | update_pair_statistics(most_frequent, changes, stats, indices) 224 | stats[most_frequent] = 0 225 | if not i % 100: 226 | prune_stats(stats, big_stats, threshold) 227 | 228 | 229 | if __name__ == '__main__': 230 | 231 | # python 2/3 compatibility 232 | if sys.version_info < (3, 0): 233 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) 234 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) 235 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin) 236 | else: 237 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer) 238 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer) 239 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer) 240 | 241 | parser = create_parser() 242 | args = parser.parse_args() 243 | 244 | # read/write files as UTF-8 245 | if args.input.name != '': 246 | args.input = codecs.open(args.input.name, encoding='utf-8') 247 | if args.output.name != '': 248 | args.output = codecs.open(args.output.name, 'w', encoding='utf-8') 249 | 250 | main(args.input, args.output, args.symbols, args.min_frequency, args.verbose, is_dict=args.dict_input) 251 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /bpe/apply_bpe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | """Use operations learned with learn_bpe.py to encode a new text. 6 | The text will not be smaller, but use only a fixed vocabulary, with rare words 7 | encoded as variable-length sequences of subword units. 8 | 9 | Reference: 10 | Rico Sennrich, Barry Haddow and Alexandra Birch (2015). Neural Machine Translation of Rare Words with Subword Units. 11 | Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. 
12 | """ 13 | 14 | from __future__ import unicode_literals, division 15 | 16 | import sys 17 | import codecs 18 | import io 19 | import argparse 20 | import re 21 | 22 | # hack for python2/3 compatibility 23 | from io import open 24 | argparse.open = open 25 | 26 | class BPE(object): 27 | 28 | def __init__(self, codes, merges=-1, separator='@@', vocab=None, glossaries=None): 29 | 30 | # check version information 31 | firstline = codes.readline() 32 | if firstline.startswith('#version:'): 33 | self.version = tuple([int(x) for x in re.sub(r'(\.0+)*$','', firstline.split()[-1]).split(".")]) 34 | else: 35 | self.version = (0, 1) 36 | codes.seek(0) 37 | 38 | self.bpe_codes = [tuple(item.split()) for (n, item) in enumerate(codes) if (n < merges or merges == -1)] 39 | 40 | # some hacking to deal with duplicates (only consider first instance) 41 | self.bpe_codes = dict([(code,i) for (i,code) in reversed(list(enumerate(self.bpe_codes)))]) 42 | 43 | self.bpe_codes_reverse = dict([(pair[0] + pair[1], pair) for pair,i in self.bpe_codes.items()]) 44 | 45 | self.separator = separator 46 | 47 | self.vocab = vocab 48 | 49 | self.glossaries = glossaries if glossaries else [] 50 | 51 | self.cache = {} 52 | 53 | def segment(self, sentence): 54 | """segment single sentence (whitespace-tokenized string) with BPE encoding""" 55 | output = [] 56 | for word in sentence.split(): 57 | new_word = [out for segment in self._isolate_glossaries(word) 58 | for out in encode(segment, 59 | self.bpe_codes, 60 | self.bpe_codes_reverse, 61 | self.vocab, 62 | self.separator, 63 | self.version, 64 | self.cache, 65 | self.glossaries)] 66 | 67 | for item in new_word[:-1]: 68 | output.append(item + self.separator) 69 | output.append(new_word[-1]) 70 | 71 | return ' '.join(output) 72 | 73 | def _isolate_glossaries(self, word): 74 | word_segments = [word] 75 | for gloss in self.glossaries: 76 | word_segments = [out_segments for segment in word_segments 77 | for out_segments in isolate_glossary(segment, 
gloss)] 78 | return word_segments 79 | 80 | def create_parser(): 81 | parser = argparse.ArgumentParser( 82 | formatter_class=argparse.RawDescriptionHelpFormatter, 83 | description="learn BPE-based word segmentation") 84 | 85 | parser.add_argument( 86 | '--input', '-i', type=argparse.FileType('r'), default=sys.stdin, 87 | metavar='PATH', 88 | help="Input file (default: standard input).") 89 | parser.add_argument( 90 | '--codes', '-c', type=argparse.FileType('r'), metavar='PATH', 91 | required=True, 92 | help="File with BPE codes (created by learn_bpe.py).") 93 | parser.add_argument( 94 | '--merges', '-m', type=int, default=-1, 95 | metavar='INT', 96 | help="Use this many BPE operations (<= number of learned symbols)"+ 97 | "default: Apply all the learned merge operations") 98 | parser.add_argument( 99 | '--output', '-o', type=argparse.FileType('w'), default=sys.stdout, 100 | metavar='PATH', 101 | help="Output file (default: standard output)") 102 | parser.add_argument( 103 | '--separator', '-s', type=str, default='@@', metavar='STR', 104 | help="Separator between non-final subword units (default: '%(default)s'))") 105 | parser.add_argument( 106 | '--vocabulary', type=argparse.FileType('r'), default=None, 107 | metavar="PATH", 108 | help="Vocabulary file (built with get_vocab.py). If provided, this script reverts any merge operations that produce an OOV.") 109 | parser.add_argument( 110 | '--vocabulary-threshold', type=int, default=None, 111 | metavar="INT", 112 | help="Vocabulary threshold. If vocabulary is provided, any word with frequency < threshold will be treated as OOV") 113 | parser.add_argument( 114 | '--glossaries', type=str, nargs='+', default=None, 115 | metavar="STR", 116 | help="Glossaries. The strings provided in glossaries will not be affected"+ 117 | "by the BPE (i.e. 
they will neither be broken into subwords, nor concatenated with other subwords") 118 | 119 | return parser 120 | 121 | def get_pairs(word): 122 | """Return set of symbol pairs in a word. 123 | 124 | word is represented as tuple of symbols (symbols being variable-length strings) 125 | """ 126 | pairs = set() 127 | prev_char = word[0] 128 | for char in word[1:]: 129 | pairs.add((prev_char, char)) 130 | prev_char = char 131 | return pairs 132 | 133 | def encode(orig, bpe_codes, bpe_codes_reverse, vocab, separator, version, cache, glossaries=None): 134 | """Encode word based on list of BPE merge operations, which are applied consecutively 135 | """ 136 | 137 | if orig in cache: 138 | return cache[orig] 139 | 140 | if orig in glossaries: 141 | cache[orig] = (orig,) 142 | return (orig,) 143 | 144 | if version == (0, 1): 145 | word = tuple(orig) + ('',) 146 | elif version == (0, 2): # more consistent handling of word-final segments 147 | word = tuple(orig[:-1]) + ( orig[-1] + '',) 148 | else: 149 | raise NotImplementedError 150 | 151 | pairs = get_pairs(word) 152 | 153 | if not pairs: 154 | return orig 155 | 156 | while True: 157 | bigram = min(pairs, key = lambda pair: bpe_codes.get(pair, float('inf'))) 158 | if bigram not in bpe_codes: 159 | break 160 | first, second = bigram 161 | new_word = [] 162 | i = 0 163 | while i < len(word): 164 | try: 165 | j = word.index(first, i) 166 | new_word.extend(word[i:j]) 167 | i = j 168 | except: 169 | new_word.extend(word[i:]) 170 | break 171 | 172 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 173 | new_word.append(first+second) 174 | i += 2 175 | else: 176 | new_word.append(word[i]) 177 | i += 1 178 | new_word = tuple(new_word) 179 | word = new_word 180 | if len(word) == 1: 181 | break 182 | else: 183 | pairs = get_pairs(word) 184 | 185 | # don't print end-of-word symbols 186 | if word[-1] == '': 187 | word = word[:-1] 188 | elif word[-1].endswith(''): 189 | word = word[:-1] + (word[-1].replace('',''),) 190 | 
191 | if vocab: 192 | word = check_vocab_and_split(word, bpe_codes_reverse, vocab, separator) 193 | 194 | cache[orig] = word 195 | return word 196 | 197 | def recursive_split(segment, bpe_codes, vocab, separator, final=False): 198 | """Recursively split segment into smaller units (by reversing BPE merges) 199 | until all units are either in-vocabulary, or cannot be split futher.""" 200 | 201 | try: 202 | if final: 203 | left, right = bpe_codes[segment + ''] 204 | right = right[:-4] 205 | else: 206 | left, right = bpe_codes[segment] 207 | except: 208 | #sys.stderr.write('cannot split {0} further.\n'.format(segment)) 209 | yield segment 210 | return 211 | 212 | if left + separator in vocab: 213 | yield left 214 | else: 215 | for item in recursive_split(left, bpe_codes, vocab, separator, False): 216 | yield item 217 | 218 | if (final and right in vocab) or (not final and right + separator in vocab): 219 | yield right 220 | else: 221 | for item in recursive_split(right, bpe_codes, vocab, separator, final): 222 | yield item 223 | 224 | def check_vocab_and_split(orig, bpe_codes, vocab, separator): 225 | """Check for each segment in word if it is in-vocabulary, 226 | and segment OOV segments into smaller units by reversing the BPE merge operations""" 227 | 228 | out = [] 229 | 230 | for segment in orig[:-1]: 231 | if segment + separator in vocab: 232 | out.append(segment) 233 | else: 234 | #sys.stderr.write('OOV: {0}\n'.format(segment)) 235 | for item in recursive_split(segment, bpe_codes, vocab, separator, False): 236 | out.append(item) 237 | 238 | segment = orig[-1] 239 | if segment in vocab: 240 | out.append(segment) 241 | else: 242 | #sys.stderr.write('OOV: {0}\n'.format(segment)) 243 | for item in recursive_split(segment, bpe_codes, vocab, separator, True): 244 | out.append(item) 245 | 246 | return out 247 | 248 | 249 | def read_vocabulary(vocab_file, threshold): 250 | """read vocabulary file produced by get_vocab.py, and filter according to frequency threshold. 
251 | """ 252 | 253 | vocabulary = set() 254 | 255 | for line in vocab_file: 256 | word, freq = line.split() 257 | freq = int(freq) 258 | if threshold == None or freq >= threshold: 259 | vocabulary.add(word) 260 | 261 | return vocabulary 262 | 263 | def isolate_glossary(word, glossary): 264 | """ 265 | Isolate a glossary present inside a word. 266 | 267 | Returns a list of subwords. In which all 'glossary' glossaries are isolated 268 | 269 | For example, if 'USA' is the glossary and '1934USABUSA' the word, the return value is: 270 | ['1934', 'USA', 'B', 'USA'] 271 | """ 272 | if word == glossary or glossary not in word: 273 | return [word] 274 | else: 275 | splits = word.split(glossary) 276 | segments = [segment.strip() for split in splits[:-1] for segment in [split, glossary] if segment != ''] 277 | return segments + [splits[-1].strip()] if splits[-1] != '' else segments 278 | 279 | if __name__ == '__main__': 280 | 281 | # python 2/3 compatibility 282 | if sys.version_info < (3, 0): 283 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) 284 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) 285 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin) 286 | else: 287 | sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') 288 | sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8') 289 | sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', write_through=True, line_buffering=True) 290 | 291 | parser = create_parser() 292 | args = parser.parse_args() 293 | 294 | # read/write files as UTF-8 295 | args.codes = codecs.open(args.codes.name, encoding='utf-8') 296 | if args.input.name != '': 297 | args.input = codecs.open(args.input.name, encoding='utf-8') 298 | if args.output.name != '': 299 | args.output = codecs.open(args.output.name, 'w', encoding='utf-8') 300 | if args.vocabulary: 301 | args.vocabulary = codecs.open(args.vocabulary.name, encoding='utf-8') 302 | 303 | if args.vocabulary: 304 | vocabulary = 
read_vocabulary(args.vocabulary, args.vocabulary_threshold) 305 | else: 306 | vocabulary = None 307 | 308 | bpe = BPE(args.codes, args.merges, args.separator, vocabulary, args.glossaries) 309 | 310 | for line in args.input: 311 | args.output.write(bpe.segment(line).strip()) 312 | args.output.write('\n') 313 | -------------------------------------------------------------------------------- /scripts/tokenizer.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 5 | 6 | use warnings; 7 | 8 | # Sample Tokenizer 9 | ### Version 1.1 10 | # written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn 11 | # Version 1.1 updates: 12 | # (1) add multithreading option "-threads NUM_THREADS" (default is 1); 13 | # (2) add a timing option "-time" to calculate the average speed of this tokenizer; 14 | # (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed); 15 | ### Version 1.0 16 | # $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $ 17 | # written by Josh Schroeder, based on code by Philipp Koehn 18 | 19 | binmode(STDIN, ":utf8"); 20 | binmode(STDOUT, ":utf8"); 21 | 22 | use warnings; 23 | use FindBin qw($RealBin); 24 | use strict; 25 | use Time::HiRes; 26 | 27 | if (eval {require Thread;1;}) { 28 | #module loaded 29 | Thread->import(); 30 | } 31 | 32 | my $mydir = "$RealBin//nonbreaking_prefixes"; 33 | 34 | my %NONBREAKING_PREFIX = (); 35 | my @protected_patterns = (); 36 | my $protected_patterns_file = ""; 37 | my $language = "en"; 38 | my $QUIET = 0; 39 | my $HELP = 0; 40 | my $AGGRESSIVE = 0; 41 | my $SKIP_XML = 0; 42 | my $TIMING = 
0; 43 | my $NUM_THREADS = 1; 44 | my $NUM_SENTENCES_PER_THREAD = 2000; 45 | my $PENN = 0; 46 | my $NO_ESCAPING = 0; 47 | while (@ARGV) 48 | { 49 | $_ = shift; 50 | /^-b$/ && ($| = 1, next); 51 | /^-l$/ && ($language = shift, next); 52 | /^-q$/ && ($QUIET = 1, next); 53 | /^-h$/ && ($HELP = 1, next); 54 | /^-x$/ && ($SKIP_XML = 1, next); 55 | /^-a$/ && ($AGGRESSIVE = 1, next); 56 | /^-time$/ && ($TIMING = 1, next); 57 | # Option to add list of regexps to be protected 58 | /^-protected/ && ($protected_patterns_file = shift, next); 59 | /^-threads$/ && ($NUM_THREADS = int(shift), next); 60 | /^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next); 61 | /^-penn$/ && ($PENN = 1, next); 62 | /^-no-escape/ && ($NO_ESCAPING = 1, next); 63 | } 64 | 65 | # for time calculation 66 | my $start_time; 67 | if ($TIMING) 68 | { 69 | $start_time = [ Time::HiRes::gettimeofday( ) ]; 70 | } 71 | 72 | # print help message 73 | if ($HELP) 74 | { 75 | print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n"; 76 | print "Options:\n"; 77 | print " -q ... quiet.\n"; 78 | print " -a ... aggressive hyphen splitting.\n"; 79 | print " -b ... disable Perl buffering.\n"; 80 | print " -time ... enable processing time calculation.\n"; 81 | print " -penn ... use Penn treebank-like tokenization.\n"; 82 | print " -protected FILE ... specify file with patters to be protected in tokenisation.\n"; 83 | print " -no-escape ... 
don't perform HTML escaping on apostrophy, quotes, etc.\n"; 84 | exit; 85 | } 86 | 87 | if (!$QUIET) 88 | { 89 | print STDERR "Tokenizer Version 1.1\n"; 90 | print STDERR "Language: $language\n"; 91 | print STDERR "Number of threads: $NUM_THREADS\n"; 92 | } 93 | 94 | # load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes 95 | load_prefixes($language,\%NONBREAKING_PREFIX); 96 | 97 | if (scalar(%NONBREAKING_PREFIX) eq 0) 98 | { 99 | print STDERR "Warning: No known abbreviations for language '$language'\n"; 100 | } 101 | 102 | # Load protected patterns 103 | if ($protected_patterns_file) 104 | { 105 | open(PP,$protected_patterns_file) || die "Unable to open $protected_patterns_file"; 106 | while() { 107 | chomp; 108 | push @protected_patterns, $_; 109 | } 110 | } 111 | 112 | my @batch_sentences = (); 113 | my @thread_list = (); 114 | my $count_sentences = 0; 115 | 116 | if ($NUM_THREADS > 1) 117 | {# multi-threading tokenization 118 | while() 119 | { 120 | $count_sentences = $count_sentences + 1; 121 | push(@batch_sentences, $_); 122 | if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS)) 123 | { 124 | # assign each thread work 125 | for (my $i=0; $i<$NUM_THREADS; $i++) 126 | { 127 | my $start_index = $i*$NUM_SENTENCES_PER_THREAD; 128 | my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1; 129 | my @subbatch_sentences = @batch_sentences[$start_index..$end_index]; 130 | my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences; 131 | push(@thread_list, $new_thread); 132 | } 133 | foreach (@thread_list) 134 | { 135 | my $tokenized_list = $_->join; 136 | foreach (@$tokenized_list) 137 | { 138 | print $_; 139 | } 140 | } 141 | # reset for the new run 142 | @thread_list = (); 143 | @batch_sentences = (); 144 | } 145 | } 146 | # the last batch 147 | if (scalar(@batch_sentences)>0) 148 | { 149 | # assign each thread work 150 | for (my $i=0; $i<$NUM_THREADS; $i++) 151 | { 152 | my $start_index = 
$i*$NUM_SENTENCES_PER_THREAD; 153 | if ($start_index >= scalar(@batch_sentences)) 154 | { 155 | last; 156 | } 157 | my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1; 158 | if ($end_index >= scalar(@batch_sentences)) 159 | { 160 | $end_index = scalar(@batch_sentences)-1; 161 | } 162 | my @subbatch_sentences = @batch_sentences[$start_index..$end_index]; 163 | my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences; 164 | push(@thread_list, $new_thread); 165 | } 166 | foreach (@thread_list) 167 | { 168 | my $tokenized_list = $_->join; 169 | foreach (@$tokenized_list) 170 | { 171 | print $_; 172 | } 173 | } 174 | } 175 | } 176 | else 177 | {# single thread only 178 | while() 179 | { 180 | if (($SKIP_XML && /^<.+>$/) || /^\s*$/) 181 | { 182 | #don't try to tokenize XML/HTML tag lines 183 | print $_; 184 | } 185 | else 186 | { 187 | print &tokenize($_); 188 | } 189 | } 190 | } 191 | 192 | if ($TIMING) 193 | { 194 | my $duration = Time::HiRes::tv_interval( $start_time ); 195 | print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n"); 196 | print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." 
milliseconds/line\n"); 197 | } 198 | 199 | ##################################################################################### 200 | # subroutines afterward 201 | 202 | # tokenize a batch of texts saved in an array 203 | # input: an array containing a batch of texts 204 | # return: another array containing a batch of tokenized texts for the input array 205 | sub tokenize_batch 206 | { 207 | my(@text_list) = @_; 208 | my(@tokenized_list) = (); 209 | foreach (@text_list) 210 | { 211 | if (($SKIP_XML && /^<.+>$/) || /^\s*$/) 212 | { 213 | #don't try to tokenize XML/HTML tag lines 214 | push(@tokenized_list, $_); 215 | } 216 | else 217 | { 218 | push(@tokenized_list, &tokenize($_)); 219 | } 220 | } 221 | return \@tokenized_list; 222 | } 223 | 224 | # the actual tokenize function which tokenizes one input string 225 | # input: one string 226 | # return: the tokenized string for the input string 227 | sub tokenize 228 | { 229 | my($text) = @_; 230 | 231 | if ($PENN) { 232 | return tokenize_penn($text); 233 | } 234 | 235 | chomp($text); 236 | $text = " $text "; 237 | 238 | # remove ASCII junk 239 | $text =~ s/\s+/ /g; 240 | $text =~ s/[\000-\037]//g; 241 | 242 | # Find protected patterns 243 | my @protected = (); 244 | foreach my $protected_pattern (@protected_patterns) { 245 | my $t = $text; 246 | while ($t =~ /(?$protected_pattern)(?.*)$/) { 247 | push @protected, $+{PATTERN}; 248 | $t = $+{TAIL}; 249 | } 250 | } 251 | 252 | for (my $i = 0; $i < scalar(@protected); ++$i) { 253 | my $subst = sprintf("THISISPROTECTED%.3d", $i); 254 | $text =~ s,\Q$protected[$i], $subst ,g; 255 | } 256 | $text =~ s/ +/ /g; 257 | $text =~ s/^ //g; 258 | $text =~ s/ $//g; 259 | 260 | # seperate out all "other" special characters 261 | $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g; 262 | 263 | # aggressive hyphen splitting 264 | if ($AGGRESSIVE) 265 | { 266 | $text =~ s/([\p{IsAlnum}])\-(?=[\p{IsAlnum}])/$1 \@-\@ /g; 267 | } 268 | 269 | #multi-dots stay together 270 | $text =~ s/\.([\.]+)/ 
DOTMULTI$1/g; 271 | while($text =~ /DOTMULTI\./) 272 | { 273 | $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g; 274 | $text =~ s/DOTMULTI\./DOTDOTMULTI/g; 275 | } 276 | 277 | # seperate out "," except if within numbers (5,300) 278 | #$text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; 279 | 280 | # separate out "," except if within numbers (5,300) 281 | # previous "global" application skips some: A,B,C,D,E > A , B,C , D,E 282 | # first application uses up B so rule can't see B,C 283 | # two-step version here may create extra spaces but these are removed later 284 | # will also space digit,letter or letter,digit forms (redundant with next section) 285 | $text =~ s/([^\p{IsN}])[,]/$1 , /g; 286 | $text =~ s/[,]([^\p{IsN}])/ , $1/g; 287 | 288 | # separate "," after a number if it's the end of a sentence 289 | $text =~ s/([\p{IsN}])[,]$/$1 ,/g; 290 | 291 | # separate , pre and post number 292 | #$text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; 293 | #$text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g; 294 | 295 | # turn `into ' 296 | #$text =~ s/\`/\'/g; 297 | 298 | #turn '' into " 299 | #$text =~ s/\'\'/ \" /g; 300 | 301 | if ($language eq "en") 302 | { 303 | #split contractions right 304 | $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 305 | $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g; 306 | $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 307 | $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g; 308 | #special case for "1990's" 309 | $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g; 310 | } 311 | elsif (($language eq "fr") or ($language eq "it") or ($language eq "ga")) 312 | { 313 | #split contractions left 314 | $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 315 | $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g; 316 | $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 317 | $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g; 318 | } 319 | else 320 | { 321 | $text =~ s/\'/ \' /g; 322 | } 323 | 324 | #word token 
method 325 | my @words = split(/\s/,$text); 326 | $text = ""; 327 | for (my $i=0;$i<(scalar(@words));$i++) 328 | { 329 | my $word = $words[$i]; 330 | if ( $word =~ /^(\S+)\.$/) 331 | { 332 | my $pre = $1; 333 | if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i/\>/g; # xml 377 | $text =~ s/\'/\'/g; # xml 378 | $text =~ s/\"/\"/g; # xml 379 | $text =~ s/\[/\[/g; # syntax non-terminal 380 | $text =~ s/\]/\]/g; # syntax non-terminal 381 | } 382 | 383 | #ensure final line break 384 | $text .= "\n" unless $text =~ /\n$/; 385 | 386 | return $text; 387 | } 388 | 389 | sub tokenize_penn 390 | { 391 | # Improved compatibility with Penn Treebank tokenization. Useful if 392 | # the text is to later be parsed with a PTB-trained parser. 393 | # 394 | # Adapted from Robert MacIntyre's sed script: 395 | # http://www.cis.upenn.edu/~treebank/tokenizer.sed 396 | 397 | my($text) = @_; 398 | chomp($text); 399 | 400 | # remove ASCII junk 401 | $text =~ s/\s+/ /g; 402 | $text =~ s/[\000-\037]//g; 403 | 404 | # attempt to get correct directional quotes 405 | $text =~ s/^``/`` /g; 406 | $text =~ s/^"/`` /g; 407 | $text =~ s/^`([^`])/` $1/g; 408 | $text =~ s/^'/` /g; 409 | $text =~ s/([ ([{<])"/$1 `` /g; 410 | $text =~ s/([ ([{<])``/$1 `` /g; 411 | $text =~ s/([ ([{<])`([^`])/$1 ` $2/g; 412 | $text =~ s/([ ([{<])'/$1 ` /g; 413 | # close quotes handled at end 414 | 415 | $text =~ s=\.\.\.= _ELLIPSIS_ =g; 416 | 417 | # separate out "," except if within numbers (5,300) 418 | $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; 419 | # separate , pre and post number 420 | $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; 421 | $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g; 422 | 423 | #$text =~ s=([;:@#\$%&\p{IsSc}])= $1 =g; 424 | $text =~ s=([;:@#\$%&\p{IsSc}\p{IsSo}])= $1 =g; 425 | 426 | # Separate out intra-token slashes. 
PTB tokenization doesn't do this, so 427 | # the tokens should be merged prior to parsing with a PTB-trained parser 428 | # (see syntax-hyphen-splitting.perl). 429 | $text =~ s/([\p{IsAlnum}])\/([\p{IsAlnum}])/$1 \@\/\@ $2/g; 430 | 431 | # Assume sentence tokenization has been done first, so split FINAL periods 432 | # only. 433 | $text =~ s=([^.])([.])([\]\)}>"']*) ?$=$1 $2$3 =g; 434 | # however, we may as well split ALL question marks and exclamation points, 435 | # since they shouldn't have the abbrev.-marker ambiguity problem 436 | $text =~ s=([?!])= $1 =g; 437 | 438 | # parentheses, brackets, etc. 439 | $text =~ s=([\]\[\(\){}<>])= $1 =g; 440 | $text =~ s/\(/-LRB-/g; 441 | $text =~ s/\)/-RRB-/g; 442 | $text =~ s/\[/-LSB-/g; 443 | $text =~ s/\]/-RSB-/g; 444 | $text =~ s/{/-LCB-/g; 445 | $text =~ s/}/-RCB-/g; 446 | 447 | $text =~ s=--= -- =g; 448 | 449 | # First off, add a space to the beginning and end of each line, to reduce 450 | # necessary number of regexps. 451 | $text =~ s=$= =; 452 | $text =~ s=^= =; 453 | 454 | $text =~ s="= '' =g; 455 | # possessive or close-single-quote 456 | $text =~ s=([^'])' =$1 ' =g; 457 | # as in it's, I'm, we'd 458 | $text =~ s='([sSmMdD]) = '$1 =g; 459 | $text =~ s='ll = 'll =g; 460 | $text =~ s='re = 're =g; 461 | $text =~ s='ve = 've =g; 462 | $text =~ s=n't = n't =g; 463 | $text =~ s='LL = 'LL =g; 464 | $text =~ s='RE = 'RE =g; 465 | $text =~ s='VE = 'VE =g; 466 | $text =~ s=N'T = N'T =g; 467 | 468 | $text =~ s= ([Cc])annot = $1an not =g; 469 | $text =~ s= ([Dd])'ye = $1' ye =g; 470 | $text =~ s= ([Gg])imme = $1im me =g; 471 | $text =~ s= ([Gg])onna = $1on na =g; 472 | $text =~ s= ([Gg])otta = $1ot ta =g; 473 | $text =~ s= ([Ll])emme = $1em me =g; 474 | $text =~ s= ([Mm])ore'n = $1ore 'n =g; 475 | $text =~ s= '([Tt])is = '$1 is =g; 476 | $text =~ s= '([Tt])was = '$1 was =g; 477 | $text =~ s= ([Ww])anna = $1an na =g; 478 | 479 | #word token method 480 | my @words = split(/\s/,$text); 481 | $text = ""; 482 | for (my 
$i=0;$i<(scalar(@words));$i++) 483 | { 484 | my $word = $words[$i]; 485 | if ( $word =~ /^(\S+)\.$/) 486 | { 487 | my $pre = $1; 488 | if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i/\>/g; # xml 517 | $text =~ s/\'/\'/g; # xml 518 | $text =~ s/\"/\"/g; # xml 519 | $text =~ s/\[/\[/g; # syntax non-terminal 520 | $text =~ s/\]/\]/g; # syntax non-terminal 521 | 522 | #ensure final line break 523 | $text .= "\n" unless $text =~ /\n$/; 524 | 525 | return $text; 526 | } 527 | 528 | sub load_prefixes 529 | { 530 | my ($language, $PREFIX_REF) = @_; 531 | 532 | my $prefixfile = "$mydir/nonbreaking_prefix.$language"; 533 | 534 | #default back to English if we don't have a language-specific prefix file 535 | if (!(-e $prefixfile)) 536 | { 537 | $prefixfile = "$mydir/nonbreaking_prefix.en"; 538 | print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n"; 539 | die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile); 540 | } 541 | 542 | if (-e "$prefixfile") 543 | { 544 | open(PREFIX, "<:utf8", "$prefixfile"); 545 | while () 546 | { 547 | my $item = $_; 548 | chomp($item); 549 | if (($item) && (substr($item,0,1) ne "#")) 550 | { 551 | if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) 552 | { 553 | $PREFIX_REF->{$1} = 2; 554 | } 555 | else 556 | { 557 | $PREFIX_REF->{$item} = 1; 558 | } 559 | } 560 | } 561 | close(PREFIX); 562 | } 563 | } 564 | -------------------------------------------------------------------------------- /scripts/nonbreaking_prefixes/nonbreaking_prefix.el: -------------------------------------------------------------------------------- 1 | # Sigle letters in upper-case are usually abbreviations of names 2 | Α 3 | Β 4 | Γ 5 | Δ 6 | Ε 7 | Ζ 8 | Η 9 | Θ 10 | Ι 11 | Κ 12 | Λ 13 | Μ 14 | Ν 15 | Ξ 16 | Ο 17 | Π 18 | Ρ 19 | Σ 20 | Τ 21 | Υ 22 | Φ 23 | Χ 24 | Ψ 25 | Ω 26 | 27 | # Includes abbreviations for the Greek language 
compiled from various sources (Greek grammar books, Greek language related web content). 28 | Άθαν 29 | Έγχρ 30 | Έκθ 31 | Έσδ 32 | Έφ 33 | Όμ 34 | Α΄Έσδρ 35 | Α΄Έσδ 36 | Α΄Βασ 37 | Α΄Θεσ 38 | Α΄Ιω 39 | Α΄Κορινθ 40 | Α΄Κορ 41 | Α΄Μακκ 42 | Α΄Μακ 43 | Α΄Πέτρ 44 | Α΄Πέτ 45 | Α΄Παραλ 46 | Α΄Πε 47 | Α΄Σαμ 48 | Α΄Τιμ 49 | Α΄Χρον 50 | Α΄Χρ 51 | Α.Β.Α 52 | Α.Β 53 | Α.Ε 54 | Α.Κ.Τ.Ο 55 | Αέθλ 56 | Αέτ 57 | Αίλ.Δ 58 | Αίλ.Τακτ 59 | Αίσ 60 | Αββακ 61 | Αβυδ 62 | Αβ 63 | Αγάκλ 64 | Αγάπ 65 | Αγάπ.Αμαρτ.Σ 66 | Αγάπ.Γεωπ 67 | Αγαθάγγ 68 | Αγαθήμ 69 | Αγαθιν 70 | Αγαθοκλ 71 | Αγαθρχ 72 | Αγαθ 73 | Αγαθ.Ιστ 74 | Αγαλλ 75 | Αγαπητ 76 | Αγγ 77 | Αγησ 78 | Αγλ 79 | Αγορ.Κ 80 | Αγρο.Κωδ 81 | Αγρ.Εξ 82 | Αγρ.Κ 83 | Αγ.Γρ 84 | Αδριαν 85 | Αδρ 86 | Αετ 87 | Αθάν 88 | Αθήν 89 | Αθήν.Επιγρ 90 | Αθήν.Επιτ 91 | Αθήν.Ιατρ 92 | Αθήν.Μηχ 93 | Αθανάσ 94 | Αθαν 95 | Αθηνί 96 | Αθηναγ 97 | Αθηνόδ 98 | Αθ 99 | Αθ.Αρχ 100 | Αιλ 101 | Αιλ.Επιστ 102 | Αιλ.ΖΙ 103 | Αιλ.ΠΙ 104 | Αιλ.απ 105 | Αιμιλ 106 | Αιν.Γαζ 107 | Αιν.Τακτ 108 | Αισχίν 109 | Αισχίν.Επιστ 110 | Αισχ 111 | Αισχ.Αγαμ 112 | Αισχ.Αγ 113 | Αισχ.Αλ 114 | Αισχ.Ελεγ 115 | Αισχ.Επτ.Θ 116 | Αισχ.Ευμ 117 | Αισχ.Ικέτ 118 | Αισχ.Ικ 119 | Αισχ.Περσ 120 | Αισχ.Προμ.Δεσμ 121 | Αισχ.Πρ 122 | Αισχ.Χοηφ 123 | Αισχ.Χο 124 | Αισχ.απ 125 | ΑιτΕ 126 | Αιτ 127 | Αλκ 128 | Αλχιας 129 | Αμ.Π.Ο 130 | Αμβ 131 | Αμμών 132 | Αμ. 
133 | Αν.Πειθ.Συμβ.Δικ 134 | Ανακρ 135 | Ανακ 136 | Αναμν.Τόμ 137 | Αναπλ 138 | Ανδ 139 | Ανθλγος 140 | Ανθστης 141 | Αντισθ 142 | Ανχης 143 | Αν 144 | Αποκ 145 | Απρ 146 | Απόδ 147 | Απόφ 148 | Απόφ.Νομ 149 | Απ 150 | Απ.Δαπ 151 | Απ.Διατ 152 | Απ.Επιστ 153 | Αριθ 154 | Αριστοτ 155 | Αριστοφ 156 | Αριστοφ.Όρν 157 | Αριστοφ.Αχ 158 | Αριστοφ.Βάτρ 159 | Αριστοφ.Ειρ 160 | Αριστοφ.Εκκλ 161 | Αριστοφ.Θεσμ 162 | Αριστοφ.Ιππ 163 | Αριστοφ.Λυσ 164 | Αριστοφ.Νεφ 165 | Αριστοφ.Πλ 166 | Αριστοφ.Σφ 167 | Αριστ 168 | Αριστ.Αθ.Πολ 169 | Αριστ.Αισθ 170 | Αριστ.Αν.Πρ 171 | Αριστ.Ζ.Ι 172 | Αριστ.Ηθ.Ευδ 173 | Αριστ.Ηθ.Νικ 174 | Αριστ.Κατ 175 | Αριστ.Μετ 176 | Αριστ.Πολ 177 | Αριστ.Φυσιογν 178 | Αριστ.Φυσ 179 | Αριστ.Ψυχ 180 | Αριστ.Ρητ 181 | Αρμεν 182 | Αρμ 183 | Αρχ.Εκ.Καν.Δ 184 | Αρχ.Ευβ.Μελ 185 | Αρχ.Ιδ.Δ 186 | Αρχ.Νομ 187 | Αρχ.Ν 188 | Αρχ.Π.Ε 189 | Αρ 190 | Αρ.Φορ.Μητρ 191 | Ασμ 192 | Ασμ.ασμ 193 | Αστ.Δ 194 | Αστ.Χρον 195 | Ασ 196 | Ατομ.Γνωμ 197 | Αυγ 198 | Αφρ 199 | Αχ.Νομ 200 | Α 201 | Α.Εγχ.Π 202 | Α.Κ.΄Υδρας 203 | Β΄Έσδρ 204 | Β΄Έσδ 205 | Β΄Βασ 206 | Β΄Θεσ 207 | Β΄Ιω 208 | Β΄Κορινθ 209 | Β΄Κορ 210 | Β΄Μακκ 211 | Β΄Μακ 212 | Β΄Πέτρ 213 | Β΄Πέτ 214 | Β΄Πέ 215 | Β΄Παραλ 216 | Β΄Σαμ 217 | Β΄Τιμ 218 | Β΄Χρον 219 | Β΄Χρ 220 | Β.Ι.Π.Ε 221 | Β.Κ.Τ 222 | Β.Κ.Ψ.Β 223 | Β.Μ 224 | Β.Ο.Α.Κ 225 | Β.Ο.Α 226 | Β.Ο.Δ 227 | Βίβλ 228 | Βαρ 229 | ΒεΘ 230 | Βι.Περ 231 | Βιπερ 232 | Βιργ 233 | Βλγ 234 | Βούλ 235 | Βρ 236 | Γ΄Βασ 237 | Γ΄Μακκ 238 | ΓΕΝμλ 239 | Γέν 240 | Γαλ 241 | Γεν 242 | Γλ 243 | Γν.Ν.Σ.Κρ 244 | Γνωμ 245 | Γν 246 | Γράμμ 247 | Γρηγ.Ναζ 248 | Γρηγ.Νύσ 249 | Γ Νοσ 250 | Γ' Ογκολ 251 | Γ.Ν 252 | Δ΄Βασ 253 | Δ.Β 254 | Δ.Δίκη 255 | Δ.Δίκ 256 | Δ.Ε.Σ 257 | Δ.Ε.Φ.Α 258 | Δ.Ε.Φ 259 | Δ.Εργ.Ν 260 | Δαμ 261 | Δαμ.μνημ.έργ 262 | Δαν 263 | Δασ.Κ 264 | Δεκ 265 | Δελτ.Δικ.Ε.Τ.Ε 266 | Δελτ.Νομ 267 | Δελτ.Συνδ.Α.Ε 268 | Δερμ 269 | Δευτ 270 | Δεύτ 271 | Δημοσθ 272 | Δημόκρ 273 | Δι.Δικ 274 | Διάτ 275 | Διαιτ.Απ 276 | Διαιτ 277 | Διαρκ.Στρατ 278 | Δικ 279 | Διοίκ.Πρωτ 280 | ΔιοικΔνη 281 | 
Διοικ.Εφ 282 | Διον.Αρ 283 | Διόρθ.Λαθ 284 | Δ.κ.Π 285 | Δνη 286 | Δν 287 | Δογμ.Όρος 288 | Δρ 289 | Δ.τ.Α 290 | Δτ 291 | ΔωδΝομ 292 | Δ.Περ 293 | Δ.Στρ 294 | ΕΔΠολ 295 | ΕΕυρΚ 296 | ΕΙΣ 297 | ΕΝαυτΔ 298 | ΕΣΑμΕΑ 299 | ΕΣΘ 300 | ΕΣυγκΔ 301 | ΕΤρΑξΧρΔ 302 | Ε.Φ.Ε.Τ 303 | Ε.Φ.Ι 304 | Ε.Φ.Ο.Επ.Α 305 | Εβδ 306 | Εβρ 307 | Εγκύκλ.Επιστ 308 | Εγκ 309 | Εε.Αιγ 310 | Εθν.Κ.Τ 311 | Εθν 312 | Ειδ.Δικ.Αγ.Κακ 313 | Εικ 314 | Ειρ.Αθ 315 | Ειρην.Αθ 316 | Ειρην 317 | Έλεγχ 318 | Ειρ 319 | Εισ.Α.Π 320 | Εισ.Ε 321 | Εισ.Ν.Α.Κ 322 | Εισ.Ν.Κ.Πολ.Δ 323 | Εισ.Πρωτ 324 | Εισηγ.Έκθ 325 | Εισ 326 | Εκκλ 327 | Εκκ 328 | Εκ 329 | Ελλ.Δνη 330 | Εν.Ε 331 | Εξ 332 | Επ.Αν 333 | Επ.Εργ.Δ 334 | Επ.Εφ 335 | Επ.Κυπ.Δ 336 | Επ.Μεσ.Αρχ 337 | Επ.Νομ 338 | Επίκτ 339 | Επίκ 340 | Επι.Δ.Ε 341 | Επιθ.Ναυτ.Δικ 342 | Επικ 343 | Επισκ.Ε.Δ 344 | Επισκ.Εμπ.Δικ 345 | Επιστ.Επετ.Αρμ 346 | Επιστ.Επετ 347 | Επιστ.Ιερ 348 | Επιτρ.Προστ.Συνδ.Στελ 349 | Επιφάν 350 | Επτ.Εφ 351 | Επ.Ιρ 352 | Επ.Ι 353 | Εργ.Ασφ.Νομ 354 | Ερμ.Α.Κ 355 | Ερμη.Σ 356 | Εσθ 357 | Εσπερ 358 | Ετρ.Δ 359 | Ευκλ 360 | Ευρ.Δ.Δ.Α 361 | Ευρ.Σ.Δ.Α 362 | Ευρ.ΣτΕ 363 | Ευρατόμ 364 | Ευρ.Άλκ 365 | Ευρ.Ανδρομ 366 | Ευρ.Βάκχ 367 | Ευρ.Εκ 368 | Ευρ.Ελ 369 | Ευρ.Ηλ 370 | Ευρ.Ηρακ 371 | Ευρ.Ηρ 372 | Ευρ.Ηρ.Μαιν 373 | Ευρ.Ικέτ 374 | Ευρ.Ιππόλ 375 | Ευρ.Ιφ.Α 376 | Ευρ.Ιφ.Τ 377 | Ευρ.Ι.Τ 378 | Ευρ.Κύκλ 379 | Ευρ.Μήδ 380 | Ευρ.Ορ 381 | Ευρ.Ρήσ 382 | Ευρ.Τρωάδ 383 | Ευρ.Φοίν 384 | Εφ.Αθ 385 | Εφ.Εν 386 | Εφ.Επ 387 | Εφ.Θρ 388 | Εφ.Θ 389 | Εφ.Ι 390 | Εφ.Κερ 391 | Εφ.Κρ 392 | Εφ.Λ 393 | Εφ.Ν 394 | Εφ.Πατ 395 | Εφ.Πειρ 396 | Εφαρμ.Δ.Δ 397 | Εφαρμ 398 | Εφεσ 399 | Εφημ 400 | Εφ 401 | Ζαχ 402 | Ζιγ 403 | Ζυ 404 | Ζχ 405 | ΗΕ.Δ 406 | Ημερ 407 | Ηράκλ 408 | Ηροδ 409 | Ησίοδ 410 | Ησ 411 | Η.Ε.Γ 412 | ΘΗΣ 413 | ΘΡ 414 | Θαλ 415 | Θεοδ 416 | Θεοφ 417 | Θεσ 418 | Θεόδ.Μοψ 419 | Θεόκρ 420 | Θεόφιλ 421 | Θουκ 422 | Θρ 423 | Θρ.Ε 424 | Θρ.Ιερ 425 | Θρ.Ιρ 426 | Ιακ 427 | Ιαν 428 | Ιβ 429 | Ιδθ 430 | Ιδ 431 | Ιεζ 432 | Ιερ 433 | Ιζ 434 | Ιησ 435 | Ιησ.Ν 436 | Ικ 437 | Ιλ 438 
| Ιν 439 | Ιουδ 440 | Ιουστ 441 | Ιούδα 442 | Ιούλ 443 | Ιούν 444 | Ιπποκρ 445 | Ιππόλ 446 | Ιρ 447 | Ισίδ.Πηλ 448 | Ισοκρ 449 | Ισ.Ν 450 | Ιωβ 451 | Ιωλ 452 | Ιων 453 | Ιω 454 | ΚΟΣ 455 | ΚΟ.ΜΕ.ΚΟΝ 456 | ΚΠοινΔ 457 | ΚΠολΔ 458 | ΚαΒ 459 | Καλ 460 | Καλ.Τέχν 461 | ΚανΒ 462 | Καν.Διαδ 463 | Κατάργ 464 | Κλ 465 | ΚοινΔ 466 | Κολσ 467 | Κολ 468 | Κον 469 | Κορ 470 | Κος 471 | ΚριτΕπιθ 472 | ΚριτΕ 473 | Κριτ 474 | Κρ 475 | ΚτΒ 476 | ΚτΕ 477 | ΚτΠ 478 | Κυβ 479 | Κυπρ 480 | Κύριλ.Αλεξ 481 | Κύριλ.Ιερ 482 | Λεβ 483 | Λεξ.Σουίδα 484 | Λευϊτ 485 | Λευ 486 | Λκ 487 | Λογ 488 | ΛουκΑμ 489 | Λουκιαν 490 | Λουκ.Έρωτ 491 | Λουκ.Ενάλ.Διάλ 492 | Λουκ.Ερμ 493 | Λουκ.Εταιρ.Διάλ 494 | Λουκ.Ε.Δ 495 | Λουκ.Θε.Δ 496 | Λουκ.Ικ. 497 | Λουκ.Ιππ 498 | Λουκ.Λεξιφ 499 | Λουκ.Μεν 500 | Λουκ.Μισθ.Συν 501 | Λουκ.Ορχ 502 | Λουκ.Περ 503 | Λουκ.Συρ 504 | Λουκ.Τοξ 505 | Λουκ.Τυρ 506 | Λουκ.Φιλοψ 507 | Λουκ.Φιλ 508 | Λουκ.Χάρ 509 | Λουκ. 510 | Λουκ.Αλ 511 | Λοχ 512 | Λυδ 513 | Λυκ 514 | Λυσ 515 | Λωζ 516 | Λ1 517 | Λ2 518 | ΜΟΕφ 519 | Μάρκ 520 | Μέν 521 | Μαλ 522 | Ματθ 523 | Μα 524 | Μιχ 525 | Μκ 526 | Μλ 527 | Μμ 528 | Μον.Δ.Π 529 | Μον.Πρωτ 530 | Μον 531 | Μρ 532 | Μτ 533 | Μχ 534 | Μ.Βασ 535 | Μ.Πλ 536 | ΝΑ 537 | Ναυτ.Χρον 538 | Να 539 | Νδικ 540 | Νεεμ 541 | Νε 542 | Νικ 543 | ΝκΦ 544 | Νμ 545 | ΝοΒ 546 | Νομ.Δελτ.Τρ.Ελ 547 | Νομ.Δελτ 548 | Νομ.Σ.Κ 549 | Νομ.Χρ 550 | Νομ 551 | Νομ.Διεύθ 552 | Νοσ 553 | Ντ 554 | Νόσων 555 | Ν1 556 | Ν2 557 | Ν3 558 | Ν4 559 | Νtot 560 | Ξενοφ 561 | Ξεν 562 | Ξεν.Ανάβ 563 | Ξεν.Απολ 564 | Ξεν.Απομν 565 | Ξεν.Απομ 566 | Ξεν.Ελλ 567 | Ξεν.Ιέρ 568 | Ξεν.Ιππαρχ 569 | Ξεν.Ιππ 570 | Ξεν.Κυρ.Αν 571 | Ξεν.Κύρ.Παιδ 572 | Ξεν.Κ.Π 573 | Ξεν.Λακ.Πολ 574 | Ξεν.Οικ 575 | Ξεν.Προσ 576 | Ξεν.Συμπόσ 577 | Ξεν.Συμπ 578 | Ο΄ 579 | Οβδ 580 | Οβ 581 | ΟικΕ 582 | Οικ 583 | Οικ.Πατρ 584 | Οικ.Σύν.Βατ 585 | Ολομ 586 | Ολ 587 | Ολ.Α.Π 588 | Ομ.Ιλ 589 | Ομ.Οδ 590 | ΟπΤοιχ 591 | Οράτ 592 | Ορθ 593 | ΠΡΟ.ΠΟ 594 | Πίνδ 595 | Πίνδ.Ι 596 | Πίνδ.Νεμ 597 | Πίνδ.Ν 598 | Πίνδ.Ολ 599 | Πίνδ.Παθ 600 
| Πίνδ.Πυθ 601 | Πίνδ.Π 602 | ΠαγΝμλγ 603 | Παν 604 | Παρμ 605 | Παροιμ 606 | Παρ 607 | Παυσ 608 | Πειθ.Συμβ 609 | ΠειρΝ 610 | Πελ 611 | ΠεντΣτρ 612 | Πεντ 613 | Πεντ.Εφ 614 | ΠερΔικ 615 | Περ.Γεν.Νοσ 616 | Πετ 617 | Πλάτ 618 | Πλάτ.Αλκ 619 | Πλάτ.Αντ 620 | Πλάτ.Αξίοχ 621 | Πλάτ.Απόλ 622 | Πλάτ.Γοργ 623 | Πλάτ.Ευθ 624 | Πλάτ.Θεαίτ 625 | Πλάτ.Κρατ 626 | Πλάτ.Κριτ 627 | Πλάτ.Λύσ 628 | Πλάτ.Μεν 629 | Πλάτ.Νόμ 630 | Πλάτ.Πολιτ 631 | Πλάτ.Πολ 632 | Πλάτ.Πρωτ 633 | Πλάτ.Σοφ. 634 | Πλάτ.Συμπ 635 | Πλάτ.Τίμ 636 | Πλάτ.Φαίδρ 637 | Πλάτ.Φιλ 638 | Πλημ 639 | Πλούτ 640 | Πλούτ.Άρατ 641 | Πλούτ.Αιμ 642 | Πλούτ.Αλέξ 643 | Πλούτ.Αλκ 644 | Πλούτ.Αντ 645 | Πλούτ.Αρτ 646 | Πλούτ.Ηθ 647 | Πλούτ.Θεμ 648 | Πλούτ.Κάμ 649 | Πλούτ.Καίσ 650 | Πλούτ.Κικ 651 | Πλούτ.Κράσ 652 | Πλούτ.Κ 653 | Πλούτ.Λυκ 654 | Πλούτ.Μάρκ 655 | Πλούτ.Μάρ 656 | Πλούτ.Περ 657 | Πλούτ.Ρωμ 658 | Πλούτ.Σύλλ 659 | Πλούτ.Φλαμ 660 | Πλ 661 | Ποιν.Δικ 662 | Ποιν.Δ 663 | Ποιν.Ν 664 | Ποιν.Χρον 665 | Ποιν.Χρ 666 | Πολ.Δ 667 | Πολ.Πρωτ 668 | Πολ 669 | Πολ.Μηχ 670 | Πολ.Μ 671 | Πρακτ.Αναθ 672 | Πρακτ.Ολ 673 | Πραξ 674 | Πρμ 675 | Πρξ 676 | Πρωτ 677 | Πρ 678 | Πρ.Αν 679 | Πρ.Λογ 680 | Πταισμ 681 | Πυρ.Καλ 682 | Πόλη 683 | Π.Δ 684 | Π.Δ.Άσμ 685 | ΡΜ.Ε 686 | Ρθ 687 | Ρμ 688 | Ρωμ 689 | ΣΠλημ 690 | Σαπφ 691 | Σειρ 692 | Σολ 693 | Σοφ 694 | Σοφ.Αντιγ 695 | Σοφ.Αντ 696 | Σοφ.Αποσ 697 | Σοφ.Απ 698 | Σοφ.Ηλέκ 699 | Σοφ.Ηλ 700 | Σοφ.Οιδ.Κολ 701 | Σοφ.Οιδ.Τύρ 702 | Σοφ.Ο.Τ 703 | Σοφ.Σειρ 704 | Σοφ.Σολ 705 | Σοφ.Τραχ 706 | Σοφ.Φιλοκτ 707 | Σρ 708 | Σ.τ.Ε 709 | Σ.τ.Π 710 | Στρ.Π.Κ 711 | Στ.Ευρ 712 | Συζήτ 713 | Συλλ.Νομολ 714 | Συλ.Νομ 715 | ΣυμβΕπιθ 716 | Συμπ.Ν 717 | Συνθ.Αμ 718 | Συνθ.Ε.Ε 719 | Συνθ.Ε.Κ 720 | Συνθ.Ν 721 | Σφν 722 | Σφ 723 | Σφ.Σλ 724 | Σχ.Πολ.Δ 725 | Σχ.Συντ.Ε 726 | Σωσ 727 | Σύντ 728 | Σ.Πληρ 729 | ΤΘ 730 | ΤΣ.Δ 731 | Τίτ 732 | Τβ 733 | Τελ.Ενημ 734 | Τελ.Κ 735 | Τερτυλ 736 | Τιμ 737 | Τοπ.Α 738 | Τρ.Ο 739 | Τριμ 740 | Τριμ.Πλ 741 | Τρ.Πλημ 742 | Τρ.Π.Δ 743 | Τ.τ.Ε 744 | Ττ 745 | Τωβ 746 | Υγ 747 | Υπερ 748 | Υπ 749 
| Υ.Γ 750 | Φιλήμ 751 | Φιλιπ 752 | Φιλ 753 | Φλμ 754 | Φλ 755 | Φορ.Β 756 | Φορ.Δ.Ε 757 | Φορ.Δνη 758 | Φορ.Δ 759 | Φορ.Επ 760 | Φώτ 761 | Χρ.Ι.Δ 762 | Χρ.Ιδ.Δ 763 | Χρ.Ο 764 | Χρυσ 765 | Ψήφ 766 | Ψαλμ 767 | Ψαλ 768 | Ψλ 769 | Ωριγ 770 | Ωσ 771 | Ω.Ρ.Λ 772 | άγν 773 | άγν.ετυμολ 774 | άγ 775 | άκλ 776 | άνθρ 777 | άπ 778 | άρθρ 779 | άρν 780 | άρ 781 | άτ 782 | άψ 783 | ά 784 | έκδ 785 | έκφρ 786 | έμψ 787 | ένθ.αν 788 | έτ 789 | έ.α 790 | ίδ 791 | αβεστ 792 | αβησσ 793 | αγγλ 794 | αγγ 795 | αδημ 796 | αεροναυτ 797 | αερον 798 | αεροπ 799 | αθλητ 800 | αθλ 801 | αθροιστ 802 | αιγυπτ 803 | αιγ 804 | αιτιολ 805 | αιτ 806 | αι 807 | ακαδ 808 | ακκαδ 809 | αλβ 810 | αλλ 811 | αλφαβητ 812 | αμα 813 | αμερικ 814 | αμερ 815 | αμετάβ 816 | αμτβ 817 | αμφιβ 818 | αμφισβ 819 | αμφ 820 | αμ 821 | ανάλ 822 | ανάπτ 823 | ανάτ 824 | αναβ 825 | αναδαν 826 | αναδιπλασ 827 | αναδιπλ 828 | αναδρ 829 | αναλ 830 | αναν 831 | ανασυλλ 832 | ανατολ 833 | ανατομ 834 | ανατυπ 835 | ανατ 836 | αναφορ 837 | αναφ 838 | ανα.ε 839 | ανδρων 840 | ανθρωπολ 841 | ανθρωπ 842 | ανθ 843 | ανομ 844 | αντίτ 845 | αντδ 846 | αντιγρ 847 | αντιθ 848 | αντικ 849 | αντιμετάθ 850 | αντων 851 | αντ 852 | ανωτ 853 | ανόργ 854 | ανών 855 | αορ 856 | απαρέμφ 857 | απαρφ 858 | απαρχ 859 | απαρ 860 | απλολ 861 | απλοπ 862 | αποβ 863 | αποηχηροπ 864 | αποθ 865 | αποκρυφ 866 | αποφ 867 | απρμφ 868 | απρφ 869 | απρόσ 870 | απόδ 871 | απόλ 872 | απόσπ 873 | απόφ 874 | αραβοτουρκ 875 | αραβ 876 | αραμ 877 | αρβαν 878 | αργκ 879 | αριθμτ 880 | αριθμ 881 | αριθ 882 | αρκτικόλ 883 | αρκ 884 | αρμεν 885 | αρμ 886 | αρνητ 887 | αρσ 888 | αρχαιολ 889 | αρχιτεκτ 890 | αρχιτ 891 | αρχκ 892 | αρχ 893 | αρωμουν 894 | αρωμ 895 | αρ 896 | αρ.μετρ 897 | αρ.φ 898 | ασσυρ 899 | αστρολ 900 | αστροναυτ 901 | αστρον 902 | αττ 903 | αυστραλ 904 | αυτοπ 905 | αυτ 906 | αφγαν 907 | αφηρ 908 | αφομ 909 | αφρικ 910 | αχώρ 911 | αόρ 912 | α.α 913 | α/α 914 | α0 915 | βαθμ 916 | βαθ 917 | βαπτ 918 | βασκ 919 | βεβαιωτ 920 | βεβ 921 | βεδ 
922 | βενετ 923 | βεν 924 | βερβερ 925 | βιβλγρ 926 | βιολ 927 | βιομ 928 | βιοχημ 929 | βιοχ 930 | βλάχ 931 | βλ 932 | βλ.λ 933 | βοταν 934 | βοτ 935 | βουλγαρ 936 | βουλγ 937 | βούλ 938 | βραζιλ 939 | βρετον 940 | βόρ 941 | γαλλ 942 | γενικότ 943 | γενοβ 944 | γεν 945 | γερμαν 946 | γερμ 947 | γεωγρ 948 | γεωλ 949 | γεωμετρ 950 | γεωμ 951 | γεωπ 952 | γεωργ 953 | γλυπτ 954 | γλωσσολ 955 | γλωσσ 956 | γλ 957 | γνμδ 958 | γνμ 959 | γνωμ 960 | γοτθ 961 | γραμμ 962 | γραμ 963 | γρμ 964 | γρ 965 | γυμν 966 | δίδες 967 | δίκ 968 | δίφθ 969 | δαν 970 | δεικτ 971 | δεκατ 972 | δηλ 973 | δημογρ 974 | δημοτ 975 | δημώδ 976 | δημ 977 | διάγρ 978 | διάκρ 979 | διάλεξ 980 | διάλ 981 | διάσπ 982 | διαλεκτ 983 | διατρ 984 | διαφ 985 | διαχ 986 | διδα 987 | διεθν 988 | διεθ 989 | δικον 990 | διστ 991 | δισύλλ 992 | δισ 993 | διφθογγοπ 994 | δογμ 995 | δολ 996 | δοτ 997 | δρμ 998 | δρχ 999 | δρ(α) 1000 | δωρ 1001 | δ 1002 | εβρ 1003 | εγκλπ 1004 | εδ 1005 | εθνολ 1006 | εθν 1007 | ειδικότ 1008 | ειδ 1009 | ειδ.β 1010 | εικ 1011 | ειρ 1012 | εισ 1013 | εκατοστμ 1014 | εκατοστ 1015 | εκατστ.2 1016 | εκατστ.3 1017 | εκατ 1018 | εκδ 1019 | εκκλησ 1020 | εκκλ 1021 | εκ 1022 | ελλην 1023 | ελλ 1024 | ελνστ 1025 | ελπ 1026 | εμβ 1027 | εμφ 1028 | εναλλ 1029 | ενδ 1030 | ενεργ 1031 | ενεστ 1032 | ενικ 1033 | ενν 1034 | εν 1035 | εξέλ 1036 | εξακολ 1037 | εξομάλ 1038 | εξ 1039 | εο 1040 | επέκτ 1041 | επίδρ 1042 | επίθ 1043 | επίρρ 1044 | επίσ 1045 | επαγγελμ 1046 | επανάλ 1047 | επανέκδ 1048 | επιθ 1049 | επικ 1050 | επιμ 1051 | επιρρ 1052 | επιστ 1053 | επιτατ 1054 | επιφ 1055 | επών 1056 | επ 1057 | εργ 1058 | ερμ 1059 | ερρινοπ 1060 | ερωτ 1061 | ετρουσκ 1062 | ετυμ 1063 | ετ 1064 | ευφ 1065 | ευχετ 1066 | εφ 1067 | εύχρ 1068 | ε.α 1069 | ε/υ 1070 | ε0 1071 | ζωγρ 1072 | ζωολ 1073 | ηθικ 1074 | ηθ 1075 | ηλεκτρολ 1076 | ηλεκτρον 1077 | ηλεκτρ 1078 | ημίτ 1079 | ημίφ 1080 | ημιφ 1081 | ηχηροπ 1082 | ηχηρ 1083 | ηχομιμ 1084 | ηχ 1085 | η 1086 | θέατρ 1087 | θεολ 1088 | θετ 1089 | θηλ 
1090 | θρακ 1091 | θρησκειολ 1092 | θρησκ 1093 | θ 1094 | ιαπων 1095 | ιατρ 1096 | ιδιωμ 1097 | ιδ 1098 | ινδ 1099 | ιραν 1100 | ισπαν 1101 | ιστορ 1102 | ιστ 1103 | ισχυροπ 1104 | ιταλ 1105 | ιχθυολ 1106 | ιων 1107 | κάτ 1108 | καθ 1109 | κακοσ 1110 | καν 1111 | καρ 1112 | κατάλ 1113 | κατατ 1114 | κατωτ 1115 | κατ 1116 | κα 1117 | κελτ 1118 | κεφ 1119 | κινεζ 1120 | κινημ 1121 | κλητ 1122 | κλιτ 1123 | κλπ 1124 | κλ 1125 | κν 1126 | κοινωνιολ 1127 | κοινων 1128 | κοπτ 1129 | κουτσοβλαχ 1130 | κουτσοβλ 1131 | κπ 1132 | κρ.γν 1133 | κτγ 1134 | κτην 1135 | κτητ 1136 | κτλ 1137 | κτ 1138 | κυριολ 1139 | κυρ 1140 | κύρ 1141 | κ 1142 | κ.ά 1143 | κ.ά.π 1144 | κ.α 1145 | κ.εξ 1146 | κ.επ 1147 | κ.ε 1148 | κ.λπ 1149 | κ.λ.π 1150 | κ.ού.κ 1151 | κ.ο.κ 1152 | κ.τ.λ 1153 | κ.τ.τ 1154 | κ.τ.ό 1155 | λέξ 1156 | λαογρ 1157 | λαπ 1158 | λατιν 1159 | λατ 1160 | λαϊκότρ 1161 | λαϊκ 1162 | λετ 1163 | λιθ 1164 | λογιστ 1165 | λογοτ 1166 | λογ 1167 | λουβ 1168 | λυδ 1169 | λόγ 1170 | λ 1171 | λ.χ 1172 | μέλλ 1173 | μέσ 1174 | μαθημ 1175 | μαθ 1176 | μαιευτ 1177 | μαλαισ 1178 | μαλτ 1179 | μαμμων 1180 | μεγεθ 1181 | μεε 1182 | μειωτ 1183 | μελ 1184 | μεξ 1185 | μεσν 1186 | μεσογ 1187 | μεσοπαθ 1188 | μεσοφ 1189 | μετάθ 1190 | μεταβτ 1191 | μεταβ 1192 | μετακ 1193 | μεταπλ 1194 | μεταπτωτ 1195 | μεταρ 1196 | μεταφορ 1197 | μετβ 1198 | μετεπιθ 1199 | μετεπιρρ 1200 | μετεωρολ 1201 | μετεωρ 1202 | μετον 1203 | μετουσ 1204 | μετοχ 1205 | μετρ 1206 | μετ 1207 | μητρων 1208 | μηχανολ 1209 | μηχ 1210 | μικροβιολ 1211 | μογγολ 1212 | μορφολ 1213 | μουσ 1214 | μπενελούξ 1215 | μσνλατ 1216 | μσν 1217 | μτβ 1218 | μτγν 1219 | μτγ 1220 | μτφρδ 1221 | μτφρ 1222 | μτφ 1223 | μτχ 1224 | μυθ 1225 | μυκην 1226 | μυκ 1227 | μφ 1228 | μ 1229 | μ.ε 1230 | μ.μ 1231 | μ.π.ε 1232 | μ.π.π 1233 | μ0 1234 | ναυτ 1235 | νεοελλ 1236 | νεολατιν 1237 | νεολατ 1238 | νεολ 1239 | νεότ 1240 | νλατ 1241 | νομ 1242 | νορβ 1243 | νοσ 1244 | νότ 1245 | ν 1246 | ξ.λ 1247 | οικοδ 1248 | οικολ 1249 | οικον 1250 | οικ 1251 | 
ολλανδ 1252 | ολλ 1253 | ομηρ 1254 | ομόρρ 1255 | ονομ 1256 | ον 1257 | οπτ 1258 | ορθογρ 1259 | ορθ 1260 | οριστ 1261 | ορυκτολ 1262 | ορυκτ 1263 | ορ 1264 | οσετ 1265 | οσκ 1266 | ουαλ 1267 | ουγγρ 1268 | ουδ 1269 | ουσιαστικοπ 1270 | ουσιαστ 1271 | ουσ 1272 | πίν 1273 | παθητ 1274 | παθολ 1275 | παθ 1276 | παιδ 1277 | παλαιοντ 1278 | παλαιότ 1279 | παλ 1280 | παππων 1281 | παράγρ 1282 | παράγ 1283 | παράλλ 1284 | παράλ 1285 | παραγ 1286 | παρακ 1287 | παραλ 1288 | παραπ 1289 | παρατ 1290 | παρβ 1291 | παρετυμ 1292 | παροξ 1293 | παρων 1294 | παρωχ 1295 | παρ 1296 | παρ.φρ 1297 | πατριδων 1298 | πατρων 1299 | πβ 1300 | περιθ 1301 | περιλ 1302 | περιφρ 1303 | περσ 1304 | περ 1305 | πιθ 1306 | πληθ 1307 | πληροφ 1308 | ποδ 1309 | ποιητ 1310 | πολιτ 1311 | πολλαπλ 1312 | πολ 1313 | πορτογαλ 1314 | πορτ 1315 | ποσ 1316 | πρακριτ 1317 | πρβλ 1318 | πρβ 1319 | πργ 1320 | πρκμ 1321 | πρκ 1322 | πρλ 1323 | προέλ 1324 | προβηγκ 1325 | προελλ 1326 | προηγ 1327 | προθεμ 1328 | προπαραλ 1329 | προπαροξ 1330 | προπερισπ 1331 | προσαρμ 1332 | προσηγορ 1333 | προσταχτ 1334 | προστ 1335 | προσφών 1336 | προσ 1337 | προτακτ 1338 | προτ.Εισ 1339 | προφ 1340 | προχωρ 1341 | πρτ 1342 | πρόθ 1343 | πρόσθ 1344 | πρόσ 1345 | πρότ 1346 | πρ 1347 | πρ.Εφ 1348 | πτ 1349 | πυ 1350 | π 1351 | π.Χ 1352 | π.μ 1353 | π.χ 1354 | ρήμ 1355 | ρίζ 1356 | ρηματ 1357 | ρητορ 1358 | ριν 1359 | ρουμ 1360 | ρωμ 1361 | ρωσ 1362 | ρ 1363 | σανσκρ 1364 | σαξ 1365 | σελ 1366 | σερβοκρ 1367 | σερβ 1368 | σημασιολ 1369 | σημδ 1370 | σημειολ 1371 | σημερ 1372 | σημιτ 1373 | σημ 1374 | σκανδ 1375 | σκυθ 1376 | σκωπτ 1377 | σλαβ 1378 | σλοβ 1379 | σουηδ 1380 | σουμερ 1381 | σουπ 1382 | σπάν 1383 | σπανιότ 1384 | σπ 1385 | σσ 1386 | στατ 1387 | στερ 1388 | στιγμ 1389 | στιχ 1390 | στρέμ 1391 | στρατιωτ 1392 | στρατ 1393 | στ 1394 | συγγ 1395 | συγκρ 1396 | συγκ 1397 | συμπερ 1398 | συμπλεκτ 1399 | συμπλ 1400 | συμπροφ 1401 | συμφυρ 1402 | συμφ 1403 | συνήθ 1404 | συνίζ 1405 | συναίρ 1406 | συναισθ 1407 | συνδετ 
1408 | συνδ 1409 | συνεκδ 1410 | συνηρ 1411 | συνθετ 1412 | συνθ 1413 | συνοπτ 1414 | συντελ 1415 | συντομογρ 1416 | συντ 1417 | συν 1418 | συρ 1419 | σχημ 1420 | σχ 1421 | σύγκρ 1422 | σύμπλ 1423 | σύμφ 1424 | σύνδ 1425 | σύνθ 1426 | σύντμ 1427 | σύντ 1428 | σ 1429 | σ.π 1430 | σ/β 1431 | τακτ 1432 | τελ 1433 | τετρ 1434 | τετρ.μ 1435 | τεχνλ 1436 | τεχνολ 1437 | τεχν 1438 | τεύχ 1439 | τηλεπικ 1440 | τηλεόρ 1441 | τιμ 1442 | τιμ.τομ 1443 | τοΣ 1444 | τον 1445 | τοπογρ 1446 | τοπων 1447 | τοπ 1448 | τοσκ 1449 | τουρκ 1450 | τοχ 1451 | τριτοπρόσ 1452 | τροποπ 1453 | τροπ 1454 | τσεχ 1455 | τσιγγ 1456 | ττ 1457 | τυπ 1458 | τόμ 1459 | τόνν 1460 | τ 1461 | τ.μ 1462 | τ.χλμ 1463 | υβρ 1464 | υπερθ 1465 | υπερσ 1466 | υπερ 1467 | υπεύθ 1468 | υποθ 1469 | υποκορ 1470 | υποκ 1471 | υποσημ 1472 | υποτ 1473 | υποφ 1474 | υποχωρ 1475 | υπόλ 1476 | υπόχρ 1477 | υπ 1478 | υστλατ 1479 | υψόμ 1480 | υψ 1481 | φάκ 1482 | φαρμακολ 1483 | φαρμ 1484 | φιλολ 1485 | φιλοσ 1486 | φιλοτ 1487 | φινλ 1488 | φοινικ 1489 | φράγκ 1490 | φρανκον 1491 | φριζ 1492 | φρ 1493 | φυλλ 1494 | φυσιολ 1495 | φυσ 1496 | φωνηεντ 1497 | φωνητ 1498 | φωνολ 1499 | φων 1500 | φωτογρ 1501 | φ 1502 | φ.τ.μ 1503 | χαμιτ 1504 | χαρτόσ 1505 | χαρτ 1506 | χασμ 1507 | χαϊδ 1508 | χγφ 1509 | χειλ 1510 | χεττ 1511 | χημ 1512 | χιλ 1513 | χλγρ 1514 | χλγ 1515 | χλμ 1516 | χλμ.2 1517 | χλμ.3 1518 | χλσγρ 1519 | χλστγρ 1520 | χλστμ 1521 | χλστμ.2 1522 | χλστμ.3 1523 | χλ 1524 | χργρ 1525 | χρημ 1526 | χρον 1527 | χρ 1528 | χφ 1529 | χ.ε 1530 | χ.κ 1531 | χ.ο 1532 | χ.σ 1533 | χ.τ 1534 | χ.χ 1535 | ψευδ 1536 | ψυχαν 1537 | ψυχιατρ 1538 | ψυχολ 1539 | ψυχ 1540 | ωκεαν 1541 | όμ 1542 | όν 1543 | όπ.παρ 1544 | όπ.π 1545 | ό.π 1546 | ύψ 1547 | 1Βσ 1548 | 1Εσ 1549 | 1Θσ 1550 | 1Ιν 1551 | 1Κρ 1552 | 1Μκ 1553 | 1Πρ 1554 | 1Πτ 1555 | 1Τμ 1556 | 2Βσ 1557 | 2Εσ 1558 | 2Θσ 1559 | 2Ιν 1560 | 2Κρ 1561 | 2Μκ 1562 | 2Πρ 1563 | 2Πτ 1564 | 2Τμ 1565 | 3Βσ 1566 | 3Ιν 1567 | 3Μκ 1568 | 4Βσ 1569 | 
--------------------------------------------------------------------------------