├── .gitignore ├── src └── mosestokenizer │ ├── nonbreaking_prefixes │ ├── README.txt │ ├── nonbreaking_prefix.ro │ ├── nonbreaking_prefix.sv │ ├── nonbreaking_prefix.ca │ ├── nonbreaking_prefix.sl │ ├── nonbreaking_prefix.es │ ├── nonbreaking_prefix.lv │ ├── nonbreaking_prefix.fr │ ├── nonbreaking_prefix.en │ ├── nonbreaking_prefix.fi │ ├── nonbreaking_prefix.hu │ ├── nonbreaking_prefix.nl │ ├── nonbreaking_prefix.is │ ├── nonbreaking_prefix.it │ ├── nonbreaking_prefix.pl │ ├── nonbreaking_prefix.pt │ ├── nonbreaking_prefix.ru │ ├── nonbreaking_prefix.ta │ ├── nonbreaking_prefix.de │ ├── nonbreaking_prefix.cs │ ├── nonbreaking_prefix.sk │ └── nonbreaking_prefix.el │ ├── __init__.py │ ├── normalize-punctuation.perl │ ├── detokenizer.py │ ├── punctnormalizer.py │ ├── tokenizer.py │ ├── tokenizer-v1.0.perl │ ├── split-sentences.perl │ ├── sentsplitter.py │ ├── detokenizer.perl │ └── tokenizer-v1.1.perl ├── setup.py ├── README.rst └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info/ 2 | __pycache__/ 3 | *.pyc 4 | dist/ 5 | build/ 6 | -------------------------------------------------------------------------------- /src/mosestokenizer/nonbreaking_prefixes/README.txt: -------------------------------------------------------------------------------- 1 | The language suffix can be found here: 2 | 3 | http://www.loc.gov/standards/iso639-2/php/code_list.php 4 | 5 | This code includes data from Daniel Naber's Language Tools (czech abbreviations). 6 | This code includes data from czech wiktionary (also czech abbreviations). 
7 | 8 | 9 | -------------------------------------------------------------------------------- /src/mosestokenizer/nonbreaking_prefixes/nonbreaking_prefix.ro: -------------------------------------------------------------------------------- 1 | A 2 | B 3 | C 4 | D 5 | E 6 | F 7 | G 8 | H 9 | I 10 | J 11 | K 12 | L 13 | M 14 | N 15 | O 16 | P 17 | Q 18 | R 19 | S 20 | T 21 | U 22 | V 23 | W 24 | X 25 | Y 26 | Z 27 | dpdv 28 | etc 29 | șamd 30 | M.Ap.N 31 | dl 32 | Dl 33 | d-na 34 | D-na 35 | dvs 36 | Dvs 37 | pt 38 | Pt 39 | -------------------------------------------------------------------------------- /src/mosestokenizer/nonbreaking_prefixes/nonbreaking_prefix.sv: -------------------------------------------------------------------------------- 1 | #single upper case letter are usually initials 2 | A 3 | B 4 | C 5 | D 6 | E 7 | F 8 | G 9 | H 10 | I 11 | J 12 | K 13 | L 14 | M 15 | N 16 | O 17 | P 18 | Q 19 | R 20 | S 21 | T 22 | U 23 | V 24 | W 25 | X 26 | Y 27 | Z 28 | #misc abbreviations 29 | AB 30 | G 31 | VG 32 | dvs 33 | etc 34 | from 35 | iaf 36 | jfr 37 | kl 38 | kr 39 | mao 40 | mfl 41 | mm 42 | osv 43 | pga 44 | tex 45 | tom 46 | vs 47 | -------------------------------------------------------------------------------- /src/mosestokenizer/nonbreaking_prefixes/nonbreaking_prefix.ca: -------------------------------------------------------------------------------- 1 | Dr 2 | Dra 3 | pàg 4 | p 5 | c 6 | av 7 | Sr 8 | Sra 9 | adm 10 | esq 11 | Prof 12 | S.A 13 | S.L 14 | p.e 15 | ptes 16 | Sta 17 | St 18 | pl 19 | màx 20 | cast 21 | dir 22 | nre 23 | fra 24 | admdora 25 | Emm 26 | Excma 27 | espf 28 | dc 29 | admdor 30 | tel 31 | angl 32 | aprox 33 | ca 34 | dept 35 | dj 36 | dl 37 | dt 38 | ds 39 | dg 40 | dv 41 | ed 42 | entl 43 | al 44 | i.e 45 | maj 46 | smin 47 | n 48 | núm 49 | pta 50 | A 51 | B 52 | C 53 | D 54 | E 55 | F 56 | G 57 | H 58 | I 59 | J 60 | K 61 | L 62 | M 63 | N 64 | O 65 | P 66 | Q 67 | R 68 | S 69 | T 70 | U 71 | V 72 | W 73 | X 74 | Y 75 | 
Z 76 | -------------------------------------------------------------------------------- /src/mosestokenizer/nonbreaking_prefixes/nonbreaking_prefix.sl: -------------------------------------------------------------------------------- 1 | dr 2 | Dr 3 | itd 4 | itn 5 | št #NUMERIC_ONLY# 6 | Št #NUMERIC_ONLY# 7 | d 8 | jan 9 | Jan 10 | feb 11 | Feb 12 | mar 13 | Mar 14 | apr 15 | Apr 16 | jun 17 | Jun 18 | jul 19 | Jul 20 | avg 21 | Avg 22 | sept 23 | Sept 24 | sep 25 | Sep 26 | okt 27 | Okt 28 | nov 29 | Nov 30 | dec 31 | Dec 32 | tj 33 | Tj 34 | npr 35 | Npr 36 | sl 37 | Sl 38 | op 39 | Op 40 | gl 41 | Gl 42 | oz 43 | Oz 44 | prev 45 | dipl 46 | ing 47 | prim 48 | Prim 49 | cf 50 | Cf 51 | gl 52 | Gl 53 | A 54 | B 55 | C 56 | D 57 | E 58 | F 59 | G 60 | H 61 | I 62 | J 63 | K 64 | L 65 | M 66 | N 67 | O 68 | P 69 | Q 70 | R 71 | S 72 | T 73 | U 74 | V 75 | W 76 | X 77 | Y 78 | Z 79 | -------------------------------------------------------------------------------- /src/mosestokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Wrappers for several pre-processing scripts from the Moses toolkit. 
3 | 4 | Copyright © 2016-2017, Luís Gomes 5 | 6 | This package provides wrappers for the following Perl scripts: 7 | 8 | ``tokenizer.perl`` 9 | class `mosestokenizer.tokenizer.MosesTokenizer` 10 | 11 | ``detokenizer.perl`` 12 | class `mosestokenizer.detokenizer.MosesDetokenizer` 13 | 14 | ``split-sentences.perl`` 15 | class `mosestokenizer.sentsplitter.MosesSentenceSplitter` 16 | 17 | ``normalize-punctuation.perl`` 18 | class `mosestokenizer.punctnormalizer.MosesPunctuationNormalizer` 19 | 20 | """ 21 | 22 | from mosestokenizer.tokenizer import MosesTokenizer 23 | from mosestokenizer.detokenizer import MosesDetokenizer 24 | from mosestokenizer.sentsplitter import MosesSentenceSplitter 25 | from mosestokenizer.punctnormalizer import MosesPunctuationNormalizer 26 | 27 | __version__ = "1.2.1" 28 | 29 | __all__ = [ 30 | "MosesTokenizer", 31 | "MosesDetokenizer", 32 | "MosesSentenceSplitter", 33 | "MosesPunctuationNormalizer", 34 | ] 35 | -------------------------------------------------------------------------------- /src/mosestokenizer/nonbreaking_prefixes/nonbreaking_prefix.es: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | # Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm 34 | 35 | A.C 36 | Apdo 37 | Av 38 | Bco 39 | CC.AA 40 | Da 41 | Dep 42 | Dn 43 | Dr 44 | Dra 45 | EE.UU 46 | Excmo 47 | FF.CC 48 | Fil 49 | Gral 50 | J.C 51 | Let 52 | Lic 53 | N.B 54 | P.D 55 | P.V.P 56 | Prof 57 | Pts 58 | Rte 59 | S.A 60 | S.A.R 61 | S.E 62 | S.L 63 | S.R.C 64 | Sr 65 | Sra 66 | Srta 67 | Sta 68 | Sto 69 | T.V.E 70 | Tel 71 | Ud 72 | Uds 73 | V.B 74 | V.E 75 | Vd 76 | Vds 77 | a/c 78 | adj 79 | admón 80 | afmo 81 | apdo 82 | av 83 | c 84 | c.f 85 | c.g 86 | cap 87 | cm 88 | cta 89 | dcha 90 | doc 91 | ej 92 | entlo 93 | esq 94 | etc 95 | f.c 96 | gr 97 | grs 98 | izq 99 | kg 100 | km 101 | mg 102 | mm 103 | núm 104 | núm 105 | p 106 | p.a 107 | p.ej 108 | ptas 109 | pág 110 | págs 111 | pág 112 | págs 113 | q.e.g.e 114 | q.e.s.m 115 | s 116 | s.s.s 117 | vid 118 | vol 119 | -------------------------------------------------------------------------------- /src/mosestokenizer/nonbreaking_prefixes/nonbreaking_prefix.lv: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | Ā 8 | B 9 | C 10 | Č 11 | D 12 | E 13 | Ē 14 | F 15 | G 16 | Ģ 17 | H 18 | I 19 | Ī 20 | J 21 | K 22 | Ķ 23 | L 24 | Ļ 25 | M 26 | N 27 | Ņ 28 | O 29 | P 30 | Q 31 | R 32 | S 33 | Š 34 | T 35 | U 36 | Ū 37 | V 38 | W 39 | X 40 | Y 41 | Z 42 | Ž 43 | 44 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 45 | dr 46 | Dr 47 | med 48 | prof 49 | Prof 50 | inž 51 | Inž 52 | ist.loc 53 | Ist.loc 54 | kor.loc 55 | Kor.loc 56 | v.i 57 | vietn 58 | Vietn 59 | 60 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 61 | a.l 62 | t.p 63 | pārb 64 | Pārb 65 | vec 66 | Vec 67 | inv 68 | Inv 69 | sk 70 | Sk 71 | spec 72 | Spec 73 | vienk 74 | Vienk 75 | virz 76 | Virz 77 | māksl 78 | Māksl 79 | mūz 80 | Mūz 81 | akad 82 | Akad 83 | soc 84 | Soc 85 | galv 86 | Galv 87 | vad 88 | Vad 89 | sertif 90 | Sertif 91 | folkl 92 | Folkl 93 | hum 94 | Hum 95 | 96 | #Numbers only. These should only induce breaks when followed by a numeric sequence 97 | # add NUMERIC_ONLY after the word for this function 98 | #This case is mostly for the english "No." which can either be a sentence of its own, or 99 | #if followed by a number, a non-breaking prefix 100 | Nr #NUMERIC_ONLY# 101 | -------------------------------------------------------------------------------- /src/mosestokenizer/nonbreaking_prefixes/nonbreaking_prefix.fr: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | # 4 | #any single upper case letter followed by a period is not a sentence ender 5 | #usually upper case letters are initials in a name 6 | #no French words end in single lower-case letters, so we throw those in too? 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | # Period-final abbreviation list for French 61 | A.C.N 62 | A.M 63 | art 64 | ann 65 | apr 66 | av 67 | auj 68 | lib 69 | B.P 70 | boul 71 | ca 72 | c.-à-d 73 | cf 74 | ch.-l 75 | chap 76 | contr 77 | C.P.I 78 | C.Q.F.D 79 | C.N 80 | C.N.S 81 | C.S 82 | dir 83 | éd 84 | e.g 85 | env 86 | al 87 | etc 88 | E.V 89 | ex 90 | fasc 91 | fém 92 | fig 93 | fr 94 | hab 95 | ibid 96 | id 97 | i.e 98 | inf 99 | LL.AA 100 | LL.AA.II 101 | LL.AA.RR 102 | LL.AA.SS 103 | L.D 104 | LL.EE 105 | LL.MM 106 | LL.MM.II.RR 107 | loc.cit 108 | masc 109 | MM 110 | ms 111 | N.B 112 | N.D.A 113 | N.D.L.R 114 | N.D.T 115 | n/réf 116 | NN.SS 117 | N.S 118 | N.D 119 | N.P.A.I 120 | p.c.c 121 | pl 122 | pp 123 | p.ex 124 | p.j 125 | P.S 126 | R.A.S 127 | R.-V 128 | R.P 129 | R.I.P 130 | SS 131 | S.S 132 | S.A 133 | S.A.I 134 | S.A.R 135 | S.A.S 136 | S.E 137 | sec 138 | sect 139 | sing 140 | S.M 141 | S.M.I.R 142 | sq 143 | sqq 144 | suiv 145 | sup 146 | suppl 147 | tél 148 | T.S.V.P 149 | vb 150 | vol 151 | vs 152 | X.O 153 | Z.I 154 | -------------------------------------------------------------------------------- /src/mosestokenizer/nonbreaking_prefixes/nonbreaking_prefix.en: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 
2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 34 | Adj 35 | Adm 36 | Adv 37 | Asst 38 | Bart 39 | Bldg 40 | Brig 41 | Bros 42 | Capt 43 | Cmdr 44 | Col 45 | Comdr 46 | Con 47 | Corp 48 | Cpl 49 | DR 50 | Dr 51 | Drs 52 | Ens 53 | Gen 54 | Gov 55 | Hon 56 | Hr 57 | Hosp 58 | Inc 59 | Insp 60 | Lt 61 | MM 62 | MR 63 | MRS 64 | MS 65 | Maj 66 | Messrs 67 | Mlle 68 | Mme 69 | Mr 70 | Mrs 71 | Ms 72 | Msgr 73 | Op 74 | Ord 75 | Pfc 76 | Ph 77 | Prof 78 | Pvt 79 | Rep 80 | Reps 81 | Res 82 | Rev 83 | Rt 84 | Sen 85 | Sens 86 | Sfc 87 | Sgt 88 | Sr 89 | St 90 | Supt 91 | Surg 92 | 93 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 94 | v 95 | vs 96 | i.e 97 | rev 98 | e.g 99 | 100 | #Numbers only. These should only induce breaks when followed by a numeric sequence 101 | # add NUMERIC_ONLY after the word for this function 102 | #This case is mostly for the english "No." 
which can either be a sentence of its own, or 103 | #if followed by a number, a non-breaking prefix 104 | No #NUMERIC_ONLY# 105 | Nos 106 | Art #NUMERIC_ONLY# 107 | Nr 108 | pp #NUMERIC_ONLY# 109 | 110 | #month abbreviations 111 | Jan 112 | Feb 113 | Mar 114 | Apr 115 | #May is a full word 116 | Jun 117 | Jul 118 | Aug 119 | Sep 120 | Oct 121 | Nov 122 | Dec 123 | -------------------------------------------------------------------------------- /src/mosestokenizer/nonbreaking_prefixes/nonbreaking_prefix.fi: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT 2 | #indicate an end-of-sentence marker. Special cases are included for prefixes 3 | #that ONLY appear before 0-9 numbers. 4 | 5 | #This list is compiled from omorfi database 6 | #by Tommi A Pirinen. 7 | 8 | 9 | #any single upper case letter followed by a period is not a sentence ender 10 | A 11 | B 12 | C 13 | D 14 | E 15 | F 16 | G 17 | H 18 | I 19 | J 20 | K 21 | L 22 | M 23 | N 24 | O 25 | P 26 | Q 27 | R 28 | S 29 | T 30 | U 31 | V 32 | W 33 | X 34 | Y 35 | Z 36 | Å 37 | Ä 38 | Ö 39 | 40 | #List of titles. 
These are often followed by upper-case names, but do not indicate sentence breaks 41 | alik 42 | alil 43 | amir 44 | apul 45 | apul.prof 46 | arkkit 47 | ass 48 | assist 49 | dipl 50 | dipl.arkkit 51 | dipl.ekon 52 | dipl.ins 53 | dipl.kielenk 54 | dipl.kirjeenv 55 | dipl.kosm 56 | dipl.urk 57 | dos 58 | erikoiseläinl 59 | erikoishammasl 60 | erikoisl 61 | erikoist 62 | ev.luutn 63 | evp 64 | fil 65 | ft 66 | hallinton 67 | hallintot 68 | hammaslääket 69 | jatk 70 | jääk 71 | kansaned 72 | kapt 73 | kapt.luutn 74 | kenr 75 | kenr.luutn 76 | kenr.maj 77 | kers 78 | kirjeenv 79 | kom 80 | kom.kapt 81 | komm 82 | konst 83 | korpr 84 | luutn 85 | maist 86 | maj 87 | Mr 88 | Mrs 89 | Ms 90 | M.Sc 91 | neuv 92 | nimim 93 | Ph.D 94 | prof 95 | puh.joht 96 | pääll 97 | res 98 | san 99 | siht 100 | suom 101 | sähköp 102 | säv 103 | toht 104 | toim 105 | toim.apul 106 | toim.joht 107 | toim.siht 108 | tuom 109 | ups 110 | vänr 111 | vääp 112 | ye.ups 113 | ylik 114 | ylil 115 | ylim 116 | ylimatr 117 | yliop 118 | yliopp 119 | ylip 120 | yliv 121 | 122 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall 123 | #into this category - it sometimes ends a sentence) 124 | e.g 125 | ent 126 | esim 127 | huom 128 | i.e 129 | ilm 130 | l 131 | mm 132 | myöh 133 | nk 134 | nyk 135 | par 136 | po 137 | t 138 | v 139 | -------------------------------------------------------------------------------- /src/mosestokenizer/nonbreaking_prefixes/nonbreaking_prefix.hu: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | Á 33 | É 34 | Í 35 | Ó 36 | Ö 37 | Ő 38 | Ú 39 | Ü 40 | Ű 41 | 42 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 43 | Dr 44 | dr 45 | kb 46 | Kb 47 | vö 48 | Vö 49 | pl 50 | Pl 51 | ca 52 | Ca 53 | min 54 | Min 55 | max 56 | Max 57 | ún 58 | Ún 59 | prof 60 | Prof 61 | de 62 | De 63 | du 64 | Du 65 | Szt 66 | St 67 | 68 | #Numbers only. These should only induce breaks when followed by a numeric sequence 69 | # add NUMERIC_ONLY after the word for this function 70 | #This case is mostly for the english "No." which can either be a sentence of its own, or 71 | #if followed by a number, a non-breaking prefix 72 | 73 | # Month name abbreviations 74 | jan #NUMERIC_ONLY# 75 | Jan #NUMERIC_ONLY# 76 | Feb #NUMERIC_ONLY# 77 | feb #NUMERIC_ONLY# 78 | márc #NUMERIC_ONLY# 79 | Márc #NUMERIC_ONLY# 80 | ápr #NUMERIC_ONLY# 81 | Ápr #NUMERIC_ONLY# 82 | máj #NUMERIC_ONLY# 83 | Máj #NUMERIC_ONLY# 84 | jún #NUMERIC_ONLY# 85 | Jún #NUMERIC_ONLY# 86 | Júl #NUMERIC_ONLY# 87 | júl #NUMERIC_ONLY# 88 | aug #NUMERIC_ONLY# 89 | Aug #NUMERIC_ONLY# 90 | Szept #NUMERIC_ONLY# 91 | szept #NUMERIC_ONLY# 92 | okt #NUMERIC_ONLY# 93 | Okt #NUMERIC_ONLY# 94 | nov #NUMERIC_ONLY# 95 | Nov #NUMERIC_ONLY# 96 | dec #NUMERIC_ONLY# 97 | Dec #NUMERIC_ONLY# 98 | 99 | # Other abbreviations 100 | tel #NUMERIC_ONLY# 101 | Tel #NUMERIC_ONLY# 102 | Fax #NUMERIC_ONLY# 103 | fax #NUMERIC_ONLY# 104 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, 
find_packages 2 | from os import path 3 | import re 4 | 5 | 6 | def packagefile(*relpath): 7 | return path.join(path.dirname(__file__), *relpath) 8 | 9 | 10 | def read(*relpath): 11 | with open(packagefile(*relpath)) as f: 12 | return f.read() 13 | 14 | 15 | def get_version(*relpath): 16 | match = re.search( 17 | r'''^__version__ = ['"]([^'"]*)['"]''', 18 | read(*relpath), 19 | re.M 20 | ) 21 | if not match: 22 | raise RuntimeError('Unable to find version string.') 23 | return match.group(1) 24 | 25 | 26 | setup( 27 | name='mosestokenizer', 28 | version=get_version('src', 'mosestokenizer', '__init__.py'), 29 | description='Wrappers for several pre-processing scripts from the Moses' 30 | ' toolkit.', 31 | long_description=read('README.rst'), 32 | url='https://github.com/luismsgomes/mosestokenizer', 33 | author='Luís Gomes', 34 | author_email='luismsgomes@gmail.com', 35 | license='LGPLv2', 36 | # See https://pypi.python.org/pypi?%3Aaction=list_classifiers 37 | classifiers=[ 38 | 'Development Status :: 5 - Production/Stable', 39 | 'Intended Audience :: Developers', 40 | 'Topic :: Text Processing :: Linguistic', 41 | 'License :: OSI Approved :: GNU Lesser General Public License v2' 42 | ' or later (LGPLv2+)', 43 | 'Programming Language :: Python :: 3.5', 44 | ], 45 | keywords='text tokenization pre-processing', 46 | install_requires=[ 47 | "docopt", 48 | "openfile", 49 | "uctools", 50 | "toolwrapper", 51 | ], 52 | packages=find_packages('src'), 53 | package_dir={'': 'src'}, 54 | package_data={ 55 | 'mosestokenizer': [ 56 | '*.perl', 57 | 'nonbreaking_prefixes/*.*' 58 | ], 59 | }, 60 | entry_points={ 61 | 'console_scripts': [ 62 | 'moses-tokenizer=mosestokenizer.tokenizer:main', 63 | 'moses-detokenizer=mosestokenizer.detokenizer:main', 64 | 'moses-punct-normalizer=mosestokenizer.punctnormalizer:main', 65 | 'moses-sent-splitter=mosestokenizer.sentsplitter:main' 66 | ], 67 | }, 68 | ) 69 | -------------------------------------------------------------------------------- 
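The nonbreaking_prefix.* files in this package all share one format, described in their header comments: a bare prefix suppresses a sentence break after a following period, and a prefix tagged #NUMERIC_ONLY# suppresses the break only when the next token starts with a digit (the "No. 7" case). The sketch below illustrates that logic only; it is not the actual Moses implementation, which lives in the Perl scripts:

```python
# Minimal sketch of the nonbreaking-prefix file format (illustration only,
# not the Moses tokenizer's real code).

def load_prefixes(lines):
    """Parse nonbreaking-prefix lines into {prefix: kind}, where kind is
    "always" or "numeric_only" (for entries tagged #NUMERIC_ONLY#)."""
    prefixes = {}
    for line in lines:
        line = line.strip()
        if not line or line.startswith("#"):
            continue  # skip blanks and comment lines
        if "#NUMERIC_ONLY#" in line:
            prefixes[line.split("#")[0].strip()] = "numeric_only"
        else:
            prefixes[line] = "always"
    return prefixes


def is_sentence_break(word, next_word, prefixes):
    """Decide whether `word` (possibly ending in a period) ends a sentence."""
    if not word.endswith("."):
        return False
    kind = prefixes.get(word[:-1])
    if kind == "always":
        return False  # e.g. "Dr." never ends a sentence
    if kind == "numeric_only" and next_word[:1].isdigit():
        return False  # e.g. "No. 7" does not break
    return True


prefixes = load_prefixes(["#comment", "Dr", "No #NUMERIC_ONLY#"])
print(is_sentence_break("Dr.", "Smith", prefixes))  # False
print(is_sentence_break("No.", "7", prefixes))      # False
print(is_sentence_break("No.", "He", prefixes))     # True
```

This is why the language-specific files matter: the same surface form ("No.") is a sentence end in one context and an abbreviation in another, and only the prefix list plus the following token can disambiguate.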
/src/mosestokenizer/nonbreaking_prefixes/nonbreaking_prefix.nl: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | #Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen 4 | # http://nl.wikipedia.org/wiki/Aanspreekvorm 5 | # http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs 6 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 7 | #usually upper case letters are initials in a name 8 | A 9 | B 10 | C 11 | D 12 | E 13 | F 14 | G 15 | H 16 | I 17 | J 18 | K 19 | L 20 | M 21 | N 22 | O 23 | P 24 | Q 25 | R 26 | S 27 | T 28 | U 29 | V 30 | W 31 | X 32 | Y 33 | Z 34 | 35 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 36 | bacc 37 | bc 38 | bgen 39 | c.i 40 | dhr 41 | dr 42 | dr.h.c 43 | drs 44 | drs 45 | ds 46 | eint 47 | fa 48 | Fa 49 | fam 50 | gen 51 | genm 52 | ing 53 | ir 54 | jhr 55 | jkvr 56 | jr 57 | kand 58 | kol 59 | lgen 60 | lkol 61 | Lt 62 | maj 63 | Mej 64 | mevr 65 | Mme 66 | mr 67 | mr 68 | Mw 69 | o.b.s 70 | plv 71 | prof 72 | ritm 73 | tint 74 | Vz 75 | Z.D 76 | Z.D.H 77 | Z.E 78 | Z.Em 79 | Z.H 80 | Z.K.H 81 | Z.K.M 82 | Z.M 83 | z.v 84 | 85 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 86 | #we seem to have a lot of these in dutch i.e.: i.p.v - in plaats van (in stead of) never ends a sentence 87 | a.g.v 88 | bijv 89 | bijz 90 | bv 91 | d.w.z 92 | e.c 93 | e.g 94 | e.k 95 | ev 96 | i.p.v 97 | i.s.m 98 | i.t.t 99 | i.v.m 100 | m.a.w 101 | m.b.t 102 | m.b.v 103 | m.h.o 104 | m.i 105 | m.i.v 106 | v.w.t 107 | 108 | #Numbers only. 
These should only induce breaks when followed by a numeric sequence 109 | # add NUMERIC_ONLY after the word for this function 110 | #This case is mostly for the english "No." which can either be a sentence of its own, or 111 | #if followed by a number, a non-breaking prefix 112 | Nr #NUMERIC_ONLY# 113 | Nrs 114 | nrs 115 | nr #NUMERIC_ONLY# 116 | -------------------------------------------------------------------------------- /src/mosestokenizer/normalize-punctuation.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 5 | 6 | use warnings; 7 | use strict; 8 | 9 | my $language = "en"; 10 | my $PENN = 0; 11 | 12 | while (@ARGV) { 13 | $_ = shift; 14 | /^-b$/ && ($| = 1, next); # not buffered (flush each line) 15 | /^-l$/ && ($language = shift, next); 16 | /^[^\-]/ && ($language = $_, next); 17 | /^-penn$/ && ($PENN = 1, next); 18 | } 19 | 20 | while(<STDIN>) { 21 | s/\r//g; 22 | # remove extra spaces 23 | s/\(/ \(/g; 24 | s/\)/\) /g; s/ +/ /g; 25 | s/\) ([\.\!\:\?\;\,])/\)$1/g; 26 | s/\( /\(/g; 27 | s/ \)/\)/g; 28 | s/(\d) \%/$1\%/g; 29 | s/ :/:/g; 30 | s/ ;/;/g; 31 | # normalize unicode punctuation 32 | if ($PENN == 0) { 33 | s/\`/\'/g; 34 | s/\'\'/ \" /g; 35 | } 36 | 37 | s/„/\"/g; 38 | s/“/\"/g; 39 | s/”/\"/g; 40 | s/–/-/g; 41 | s/—/ - /g; s/ +/ /g; 42 | s/´/\'/g; 43 | s/([a-z])‘([a-z])/$1\'$2/gi; 44 | s/([a-z])’([a-z])/$1\'$2/gi; 45 | s/‘/\"/g; 46 | s/‚/\"/g; 47 | s/’/\"/g; 48 | s/''/\"/g; 49 | s/´´/\"/g; 50 | s/…/.../g; 51 | # French quotes 52 | s/ « / \"/g; 53 | s/« /\"/g; 54 | s/«/\"/g; 55 | s/ » /\" /g; 56 | s/ »/\"/g; 57 | s/»/\"/g; 58 | # handle pseudo-spaces 59 | s/ \%/\%/g; 60 | s/nº /nº /g; 61 | s/ :/:/g; 62 | s/ ºC/ ºC/g; 63 | s/ cm/ cm/g; 64 | s/ \?/\?/g; 65 | s/ \!/\!/g; 66 | s/ ;/;/g; 67 | s/, /, /g; s/ +/ /g; 68 | 69 | # English 
"quotation," followed by comma, style 70 | if ($language eq "en") { 71 | s/\"([,\.]+)/$1\"/g; 72 | } 73 | # Czech is confused 74 | elsif ($language eq "cs" || $language eq "cz") { 75 | } 76 | # German/Spanish/French "quotation", followed by comma, style 77 | else { 78 | s/,\"/\",/g; 79 | s/(\.+)\"(\s*[^<])/\"$1$2/g; # don't fix period at end of sentence 80 | } 81 | 82 | 83 | if ($language eq "de" || $language eq "es" || $language eq "cz" || $language eq "cs" || $language eq "fr") { 84 | s/(\d) (\d)/$1,$2/g; 85 | } 86 | else { 87 | s/(\d) (\d)/$1.$2/g; 88 | } 89 | print $_; 90 | } 91 | -------------------------------------------------------------------------------- /src/mosestokenizer/nonbreaking_prefixes/nonbreaking_prefix.is: -------------------------------------------------------------------------------- 1 | no #NUMERIC_ONLY# 2 | No #NUMERIC_ONLY# 3 | nr #NUMERIC_ONLY# 4 | Nr #NUMERIC_ONLY# 5 | nR #NUMERIC_ONLY# 6 | NR #NUMERIC_ONLY# 7 | a 8 | b 9 | c 10 | d 11 | e 12 | f 13 | g 14 | h 15 | i 16 | j 17 | k 18 | l 19 | m 20 | n 21 | o 22 | p 23 | q 24 | r 25 | s 26 | t 27 | u 28 | v 29 | w 30 | x 31 | y 32 | z 33 | ^ 34 | í 35 | á 36 | ó 37 | æ 38 | A 39 | B 40 | C 41 | D 42 | E 43 | F 44 | G 45 | H 46 | I 47 | J 48 | K 49 | L 50 | M 51 | N 52 | O 53 | P 54 | Q 55 | R 56 | S 57 | T 58 | U 59 | V 60 | W 61 | X 62 | Y 63 | Z 64 | ab.fn 65 | a.fn 66 | afs 67 | al 68 | alm 69 | alg 70 | andh 71 | ath 72 | aths 73 | atr 74 | ao 75 | au 76 | aukaf 77 | áfn 78 | áhrl.s 79 | áhrs 80 | ákv.gr 81 | ákv 82 | bh 83 | bls 84 | dr 85 | e.Kr 86 | et 87 | ef 88 | efn 89 | ennfr 90 | eink 91 | end 92 | e.st 93 | erl 94 | fél 95 | fskj 96 | fh 97 | f.hl 98 | físl 99 | fl 100 | fn 101 | fo 102 | forl 103 | frb 104 | frl 105 | frh 106 | frt 107 | fsl 108 | fsh 109 | fs 110 | fsk 111 | fst 112 | f.Kr 113 | ft 114 | fv 115 | fyrrn 116 | fyrrv 117 | germ 118 | gm 119 | gr 120 | hdl 121 | hdr 122 | hf 123 | hl 124 | hlsk 125 | hljsk 126 | hljv 127 | hljóðv 128 | hr 129 | hv 130 | hvk 131 
| holl 132 | Hos 133 | höf 134 | hk 135 | hrl 136 | ísl 137 | kaf 138 | kap 139 | Khöfn 140 | kk 141 | kg 142 | kk 143 | km 144 | kl 145 | klst 146 | kr 147 | kt 148 | kgúrsk 149 | kvk 150 | leturbr 151 | lh 152 | lh.nt 153 | lh.þt 154 | lo 155 | ltr 156 | mlja 157 | mljó 158 | millj 159 | mm 160 | mms 161 | m.fl 162 | miðm 163 | mgr 164 | mst 165 | mín 166 | nf 167 | nh 168 | nhm 169 | nl 170 | nk 171 | nmgr 172 | no 173 | núv 174 | nt 175 | o.áfr 176 | o.m.fl 177 | ohf 178 | o.fl 179 | o.s.frv 180 | ófn 181 | ób 182 | óákv.gr 183 | óákv 184 | pfn 185 | PR 186 | pr 187 | Ritstj 188 | Rvík 189 | Rvk 190 | samb 191 | samhlj 192 | samn 193 | samn 194 | sbr 195 | sek 196 | sérn 197 | sf 198 | sfn 199 | sh 200 | sfn 201 | sh 202 | s.hl 203 | sk 204 | skv 205 | sl 206 | sn 207 | so 208 | ss.us 209 | s.st 210 | samþ 211 | sbr 212 | shlj 213 | sign 214 | skál 215 | st 216 | st.s 217 | stk 218 | sþ 219 | teg 220 | tbl 221 | tfn 222 | tl 223 | tvíhlj 224 | tvt 225 | till 226 | to 227 | umr 228 | uh 229 | us 230 | uppl 231 | útg 232 | vb 233 | Vf 234 | vh 235 | vkf 236 | Vl 237 | vl 238 | vlf 239 | vmf 240 | 8vo 241 | vsk 242 | vth 243 | þt 244 | þf 245 | þjs 246 | þgf 247 | þlt 248 | þolm 249 | þm 250 | þml 251 | þýð 252 | -------------------------------------------------------------------------------- /src/mosestokenizer/nonbreaking_prefixes/nonbreaking_prefix.it: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 34 | Adj 35 | Adm 36 | Adv 37 | Amn 38 | Arch 39 | Asst 40 | Avv 41 | Bart 42 | Bcc 43 | Bldg 44 | Brig 45 | Bros 46 | C.A.P 47 | C.P 48 | Capt 49 | Cc 50 | Cmdr 51 | Co 52 | Col 53 | Comdr 54 | Con 55 | Corp 56 | Cpl 57 | DR 58 | Dott 59 | Dr 60 | Drs 61 | Egr 62 | Ens 63 | Gen 64 | Geom 65 | Gov 66 | Hon 67 | Hosp 68 | Hr 69 | Id 70 | Ing 71 | Insp 72 | Lt 73 | MM 74 | MR 75 | MRS 76 | MS 77 | Maj 78 | Messrs 79 | Mlle 80 | Mme 81 | Mo 82 | Mons 83 | Mr 84 | Mrs 85 | Ms 86 | Msgr 87 | N.B 88 | Op 89 | Ord 90 | P.S 91 | P.T 92 | Pfc 93 | Ph 94 | Prof 95 | Pvt 96 | RP 97 | RSVP 98 | Rag 99 | Rep 100 | Reps 101 | Res 102 | Rev 103 | Rif 104 | Rt 105 | S.A 106 | S.B.F 107 | S.P.M 108 | S.p.A 109 | S.r.l 110 | Sen 111 | Sens 112 | Sfc 113 | Sgt 114 | Sig 115 | Sigg 116 | Soc 117 | Spett 118 | Sr 119 | St 120 | Supt 121 | Surg 122 | V.P 123 | 124 | # other 125 | a.c 126 | acc 127 | all 128 | banc 129 | c.a 130 | c.c.p 131 | c.m 132 | c.p 133 | c.s 134 | c.v 135 | corr 136 | dott 137 | e.p.c 138 | ecc 139 | es 140 | fatt 141 | gg 142 | int 143 | lett 144 | ogg 145 | on 146 | p.c 147 | p.c.c 148 | p.es 149 | p.f 150 | p.r 151 | p.v 152 | post 153 | pp 154 | racc 155 | ric 156 | s.n.c 157 | seg 158 | sgg 159 | ss 160 | tel 161 | u.s 162 | v.r 163 | v.s 164 | 165 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 166 | v 167 | vs 168 | i.e 169 | rev 170 | e.g 171 | 172 | #Numbers only. 
These should only induce breaks when followed by a numeric sequence 173 | # add NUMERIC_ONLY after the word for this function 174 | #This case is mostly for the english "No." which can either be a sentence of its own, or 175 | #if followed by a number, a non-breaking prefix 176 | No #NUMERIC_ONLY# 177 | Nos 178 | Art #NUMERIC_ONLY# 179 | Nr 180 | pp #NUMERIC_ONLY# 181 | -------------------------------------------------------------------------------- /src/mosestokenizer/detokenizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | A module for interfacing with ``detokenizer.perl`` from Moses. 3 | 4 | Copyright © 2017, Luís Gomes 5 | """ 6 | 7 | usage = """ 8 | Usage: 9 | moses-detokenizer [options] <lang> [<inputfile> [<outputfile>]] 10 | moses-detokenizer --selftest [--verbose] 11 | 12 | Options: 13 | --selftest, -t Run selftests. 14 | --verbose, -v Be more verbose. 15 | 16 | 2017, Luís Gomes 17 | """ 18 | 19 | 20 | from docopt import docopt 21 | from openfile import openfile 22 | from os import path 23 | from toolwrapper import ToolWrapper 24 | import sys 25 | 26 | 27 | class MosesDetokenizer(ToolWrapper): 28 | """A module for interfacing with ``detokenizer.perl`` from Moses. 29 | 30 | This class communicates with detokenizer.perl process via pipes. When the 31 | MosesDetokenizer object is no longer needed, the close() method should be 32 | called to free system resources. The class supports the context manager 33 | interface. If used in a with statement, the close() method is invoked 34 | automatically. 35 | 36 | >>> detokenize = MosesDetokenizer('en') 37 | >>> detokenize(['Hello', 'World', '!']) 38 | 'Hello World!' 
39 | """ 40 | 41 | def __init__(self, lang="en"): 42 | self.lang = lang 43 | program = path.join(path.dirname(__file__), "detokenizer.perl") 44 | # -q = quiet 45 | # -b = disable output buffering 46 | argv = ["perl", program, "-q", "-b", "-l", self.lang] 47 | super().__init__(argv) 48 | 49 | def __str__(self): 50 | return "MosesDetokenizer(lang=\"{lang}\")".format(lang=self.lang) 51 | 52 | def __call__(self, sentence): 53 | """Detokenizes a single sentence. 54 | 55 | Newline characters are not allowed in tokens. 56 | """ 57 | assert isinstance(sentence, (list, tuple)) 58 | assert all(isinstance(token, str) for token in sentence) 59 | assert all("\n" not in token for token in sentence) 60 | if not sentence: 61 | return "" 62 | self.writeline(" ".join(sentence)) 63 | return self.readline() 64 | 65 | 66 | def main(): 67 | args = docopt(usage) 68 | if args["--selftest"]: 69 | import doctest 70 | import mosestokenizer.detokenizer 71 | doctest.testmod(mosestokenizer.detokenizer) 72 | if not args["<lang>"]: 73 | sys.exit(0) 74 | detokenize = MosesDetokenizer(args["<lang>"]) 75 | inputfile = openfile(args["<inputfile>"]) 76 | outputfile = openfile(args["<outputfile>"], "wt") 77 | with inputfile, outputfile: 78 | for line in inputfile: 79 | print(detokenize(line.split()), file=outputfile) 80 | 81 | if __name__ == "__main__": 82 | main() 83 | -------------------------------------------------------------------------------- /src/mosestokenizer/punctnormalizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | A module for interfacing with ``normalize-punctuation.perl`` from Moses. 3 | 4 | Copyright © 2016-2017, Luís Gomes 5 | """ 6 | 7 | usage = """ 8 | Usage: 9 | moses-punct-normalizer [options] <lang> [<inputfile> [<outputfile>]] 10 | moses-punct-normalizer --selftest [--verbose] 11 | 12 | Options: 13 | --selftest, -t Run selftests. 14 | --verbose, -v Be more verbose. 
15 | 16 | 2016, Luís Gomes 17 | """ 18 | 19 | 20 | from docopt import docopt 21 | from os import path 22 | from toolwrapper import ToolWrapper 23 | import sys 24 | 25 | 26 | class MosesPunctuationNormalizer(ToolWrapper): 27 | """A module for interfacing with ``normalize-punctuation.perl`` from Moses. 28 | 29 | This class communicates with normalize-punctuation.perl process via pipes. 30 | When the MosesPunctuationNormalizer object is no longer needed, the close() 31 | method should be called to free system resources. The class supports the 32 | context manager interface. If used in a with statement, the close() method 33 | is invoked automatically. 34 | 35 | >>> normalize = MosesPunctuationNormalizer("en") 36 | >>> normalize("«Hello World» — she said…") 37 | '"Hello World" - she said...' 38 | """ 39 | 40 | def __init__(self, lang="en"): 41 | self.lang = lang 42 | program = path.join( 43 | path.dirname(__file__), 44 | "normalize-punctuation.perl" 45 | ) 46 | argv = ["perl", program, "-b", "-l", self.lang] 47 | super().__init__(argv) 48 | 49 | def __str__(self): 50 | return "MosesPunctuationNormalizer(lang=\"{lang}\")".format( 51 | lang=self.lang 52 | ) 53 | 54 | def __call__(self, line): 55 | """Normalizes punctuation of a single line of text. 56 | 57 | Newline characters are not allowed in the text to be normalized. 
58 | """ 59 | assert isinstance(line, str) 60 | line = line.strip() 61 | assert "\n" not in line 62 | if not line: 63 | return "" 64 | self.writeline(line) 65 | return self.readline() 66 | 67 | 68 | def main(): 69 | args = docopt(usage) 70 | if args["--selftest"]: 71 | import doctest 72 | import mosestokenizer.punctnormalizer 73 | doctest.testmod(mosestokenizer.punctnormalizer) 74 | if not args["<lang>"]: 75 | sys.exit(0) 76 | normalize = MosesPunctuationNormalizer(args["<lang>"]) 77 | inputfile = open(args["<inputfile>"]) if args["<inputfile>"] else sys.stdin 78 | outputfile = open(args["<outputfile>"], "wt") if args["<outputfile>"] else sys.stdout 79 | with inputfile, outputfile: 80 | for line in inputfile: 81 | print(normalize(line), file=outputfile) 82 | 83 | if __name__ == "__main__": 84 | main() 85 | -------------------------------------------------------------------------------- /src/mosestokenizer/nonbreaking_prefixes/nonbreaking_prefix.pl: -------------------------------------------------------------------------------- 1 | adw 2 | afr 3 | akad 4 | al 5 | Al 6 | am 7 | amer 8 | arch 9 | art 10 | Art 11 | artyst 12 | astr 13 | austr 14 | bałt 15 | bdb 16 | bł 17 | bm 18 | br 19 | bryg 20 | bryt 21 | centr 22 | ces 23 | chem 24 | chiń 25 | chir 26 | c.k 27 | c.o 28 | cyg 29 | cyw 30 | cyt 31 | czes 32 | czw 33 | cd 34 | Cd 35 | czyt 36 | ćw 37 | ćwicz 38 | daw 39 | dcn 40 | dekl 41 | demokr 42 | det 43 | diec 44 | dł 45 | dn 46 | dot 47 | dol 48 | dop 49 | dost 50 | dosł 51 | h.c 52 | ds 53 | dst 54 | duszp 55 | dypl 56 | egz 57 | ekol 58 | ekon 59 | elektr 60 | em 61 | ew 62 | fab 63 | farm 64 | fot 65 | fr 66 | gat 67 | gastr 68 | geogr 69 | geol 70 | gimn 71 | głęb 72 | gm 73 | godz 74 | górn 75 | gosp 76 | gr 77 | gram 78 | hist 79 | hiszp 80 | hr 81 | Hr 82 | hot 83 | id 84 | in 85 | im 86 | iron 87 | jn 88 | kard 89 | kat 90 | katol 91 | k.k 92 | kk 93 | kol 94 | kl 95 | k.p.a 96 | kpc 97 | k.p.c 98 | kpt 99 | kr 100 | k.r 101 | krak 102 | k.r.o 103 | kryt 104 | kult 105 | laic 106 | łac 107 | niem 108 | woj
109 | nb 110 | np 111 | Nb 112 | Np 113 | pol 114 | pow 115 | m.in 116 | pt 117 | ps 118 | Pt 119 | Ps 120 | cdn 121 | jw 122 | ryc 123 | rys 124 | Ryc 125 | Rys 126 | tj 127 | tzw 128 | Tzw 129 | tzn 130 | zob 131 | ang 132 | ub 133 | ul 134 | pw 135 | pn 136 | pl 137 | al 138 | k 139 | n 140 | nr #NUMERIC_ONLY# 141 | Nr #NUMERIC_ONLY# 142 | ww 143 | wł 144 | ur 145 | zm 146 | żyd 147 | żarg 148 | żyw 149 | wył 150 | bp 151 | bp 152 | wyst 153 | tow 154 | Tow 155 | o 156 | sp 157 | Sp 158 | st 159 | spółdz 160 | Spółdz 161 | społ 162 | spółgł 163 | stoł 164 | stow 165 | Stoł 166 | Stow 167 | zn 168 | zew 169 | zewn 170 | zdr 171 | zazw 172 | zast 173 | zaw 174 | zał 175 | zal 176 | zam 177 | zak 178 | zakł 179 | zagr 180 | zach 181 | adw 182 | Adw 183 | lek 184 | Lek 185 | med 186 | mec 187 | Mec 188 | doc 189 | Doc 190 | dyw 191 | dyr 192 | Dyw 193 | Dyr 194 | inż 195 | Inż 196 | mgr 197 | Mgr 198 | dh 199 | dr 200 | Dh 201 | Dr 202 | p 203 | P 204 | red 205 | Red 206 | prof 207 | prok 208 | Prof 209 | Prok 210 | hab 211 | płk 212 | Płk 213 | nadkom 214 | Nadkom 215 | podkom 216 | Podkom 217 | ks 218 | Ks 219 | gen 220 | Gen 221 | por 222 | Por 223 | reż 224 | Reż 225 | przyp 226 | Przyp 227 | śp 228 | św 229 | śW 230 | Śp 231 | Św 232 | ŚW 233 | szer 234 | Szer 235 | pkt #NUMERIC_ONLY# 236 | str #NUMERIC_ONLY# 237 | tab #NUMERIC_ONLY# 238 | Tab #NUMERIC_ONLY# 239 | tel 240 | ust #NUMERIC_ONLY# 241 | par #NUMERIC_ONLY# 242 | poz 243 | pok 244 | oo 245 | oO 246 | Oo 247 | OO 248 | r #NUMERIC_ONLY# 249 | l #NUMERIC_ONLY# 250 | s #NUMERIC_ONLY# 251 | najśw 252 | Najśw 253 | A 254 | B 255 | C 256 | D 257 | E 258 | F 259 | G 260 | H 261 | I 262 | J 263 | K 264 | L 265 | M 266 | N 267 | O 268 | P 269 | Q 270 | R 271 | S 272 | T 273 | U 274 | V 275 | W 276 | X 277 | Y 278 | Z 279 | Ś 280 | Ć 281 | Ż 282 | Ź 283 | Dz 284 | -------------------------------------------------------------------------------- /src/mosestokenizer/nonbreaking_prefixes/nonbreaking_prefix.pt: 
-------------------------------------------------------------------------------- 1 | #File adapted for PT by H. Leal Fontes from the EN & DE versions published with moses-2009-04-13. Last update: 10.11.2009. 2 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 3 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 4 | 5 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 6 | #usually upper case letters are initials in a name 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | 61 | #Roman Numerals. A dot after one of these is not a sentence break in Portuguese. 62 | I 63 | II 64 | III 65 | IV 66 | V 67 | VI 68 | VII 69 | VIII 70 | IX 71 | X 72 | XI 73 | XII 74 | XIII 75 | XIV 76 | XV 77 | XVI 78 | XVII 79 | XVIII 80 | XIX 81 | XX 82 | i 83 | ii 84 | iii 85 | iv 86 | v 87 | vi 88 | vii 89 | viii 90 | ix 91 | x 92 | xi 93 | xii 94 | xiii 95 | xiv 96 | xv 97 | xvi 98 | xvii 99 | xviii 100 | xix 101 | xx 102 | 103 | #List of titles. 
These are often followed by upper-case names, but do not indicate sentence breaks 104 | Adj 105 | Adm 106 | Adv 107 | Art 108 | Ca 109 | Capt 110 | Cmdr 111 | Col 112 | Comdr 113 | Con 114 | Corp 115 | Cpl 116 | DR 117 | DRA 118 | Dr 119 | Dra 120 | Dras 121 | Drs 122 | Eng 123 | Enga 124 | Engas 125 | Engos 126 | Ex 127 | Exo 128 | Exmo 129 | Fig 130 | Gen 131 | Hosp 132 | Insp 133 | Lda 134 | MM 135 | MR 136 | MRS 137 | MS 138 | Maj 139 | Mrs 140 | Ms 141 | Msgr 142 | Op 143 | Ord 144 | Pfc 145 | Ph 146 | Prof 147 | Pvt 148 | Rep 149 | Reps 150 | Res 151 | Rev 152 | Rt 153 | Sen 154 | Sens 155 | Sfc 156 | Sgt 157 | Sr 158 | Sra 159 | Sras 160 | Srs 161 | Sto 162 | Supt 163 | Surg 164 | adj 165 | adm 166 | adv 167 | art 168 | cit 169 | col 170 | con 171 | corp 172 | cpl 173 | dr 174 | dra 175 | dras 176 | drs 177 | eng 178 | enga 179 | engas 180 | engos 181 | ex 182 | exo 183 | exmo 184 | fig 185 | op 186 | prof 187 | sr 188 | sra 189 | sras 190 | srs 191 | sto 192 | 193 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 194 | v 195 | vs 196 | i.e 197 | rev 198 | e.g 199 | 200 | #Numbers only. These should only induce breaks when followed by a numeric sequence 201 | # add NUMERIC_ONLY after the word for this function 202 | #This case is mostly for the english "No." 
which can either be a sentence of its own, or 203 | #if followed by a number, a non-breaking prefix 204 | No #NUMERIC_ONLY# 205 | no #NUMERIC_ONLY# 206 | Nos 207 | Art #NUMERIC_ONLY# 208 | art #NUMERIC_ONLY# 209 | Nr 210 | p #NUMERIC_ONLY# 211 | pp #NUMERIC_ONLY# 212 | 213 | -------------------------------------------------------------------------------- /src/mosestokenizer/nonbreaking_prefixes/nonbreaking_prefix.ru: -------------------------------------------------------------------------------- 1 | # added Cyrillic uppercase letters [А-Я] 2 | # removed 000D carriage return (this is not removed by chomp in tokenizer.perl, and prevents recognition of the prefixes) 3 | # edited by Kate Young (nspaceanalysis@earthlink.net) 21 May 2013 4 | А 5 | Б 6 | В 7 | Г 8 | Д 9 | Е 10 | Ж 11 | З 12 | И 13 | Й 14 | К 15 | Л 16 | М 17 | Н 18 | О 19 | П 20 | Р 21 | С 22 | Т 23 | У 24 | Ф 25 | Х 26 | Ц 27 | Ч 28 | Ш 29 | Щ 30 | Ъ 31 | Ы 32 | Ь 33 | Э 34 | Ю 35 | Я 36 | A 37 | B 38 | C 39 | D 40 | E 41 | F 42 | G 43 | H 44 | I 45 | J 46 | K 47 | L 48 | M 49 | N 50 | O 51 | P 52 | Q 53 | R 54 | S 55 | T 56 | U 57 | V 58 | W 59 | X 60 | Y 61 | Z 62 | 0гг 63 | 1гг 64 | 2гг 65 | 3гг 66 | 4гг 67 | 5гг 68 | 6гг 69 | 7гг 70 | 8гг 71 | 9гг 72 | 0г 73 | 1г 74 | 2г 75 | 3г 76 | 4г 77 | 5г 78 | 6г 79 | 7г 80 | 8г 81 | 9г 82 | Xвв 83 | Vвв 84 | Iвв 85 | Lвв 86 | Mвв 87 | Cвв 88 | Xв 89 | Vв 90 | Iв 91 | Lв 92 | Mв 93 | Cв 94 | 0м 95 | 1м 96 | 2м 97 | 3м 98 | 4м 99 | 5м 100 | 6м 101 | 7м 102 | 8м 103 | 9м 104 | 0мм 105 | 1мм 106 | 2мм 107 | 3мм 108 | 4мм 109 | 5мм 110 | 6мм 111 | 7мм 112 | 8мм 113 | 9мм 114 | 0см 115 | 1см 116 | 2см 117 | 3см 118 | 4см 119 | 5см 120 | 6см 121 | 7см 122 | 8см 123 | 9см 124 | 0дм 125 | 1дм 126 | 2дм 127 | 3дм 128 | 4дм 129 | 5дм 130 | 6дм 131 | 7дм 132 | 8дм 133 | 9дм 134 | 0л 135 | 1л 136 | 2л 137 | 3л 138 | 4л 139 | 5л 140 | 6л 141 | 7л 142 | 8л 143 | 9л 144 | 0км 145 | 1км 146 | 2км 147 | 3км 148 | 4км 149 | 5км 150 | 6км 151 | 7км 152 | 8км 153 | 9км 154 | 0га 
155 | 1га 156 | 2га 157 | 3га 158 | 4га 159 | 5га 160 | 6га 161 | 7га 162 | 8га 163 | 9га 164 | 0кг 165 | 1кг 166 | 2кг 167 | 3кг 168 | 4кг 169 | 5кг 170 | 6кг 171 | 7кг 172 | 8кг 173 | 9кг 174 | 0т 175 | 1т 176 | 2т 177 | 3т 178 | 4т 179 | 5т 180 | 6т 181 | 7т 182 | 8т 183 | 9т 184 | 0г 185 | 1г 186 | 2г 187 | 3г 188 | 4г 189 | 5г 190 | 6г 191 | 7г 192 | 8г 193 | 9г 194 | 0мг 195 | 1мг 196 | 2мг 197 | 3мг 198 | 4мг 199 | 5мг 200 | 6мг 201 | 7мг 202 | 8мг 203 | 9мг 204 | бульв 205 | в 206 | вв 207 | г 208 | га 209 | гг 210 | гл 211 | гос 212 | д 213 | дм 214 | доп 215 | др 216 | е 217 | ед 218 | ед 219 | зам 220 | и 221 | инд 222 | исп 223 | Исп 224 | к 225 | кап 226 | кг 227 | кв 228 | кл 229 | км 230 | кол 231 | комн 232 | коп 233 | куб 234 | л 235 | лиц 236 | лл 237 | м 238 | макс 239 | мг 240 | мин 241 | мл 242 | млн 243 | млрд 244 | мм 245 | н 246 | наб 247 | нач 248 | неуд 249 | ном 250 | о 251 | обл 252 | обр 253 | общ 254 | ок 255 | ост 256 | отл 257 | п 258 | пер 259 | перераб 260 | пл 261 | пос 262 | пр 263 | просп 264 | проф 265 | р 266 | ред 267 | руб 268 | с 269 | сб 270 | св 271 | см 272 | соч 273 | ср 274 | ст 275 | стр 276 | т 277 | тел 278 | Тел 279 | тех 280 | тт 281 | туп 282 | тыс 283 | уд 284 | ул 285 | уч 286 | физ 287 | х 288 | хор 289 | ч 290 | чел 291 | шт 292 | экз 293 | э 294 | -------------------------------------------------------------------------------- /src/mosestokenizer/nonbreaking_prefixes/nonbreaking_prefix.ta: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | அ 7 | ஆ 8 | இ 9 | ஈ 10 | உ 11 | ஊ 12 | எ 13 | ஏ 14 | ஐ 15 | ஒ 16 | ஓ 17 | ஔ 18 | ஃ 19 | க 20 | கா 21 | கி 22 | கீ 23 | கு 24 | கூ 25 | கெ 26 | கே 27 | கை 28 | கொ 29 | கோ 30 | கௌ 31 | க் 32 | ச 33 | சா 34 | சி 35 | சீ 36 | சு 37 | சூ 38 | செ 39 | சே 40 | சை 41 | சொ 42 | சோ 43 | சௌ 44 | ச் 45 | ட 46 | டா 47 | டி 48 | டீ 49 | டு 50 | டூ 51 | டெ 52 | டே 53 | டை 54 | டொ 55 | டோ 56 | டௌ 57 | ட் 58 | த 59 | தா 60 | தி 61 | தீ 62 | து 63 | தூ 64 | தெ 65 | தே 66 | தை 67 | தொ 68 | தோ 69 | தௌ 70 | த் 71 | ப 72 | பா 73 | பி 74 | பீ 75 | பு 76 | பூ 77 | பெ 78 | பே 79 | பை 80 | பொ 81 | போ 82 | பௌ 83 | ப் 84 | ற 85 | றா 86 | றி 87 | றீ 88 | று 89 | றூ 90 | றெ 91 | றே 92 | றை 93 | றொ 94 | றோ 95 | றௌ 96 | ற் 97 | ய 98 | யா 99 | யி 100 | யீ 101 | யு 102 | யூ 103 | யெ 104 | யே 105 | யை 106 | யொ 107 | யோ 108 | யௌ 109 | ய் 110 | ர 111 | ரா 112 | ரி 113 | ரீ 114 | ரு 115 | ரூ 116 | ரெ 117 | ரே 118 | ரை 119 | ரொ 120 | ரோ 121 | ரௌ 122 | ர் 123 | ல 124 | லா 125 | லி 126 | லீ 127 | லு 128 | லூ 129 | லெ 130 | லே 131 | லை 132 | லொ 133 | லோ 134 | லௌ 135 | ல் 136 | வ 137 | வா 138 | வி 139 | வீ 140 | வு 141 | வூ 142 | வெ 143 | வே 144 | வை 145 | வொ 146 | வோ 147 | வௌ 148 | வ் 149 | ள 150 | ளா 151 | ளி 152 | ளீ 153 | ளு 154 | ளூ 155 | ளெ 156 | ளே 157 | ளை 158 | ளொ 159 | ளோ 160 | ளௌ 161 | ள் 162 | ழ 163 | ழா 164 | ழி 165 | ழீ 166 | ழு 167 | ழூ 168 | ழெ 169 | ழே 170 | ழை 171 | ழொ 172 | ழோ 173 | ழௌ 174 | ழ் 175 | ங 176 | ஙா 177 | ஙி 178 | ஙீ 179 | ஙு 180 | ஙூ 181 | ஙெ 182 | ஙே 183 | ஙை 184 | ஙொ 185 | ஙோ 186 | ஙௌ 187 | ங் 188 | ஞ 189 | ஞா 190 | ஞி 191 | ஞீ 192 | ஞு 193 | ஞூ 194 | ஞெ 195 | ஞே 196 | ஞை 197 | ஞொ 198 | ஞோ 199 | ஞௌ 200 | ஞ் 201 | ண 202 | ணா 203 | ணி 204 | ணீ 205 | ணு 206 | ணூ 207 | ணெ 208 | ணே 209 | ணை 210 | ணொ 211 | ணோ 212 | ணௌ 213 | ண் 214 | ந 215 | நா 216 | நி 217 | நீ 218 | நு 219 | நூ 220 | நெ 221 | 
நே 222 | நை 223 | நொ 224 | நோ 225 | நௌ 226 | ந் 227 | ம 228 | மா 229 | மி 230 | மீ 231 | மு 232 | மூ 233 | மெ 234 | மே 235 | மை 236 | மொ 237 | மோ 238 | மௌ 239 | ம் 240 | ன 241 | னா 242 | னி 243 | னீ 244 | னு 245 | னூ 246 | னெ 247 | னே 248 | னை 249 | னொ 250 | னோ 251 | னௌ 252 | ன் 253 | 254 | 255 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 256 | திரு 257 | திருமதி 258 | வண 259 | கௌரவ 260 | 261 | 262 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 263 | உ.ம் 264 | #கா.ம் 265 | #எ.ம் 266 | 267 | 268 | #Numbers only. These should only induce breaks when followed by a numeric sequence 269 | # add NUMERIC_ONLY after the word for this function 270 | #This case is mostly for the english "No." which can either be a sentence of its own, or 271 | #if followed by a number, a non-breaking prefix 272 | No #NUMERIC_ONLY# 273 | Nos 274 | Art #NUMERIC_ONLY# 275 | Nr 276 | pp #NUMERIC_ONLY# 277 | -------------------------------------------------------------------------------- /src/mosestokenizer/tokenizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | A module for interfacing with ``tokenizer.perl`` from Moses. 3 | 4 | Copyright © 2016-2017, Luís Gomes 5 | """ 6 | 7 | usage = """ 8 | Usage: 9 | moses-tokenizer [options] <lang> [<inputfile> [<outputfile>]] 10 | moses-tokenizer --selftest [--verbose] 11 | 12 | Options: 13 | --selftest, -t Run selftests. 14 | --verbose, -v Be more verbose. 15 | --old Use older version (1.0) of the tokenizer. 16 | If this option is not given, then version 1.1 17 | will be used. 18 | --no-escape Do not escape output for HTML.
19 | 20 | 2016, Luís Gomes 21 | """ 22 | 23 | 24 | from docopt import docopt 25 | from openfile import openfile 26 | from os import path 27 | from toolwrapper import ToolWrapper 28 | import sys 29 | 30 | 31 | class MosesTokenizer(ToolWrapper): 32 | """A module for interfacing with ``tokenizer.perl`` from Moses. 33 | 34 | This class communicates with tokenizer.perl process via pipes. When the 35 | MosesTokenizer object is no longer needed, the close() method should be 36 | called to free system resources. The class supports the context manager 37 | interface. If used in a with statement, the close() method is invoked 38 | automatically. 39 | 40 | >>> tokenize = MosesTokenizer('en') 41 | >>> tokenize('Hello World!') 42 | ['Hello', 'World', '!'] 43 | """ 44 | 45 | def __init__(self, lang="en", old_version=False, no_escape=False, extra=None): 46 | self.lang = lang 47 | program = path.join( 48 | path.dirname(__file__), 49 | "tokenizer-" + ("v1.0" if old_version else "v1.1") + ".perl" 50 | ) 51 | argv = ["perl", program, "-q", "-l", self.lang] 52 | if no_escape: 53 | argv.append("-no-escape") 54 | if not old_version: 55 | # -b = disable output buffering 56 | # -a = aggressive hyphen splitting 57 | argv.extend(["-b", "-a"]) 58 | if extra: 59 | argv.extend(extra) 60 | super().__init__(argv) 61 | 62 | def __str__(self): 63 | return "MosesTokenizer(lang=\"{lang}\")".format(lang=self.lang) 64 | 65 | def __call__(self, sentence): 66 | """Tokenizes a single sentence. 67 | 68 | Newline characters are not allowed in the sentence to be tokenized. 
69 | """ 70 | assert isinstance(sentence, str) 71 | sentence = sentence.rstrip("\n") 72 | assert "\n" not in sentence 73 | if not sentence: 74 | return [] 75 | self.writeline(sentence) 76 | return self.readline().split() 77 | 78 | 79 | def main(): 80 | args = docopt(usage) 81 | if args["--selftest"]: 82 | import doctest 83 | import mosestokenizer.tokenizer 84 | doctest.testmod(mosestokenizer.tokenizer) 85 | if not args["<lang>"]: 86 | sys.exit(0) 87 | tokenize = MosesTokenizer( 88 | args["<lang>"], 89 | old_version=args["--old"], 90 | no_escape=args["--no-escape"], 91 | ) 92 | inputfile = openfile(args["<inputfile>"]) 93 | outputfile = openfile(args["<outputfile>"], "wt") 94 | with inputfile, outputfile: 95 | for line in inputfile: 96 | print(*tokenize(line), file=outputfile) 97 | 98 | if __name__ == "__main__": 99 | main() 100 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | mosestokenizer 2 | ============== 3 | 4 | This package provides wrappers for some pre-processing Perl scripts from the 5 | Moses toolkit, namely, ``normalize-punctuation.perl``, ``tokenizer.perl``, 6 | ``detokenizer.perl`` and ``split-sentences.perl``. 7 | 8 | Sample Usage 9 | ------------ 10 | 11 | All provided classes are importable from the package ``mosestokenizer``. 12 | 13 | >>> from mosestokenizer import * 14 | 15 | All classes have a constructor that takes a two-letter language code as 16 | argument (``'en'``, ``'fr'``, ``'de'``, etc) and the resulting objects 17 | are callable. 18 | 19 | When created, these wrapper objects launch the corresponding Perl script as a 20 | background process. When the objects are no longer needed, you should call the 21 | ``.close()`` method to close the background process and free system resources. 22 | 23 | The objects also support the context manager interface.
24 | Thus, if used within a ``with`` block, the ``.close()`` method is invoked 25 | automatically when the block exits. 26 | 27 | The following two usages of ``MosesTokenizer`` are equivalent: 28 | 29 | >>> # here we will call .close() explicitly at the end: 30 | >>> tokenize = MosesTokenizer('en') 31 | >>> tokenize('Hello World!') 32 | ['Hello', 'World', '!'] 33 | >>> tokenize.close() 34 | 35 | >>> # here we take advantage of the context manager interface: 36 | >>> with MosesTokenizer('en') as tokenize: 37 | ...     tokenize('Hello World!') 38 | ... 39 | ['Hello', 'World', '!'] 40 | 41 | As shown above, ``MosesTokenizer`` callable objects take a string and return a 42 | list of tokens (strings). 43 | 44 | By contrast, ``MosesDetokenizer`` takes a list of tokens and returns a string: 45 | 46 | >>> with MosesDetokenizer('en') as detokenize: 47 | ...     detokenize(['Hello', 'World', '!']) 48 | ... 49 | 'Hello World!' 50 | 51 | ``MosesSentenceSplitter`` does more than its name says. Besides splitting 52 | sentences, it also unwraps text, i.e. it tries to guess whether a sentence 53 | continues on the next line or not. It takes a list of lines (strings) and 54 | returns a list of sentences (strings): 55 | 56 | >>> with MosesSentenceSplitter('en') as splitsents: 57 | ...     splitsents([ 58 | ...         'Mr. Smith is away. Do you want to', 59 | ...         'leave a message?' 60 | ...     ]) 61 | ... 62 | ['Mr. Smith is away.', 'Do you want to leave a message?'] 63 | 64 | 65 | ``MosesPunctuationNormalizer`` objects take a string as argument and return a 66 | string: 67 | 68 | >>> with MosesPunctuationNormalizer('en') as normalize: 69 | ...     normalize('«Hello World» — she said…') 70 | ... 71 | '"Hello World" - she said...' 72 | 73 | 74 | License 75 | ------- 76 | 77 | Copyright © 2016-2021, Luís Gomes.
78 | 79 | This library is free software; you can redistribute it and/or 80 | modify it under the terms of the GNU Lesser General Public 81 | License as published by the Free Software Foundation; either 82 | version 2.1 of the License, or (at your option) any later version. 83 | 84 | This library is distributed in the hope that it will be useful, 85 | but WITHOUT ANY WARRANTY; without even the implied warranty of 86 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 87 | Lesser General Public License for more details. 88 | 89 | You should have received a copy of the GNU Lesser General Public 90 | License along with this library; if not, write to the Free Software 91 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 92 | 02110-1301 USA 93 | -------------------------------------------------------------------------------- /src/mosestokenizer/nonbreaking_prefixes/nonbreaking_prefix.de: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | #no german words end in single lower-case letters, so we throw those in too. 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | 61 | #Roman Numerals. A dot after one of these is not a sentence break in German. 
62 | I 63 | II 64 | III 65 | IV 66 | V 67 | VI 68 | VII 69 | VIII 70 | IX 71 | X 72 | XI 73 | XII 74 | XIII 75 | XIV 76 | XV 77 | XVI 78 | XVII 79 | XVIII 80 | XIX 81 | XX 82 | i 83 | ii 84 | iii 85 | iv 86 | v 87 | vi 88 | vii 89 | viii 90 | ix 91 | x 92 | xi 93 | xii 94 | xiii 95 | xiv 96 | xv 97 | xvi 98 | xvii 99 | xviii 100 | xix 101 | xx 102 | 103 | #Titles and Honorifics 104 | Adj 105 | Adm 106 | Adv 107 | Asst 108 | Bart 109 | Bldg 110 | Brig 111 | Bros 112 | Capt 113 | Cmdr 114 | Col 115 | Comdr 116 | Con 117 | Corp 118 | Cpl 119 | DR 120 | Dr 121 | Ens 122 | Gen 123 | Gov 124 | Hon 125 | Hosp 126 | Insp 127 | Lt 128 | MM 129 | MR 130 | MRS 131 | MS 132 | Maj 133 | Messrs 134 | Mlle 135 | Mme 136 | Mr 137 | Mrs 138 | Ms 139 | Msgr 140 | Op 141 | Ord 142 | Pfc 143 | Ph 144 | Prof 145 | Pvt 146 | Rep 147 | Reps 148 | Res 149 | Rev 150 | Rt 151 | Sen 152 | Sens 153 | Sfc 154 | Sgt 155 | Sr 156 | St 157 | Supt 158 | Surg 159 | 160 | #Misc symbols 161 | Mio 162 | Mrd 163 | bzw 164 | v 165 | vs 166 | usw 167 | d.h 168 | z.B 169 | u.a 170 | etc 171 | Mrd 172 | MwSt 173 | ggf 174 | d.J 175 | D.h 176 | m.E 177 | vgl 178 | I.F 179 | z.T 180 | sogen 181 | ff 182 | u.E 183 | g.U 184 | g.g.A 185 | c.-à-d 186 | Buchst 187 | u.s.w 188 | sog 189 | u.ä 190 | Std 191 | evtl 192 | Zt 193 | Chr 194 | u.U 195 | o.ä 196 | Ltd 197 | b.A 198 | z.Zt 199 | spp 200 | sen 201 | SA 202 | k.o 203 | jun 204 | i.H.v 205 | dgl 206 | dergl 207 | Co 208 | zzt 209 | usf 210 | s.p.a 211 | Dkr 212 | Corp 213 | bzgl 214 | BSE 215 | 216 | #Number indicators 217 | # add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it 218 | No 219 | Nos 220 | Art 221 | Nr 222 | pp 223 | ca 224 | Ca 225 | 226 | #Ordinals are done with . in German - "1." 
= "1st" in English 227 | 1 228 | 2 229 | 3 230 | 4 231 | 5 232 | 6 233 | 7 234 | 8 235 | 9 236 | 10 237 | 11 238 | 12 239 | 13 240 | 14 241 | 15 242 | 16 243 | 17 244 | 18 245 | 19 246 | 20 247 | 21 248 | 22 249 | 23 250 | 24 251 | 25 252 | 26 253 | 27 254 | 28 255 | 29 256 | 30 257 | 31 258 | 32 259 | 33 260 | 34 261 | 35 262 | 36 263 | 37 264 | 38 265 | 39 266 | 40 267 | 41 268 | 42 269 | 43 270 | 44 271 | 45 272 | 46 273 | 47 274 | 48 275 | 49 276 | 50 277 | 51 278 | 52 279 | 53 280 | 54 281 | 55 282 | 56 283 | 57 284 | 58 285 | 59 286 | 60 287 | 61 288 | 62 289 | 63 290 | 64 291 | 65 292 | 66 293 | 67 294 | 68 295 | 69 296 | 70 297 | 71 298 | 72 299 | 73 300 | 74 301 | 75 302 | 76 303 | 77 304 | 78 305 | 79 306 | 80 307 | 81 308 | 82 309 | 83 310 | 84 311 | 85 312 | 86 313 | 87 314 | 88 315 | 89 316 | 90 317 | 91 318 | 92 319 | 93 320 | 94 321 | 95 322 | 96 323 | 97 324 | 98 325 | 99 326 | -------------------------------------------------------------------------------- /src/mosestokenizer/nonbreaking_prefixes/nonbreaking_prefix.cs: -------------------------------------------------------------------------------- 1 | Bc 2 | BcA 3 | Ing 4 | Ing.arch 5 | MUDr 6 | MVDr 7 | MgA 8 | Mgr 9 | JUDr 10 | PhDr 11 | RNDr 12 | PharmDr 13 | ThLic 14 | ThDr 15 | Ph.D 16 | Th.D 17 | prof 18 | doc 19 | CSc 20 | DrSc 21 | dr. h. 
c 22 | PaedDr 23 | Dr 24 | PhMr 25 | DiS 26 | abt 27 | ad 28 | a.i 29 | aj 30 | angl 31 | anon 32 | apod 33 | atd 34 | atp 35 | aut 36 | bd 37 | biogr 38 | b.m 39 | b.p 40 | b.r 41 | cca 42 | cit 43 | cizojaz 44 | c.k 45 | col 46 | čes 47 | čín 48 | čj 49 | ed 50 | facs 51 | fasc 52 | fol 53 | fot 54 | franc 55 | h.c 56 | hist 57 | hl 58 | hrsg 59 | ibid 60 | il 61 | ind 62 | inv.č 63 | jap 64 | jhdt 65 | jv 66 | koed 67 | kol 68 | korej 69 | kl 70 | krit 71 | lat 72 | lit 73 | m.a 74 | maď 75 | mj 76 | mp 77 | násl 78 | např 79 | nepubl 80 | něm 81 | no 82 | nr 83 | n.s 84 | okr 85 | odd 86 | odp 87 | obr 88 | opr 89 | orig 90 | phil 91 | pl 92 | pokrač 93 | pol 94 | port 95 | pozn 96 | př.kr 97 | př.n.l 98 | přel 99 | přeprac 100 | příl 101 | pseud 102 | pt 103 | red 104 | repr 105 | resp 106 | revid 107 | rkp 108 | roč 109 | roz 110 | rozš 111 | samost 112 | sect 113 | sest 114 | seš 115 | sign 116 | sl 117 | srv 118 | stol 119 | sv 120 | šk 121 | šk.ro 122 | špan 123 | tab 124 | t.č 125 | tis 126 | tj 127 | tř 128 | tzv 129 | univ 130 | uspoř 131 | vol 132 | vl.jm 133 | vs 134 | vyd 135 | vyobr 136 | zal 137 | zejm 138 | zkr 139 | zprac 140 | zvl 141 | n.p 142 | např 143 | než 144 | MUDr 145 | abl 146 | absol 147 | adj 148 | adv 149 | ak 150 | ak. 
sl 151 | akt 152 | alch 153 | amer 154 | anat 155 | angl 156 | anglosas 157 | arab 158 | arch 159 | archit 160 | arg 161 | astr 162 | astrol 163 | att 164 | bás 165 | belg 166 | bibl 167 | biol 168 | boh 169 | bot 170 | bulh 171 | círk 172 | csl 173 | č 174 | čas 175 | čes 176 | dat 177 | děj 178 | dep 179 | dět 180 | dial 181 | dór 182 | dopr 183 | dosl 184 | ekon 185 | epic 186 | etnonym 187 | eufem 188 | f 189 | fam 190 | fem 191 | fil 192 | film 193 | form 194 | fot 195 | fr 196 | fut 197 | fyz 198 | gen 199 | geogr 200 | geol 201 | geom 202 | germ 203 | gram 204 | hebr 205 | herald 206 | hist 207 | hl 208 | hovor 209 | hud 210 | hut 211 | chcsl 212 | chem 213 | ie 214 | imp 215 | impf 216 | ind 217 | indoevr 218 | inf 219 | instr 220 | interj 221 | ión 222 | iron 223 | it 224 | kanad 225 | katalán 226 | klas 227 | kniž 228 | komp 229 | konj 230 | 231 | konkr 232 | kř 233 | kuch 234 | lat 235 | lék 236 | les 237 | lid 238 | lit 239 | liturg 240 | lok 241 | log 242 | m 243 | mat 244 | meteor 245 | metr 246 | mod 247 | ms 248 | mysl 249 | n 250 | náb 251 | námoř 252 | neklas 253 | něm 254 | nesklon 255 | nom 256 | ob 257 | obch 258 | obyč 259 | ojed 260 | opt 261 | part 262 | pas 263 | pejor 264 | pers 265 | pf 266 | pl 267 | plpf 268 | 269 | práv 270 | prep 271 | předl 272 | přivl 273 | r 274 | rcsl 275 | refl 276 | reg 277 | rkp 278 | ř 279 | řec 280 | s 281 | samohl 282 | sg 283 | sl 284 | souhl 285 | spec 286 | srov 287 | stfr 288 | střv 289 | stsl 290 | subj 291 | subst 292 | superl 293 | sv 294 | sz 295 | táz 296 | tech 297 | telev 298 | teol 299 | trans 300 | typogr 301 | var 302 | vedl 303 | verb 304 | vl. 
jm 305 | voj 306 | vok 307 | vůb 308 | vulg 309 | výtv 310 | vztaž 311 | zahr 312 | zájm 313 | zast 314 | zejm 315 | 316 | zeměd 317 | zkr 318 | zř 319 | mj 320 | dl 321 | atp 322 | sport 323 | Mgr 324 | horn 325 | MVDr 326 | JUDr 327 | RSDr 328 | Bc 329 | PhDr 330 | ThDr 331 | Ing 332 | aj 333 | apod 334 | PharmDr 335 | pomn 336 | ev 337 | slang 338 | nprap 339 | odp 340 | dop 341 | pol 342 | st 343 | stol 344 | p. n. l 345 | před n. l 346 | n. l 347 | př. Kr 348 | po Kr 349 | př. n. l 350 | odd 351 | RNDr 352 | tzv 353 | atd 354 | tzn 355 | resp 356 | tj 357 | p 358 | br 359 | č. j 360 | čj 361 | č. p 362 | čp 363 | a. s 364 | s. r. o 365 | spol. s r. o 366 | p. o 367 | s. p 368 | v. o. s 369 | k. s 370 | o. p. s 371 | o. s 372 | v. r 373 | v z 374 | ml 375 | vč 376 | kr 377 | mld 378 | hod 379 | popř 380 | ap 381 | event 382 | rus 383 | slov 384 | rum 385 | švýc 386 | P. T 387 | zvl 388 | hor 389 | dol 390 | S.O.S -------------------------------------------------------------------------------- /src/mosestokenizer/tokenizer-v1.0.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # written by Josh Schroeder, based on code by Philipp Koehn 3 | 4 | binmode(STDIN, ":utf8"); 5 | binmode(STDOUT, ":unix:utf8"); 6 | 7 | use strict; 8 | 9 | use FindBin qw($RealBin); 10 | 11 | my $mydir = "$RealBin/nonbreaking_prefixes"; 12 | 13 | my %NONBREAKING_PREFIX = (); 14 | my $language = "en"; 15 | my $QUIET = 0; 16 | my $HELP = 0; 17 | 18 | while (@ARGV) { 19 | $_ = shift; 20 | /^-l$/ && ($language = shift, next); 21 | /^-q$/ && ($QUIET = 1, next); 22 | /^-h$/ && ($HELP = 1, next); 23 | } 24 | 25 | if ($HELP) { 26 | print "Usage ./tokenizer.perl (-l [en|de|...]) < textfile > tokenizedfile\n"; 27 | exit; 28 | } 29 | if (!$QUIET) { 30 | print STDERR "Tokenizer v3\n"; 31 | print STDERR "Language: $language\n"; 32 | } 33 | 34 | load_prefixes($language,\%NONBREAKING_PREFIX); 35 | 36 | if (scalar(%NONBREAKING_PREFIX) eq 0){ 
37 | print STDERR "Warning: No known abbreviations for language '$language'\n"; 38 | } 39 | 40 | while(<STDIN>) { 41 | if (/^<.+>$/ || /^\s*$/) { 42 | #don't try to tokenize XML/HTML tag lines 43 | print $_; 44 | } 45 | else { 46 | print &tokenize($_); 47 | } 48 | } 49 | 50 | 51 | sub tokenize { 52 | my($text) = @_; 53 | chomp($text); 54 | $text = " $text "; 55 | 56 | # separate out all "other" special characters 57 | $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g; 58 | 59 | #multi-dots stay together 60 | $text =~ s/\.([\.]+)/ DOTMULTI$1/g; 61 | while($text =~ /DOTMULTI\./) { 62 | $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g; 63 | $text =~ s/DOTMULTI\./DOTDOTMULTI/g; 64 | } 65 | 66 | # separate out "," except if within numbers (5,300) 67 | $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; 68 | # separate , pre and post number 69 | $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; 70 | $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g; 71 | 72 | # turn ` into ' 73 | $text =~ s/\`/\'/g; 74 | 75 | #turn '' into " 76 | $text =~ s/\'\'/ \" /g; 77 | 78 | if ($language eq "en") { 79 | #split contractions right 80 | $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 81 | $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g; 82 | $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 83 | $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g; 84 | #special case for "1990's" 85 | $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g; 86 | } elsif (($language eq "fr") or ($language eq "it")) { 87 | #split contractions left 88 | $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 89 | $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g; 90 | $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 91 | $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g; 92 | } else { 93 | $text =~ s/\'/ \' /g; 94 | } 95 | 96 | #word token method 97 | my @words = split(/\s/,$text); 98 | $text = ""; 99 | for (my $i=0;$i<(scalar(@words));$i++) { 100 | my $word = $words[$i]; 101 | if (
$word =~ /^(\S+)\.$/) { 102 | my $pre = $1; 103 | if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/))) { 104 | #no change 105 | } elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/))) { 106 | #no change 107 | } else { 108 | $word = $pre." ."; 109 | } 110 | } 111 | $text .= $word." "; 112 | } 113 | 114 | # clean up extraneous spaces 115 | $text =~ s/ +/ /g; 116 | $text =~ s/^ //g; 117 | $text =~ s/ $//g; 118 | 119 | #restore multi-dots 120 | while($text =~ /DOTDOTMULTI/) { 121 | $text =~ s/DOTDOTMULTI/DOTMULTI./g; 122 | } 123 | $text =~ s/DOTMULTI/./g; 124 | 125 | #ensure final line break 126 | $text .= "\n" unless $text =~ /\n$/; 127 | 128 | return $text; 129 | } 130 | 131 | sub load_prefixes { 132 | my ($language, $PREFIX_REF) = @_; 133 | 134 | my $prefixfile = "$mydir/nonbreaking_prefix.$language"; 135 | 136 | #default back to English if we don't have a language-specific prefix file 137 | if (!(-e $prefixfile)) { 138 | $prefixfile = "$mydir/nonbreaking_prefix.en"; 139 | print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n"; 140 | die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile); 141 | } 142 | 143 | if (-e "$prefixfile") { 144 | open(PREFIX, "<:utf8", "$prefixfile"); 145 | while (<PREFIX>) { 146 | my $item = $_; 147 | chomp($item); 148 | if (($item) && (substr($item,0,1) ne "#")) { 149 | if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) { 150 | $PREFIX_REF->{$1} = 2; 151 | } else { 152 | $PREFIX_REF->{$item} = 1; 153 | } 154 | } 155 | } 156 | close(PREFIX); 157 | } 158 | 159 | } 160 | 161 | -------------------------------------------------------------------------------- /src/mosestokenizer/nonbreaking_prefixes/nonbreaking_prefix.sk: -------------------------------------------------------------------------------- 1 | Bc 2 | Mgr 3 | RNDr 4 | PharmDr 5 | PhDr 6 | JUDr 7 | PaedDr 8 | ThDr 9 | Ing 10 | MUDr 11 | MDDr 12 | MVDr 13 | Dr 14 | ThLic 15 | PhD 16 | ArtD 17 | ThDr 18 | Dr 19 | DrSc 20 | CSs 21 | prof 22 | obr 23 | Obr 24 | Č 25 | č 26 | absol 27 | adj 28 | admin 29 | adr 30 | Adr 31 | adv 32 | advok 33 | afr 34 | ak 35 | akad 36 | akc 37 | akuz 38 | et 39 | al 40 | alch 41 | amer 42 | anat 43 | angl 44 | Angl 45 | anglosas 46 | anorg 47 | ap 48 | apod 49 | arch 50 | archeol 51 | archit 52 | arg 53 | art 54 | astr 55 | astrol 56 | astron 57 | atp 58 | atď 59 | austr 60 | Austr 61 | aut 62 | belg 63 | Belg 64 | bibl 65 | Bibl 66 | biol 67 | bot 68 | bud 69 | bás 70 | býv 71 | cest 72 | chem 73 | cirk 74 | csl 75 | čs 76 | Čs 77 | dat 78 | dep 79 | det 80 | dial 81 | diaľ 82 | dipl 83 | distrib 84 | dokl 85 | dosl 86 | dopr 87 | dram 88 | duš 89 | dv 90 | dvojčl 91 | dór 92 | ekol 93 | ekon 94 | el 95 | elektr 96 | elektrotech 97 | energet 98 | epic 99 | est 100 | etc 101 | etonym 102 | eufem 103 | európ 104 | Európ 105 | ev 106 | evid 107 | expr 108 | fa 109 | fam 110 | farm 111 | fem 112 | feud 113 | fil 114 | filat 115 | filoz 116 | fi 117 | fon 118 | form 119 | fot 120 | fr 121 | Fr 122 | franc 123 | Franc 124 | fraz 125 | fut 126 | fyz 127 | fyziol 128 | garb 129 | gen 130 | genet 131 |
genpor 132 | geod 133 | geogr 134 | geol 135 | geom 136 | germ 137 | gr 138 | Gr 139 | gréc 140 | Gréc 141 | gréckokat 142 | hebr 143 | herald 144 | hist 145 | hlav 146 | hosp 147 | hromad 148 | hud 149 | hypok 150 | ident 151 | i.e 152 | ident 153 | imp 154 | impf 155 | indoeur 156 | inf 157 | inform 158 | instr 159 | int 160 | interj 161 | inšt 162 | inštr 163 | iron 164 | jap 165 | Jap 166 | jaz 167 | jedn 168 | juhoamer 169 | juhových 170 | juhozáp 171 | juž 172 | kanad 173 | Kanad 174 | kanc 175 | kapit 176 | kpt 177 | kart 178 | katastr 179 | knih 180 | kniž 181 | komp 182 | konj 183 | konkr 184 | kozmet 185 | krajč 186 | kresť 187 | kt 188 | kuch 189 | lat 190 | latinskoamer 191 | lek 192 | lex 193 | lingv 194 | lit 195 | litur 196 | log 197 | lok 198 | max 199 | Max 200 | maď 201 | Maď 202 | medzinár 203 | mest 204 | metr 205 | mil 206 | Mil 207 | min 208 | Min 209 | miner 210 | ml 211 | mld 212 | mn 213 | mod 214 | mytol 215 | napr 216 | nar 217 | Nar 218 | nasl 219 | nedok 220 | neg 221 | negat 222 | neklas 223 | nem 224 | Nem 225 | neodb 226 | neos 227 | neskl 228 | nesklon 229 | nespis 230 | nespráv 231 | neved 232 | než 233 | niekt 234 | niž 235 | nom 236 | náb 237 | nákl 238 | námor 239 | nár 240 | obch 241 | obj 242 | obv 243 | obyč 244 | obč 245 | občian 246 | odb 247 | odd 248 | ods 249 | ojed 250 | okr 251 | Okr 252 | opt 253 | opyt 254 | org 255 | os 256 | osob 257 | ot 258 | ovoc 259 | par 260 | part 261 | pejor 262 | pers 263 | pf 264 | Pf 265 | P.f 266 | p.f 267 | pl 268 | Plk 269 | pod 270 | podst 271 | pokl 272 | polit 273 | politol 274 | polygr 275 | pomn 276 | popl 277 | por 278 | porad 279 | porov 280 | posch 281 | potrav 282 | použ 283 | poz 284 | pozit 285 | poľ 286 | poľno 287 | poľnohosp 288 | poľov 289 | pošt 290 | pož 291 | prac 292 | predl 293 | pren 294 | prep 295 | preuk 296 | priezv 297 | Priezv 298 | privl 299 | prof 300 | práv 301 | príd 302 | príj 303 | prík 304 | príp 305 | prír 306 | prísl 307 | príslov 308 | príč 309 | 
psych 310 | publ 311 | pís 312 | písm 313 | pôv 314 | refl 315 | reg 316 | rep 317 | resp 318 | rozk 319 | rozlič 320 | rozpráv 321 | roč 322 | Roč 323 | ryb 324 | rádiotech 325 | rím 326 | samohl 327 | semest 328 | sev 329 | severoamer 330 | severových 331 | severozáp 332 | sg 333 | skr 334 | skup 335 | sl 336 | Sloven 337 | soc 338 | soch 339 | sociol 340 | sp 341 | spol 342 | Spol 343 | spoloč 344 | spoluhl 345 | správ 346 | spôs 347 | st 348 | star 349 | starogréc 350 | starorím 351 | s.r.o 352 | stol 353 | stor 354 | str 355 | stredoamer 356 | stredoškol 357 | subj 358 | subst 359 | superl 360 | sv 361 | sz 362 | súkr 363 | súp 364 | súvzť 365 | tal 366 | Tal 367 | tech 368 | tel 369 | Tel 370 | telef 371 | teles 372 | telev 373 | teol 374 | trans 375 | turist 376 | tuzem 377 | typogr 378 | tzn 379 | tzv 380 | ukaz 381 | ul 382 | Ul 383 | umel 384 | univ 385 | ust 386 | ved 387 | vedľ 388 | verb 389 | veter 390 | vin 391 | viď 392 | vl 393 | vod 394 | vodohosp 395 | pnl 396 | vulg 397 | vyj 398 | vys 399 | vysokoškol 400 | vzťaž 401 | vôb 402 | vých 403 | výd 404 | výrob 405 | výsk 406 | výsl 407 | výtv 408 | výtvar 409 | význ 410 | včel 411 | vš 412 | všeob 413 | zahr 414 | zar 415 | zariad 416 | zast 417 | zastar 418 | zastaráv 419 | zb 420 | zdravot 421 | združ 422 | zjemn 423 | zlat 424 | zn 425 | Zn 426 | zool 427 | zr 428 | zried 429 | zv 430 | záhr 431 | zák 432 | zákl 433 | zám 434 | záp 435 | západoeur 436 | zázn 437 | územ 438 | účt 439 | čast 440 | čes 441 | Čes 442 | čl 443 | čísl 444 | živ 445 | pr 446 | fak 447 | Kr 448 | p.n.l 449 | A 450 | B 451 | C 452 | D 453 | E 454 | F 455 | G 456 | H 457 | I 458 | J 459 | K 460 | L 461 | M 462 | N 463 | O 464 | P 465 | Q 466 | R 467 | S 468 | T 469 | U 470 | V 471 | W 472 | X 473 | Y 474 | Z 475 | -------------------------------------------------------------------------------- /src/mosestokenizer/split-sentences.perl: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 5 | 6 | # Based on Preprocessor written by Philipp Koehn 7 | 8 | binmode(STDIN, ":utf8"); 9 | binmode(STDOUT, ":utf8"); 10 | binmode(STDERR, ":utf8"); 11 | 12 | use warnings; 13 | use FindBin qw($RealBin); 14 | use strict; 15 | 16 | my $mydir = "$RealBin/nonbreaking_prefixes"; 17 | 18 | my %NONBREAKING_PREFIX = (); 19 | my $language = "en"; 20 | my $QUIET = 0; 21 | my $HELP = 0; 22 | my $MORE = 0; 23 | 24 | while (@ARGV) { 25 | $_ = shift; 26 | /^-l$/ && ($language = shift, next); 27 | /^-q$/ && ($QUIET = 1, next); 28 | /^-h$/ && ($HELP = 1, next); 29 | /^-m$/ && ($MORE = 1, next); 30 | /^-b$/ && ($|++, next); # no output buffering 31 | } 32 | 33 | if ($HELP) { 34 | print "Usage ./split-sentences.perl (-l [en|de|...]) [-q] [-b] < textfile > splitfile\n"; 35 | print "-q: quiet mode\n"; 36 | print "-b: no output buffering (for use in bidirectional pipes)\n"; 37 | exit; 38 | } 39 | if (!$QUIET) { 40 | print STDERR "Sentence Splitter v3\n"; 41 | print STDERR "Language: $language\n"; 42 | } 43 | 44 | my $prefixfile = "$mydir/nonbreaking_prefix.$language"; 45 | 46 | #default back to English if we don't have a language-specific prefix file 47 | if (!(-e $prefixfile)) { 48 | $prefixfile = "$mydir/nonbreaking_prefix.en"; 49 | print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n"; 50 | die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile); 51 | } 52 | 53 | if (-e "$prefixfile") { 54 | open(PREFIX, "<:utf8", "$prefixfile") or die "Cannot open: $!"; 55 | while (<PREFIX>) { 56 | my $item = $_; 57 | chomp($item); 58 | if (($item) && (substr($item,0,1) ne "#")) { 59 | if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) { 60 | $NONBREAKING_PREFIX{$1} = 2; 61 | } else { 62 | $NONBREAKING_PREFIX{$item} = 1; 63 | } 64 | } 65 | } 66 |
close(PREFIX); 67 | } 68 | 69 | ##loop text, add lines together until we get a blank line or a
<p>
70 | my $text = ""; 71 | while(<STDIN>) { 72 | chop; 73 | if (/^<.+>$/ || /^\s*$/) { 74 | #time to process this block, we've hit a blank or
<p>
75 | &do_it_for($text,$_); 76 | print "
<P>
\n" if (/^\s*$/ && $text); ##if we have text followed by
<P>
77 | $text = ""; 78 | } 79 | else { 80 | #append the text, with a space 81 | $text .= $_. " "; 82 | } 83 | } 84 | #do the leftover text 85 | &do_it_for($text,"") if $text; 86 | 87 | 88 | sub do_it_for { 89 | my($text,$markup) = @_; 90 | print &preprocess($text) if $text; 91 | print "$markup\n" if ($markup =~ /^<.+>$/); 92 | #chop($text); 93 | } 94 | 95 | sub preprocess { 96 | #this is one paragraph 97 | my($text) = @_; 98 | 99 | # clean up spaces at head and tail of each line as well as any double-spacing 100 | $text =~ s/ +/ /g; 101 | $text =~ s/\n /\n/g; 102 | $text =~ s/ \n/\n/g; 103 | $text =~ s/^ //g; 104 | $text =~ s/ $//g; 105 | 106 | #####add sentence breaks as needed##### 107 | 108 | if ($MORE) { 109 | #colon and semi-colon may be considered sentence breakers 110 | $text =~ s/([\:;])/$1\n/g; 111 | } 112 | 113 | #non-period end of sentence markers (?!) followed by sentence starters. 114 | $text =~ s/([?!]) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g; 115 | 116 | #multi-dots followed by sentence starters 117 | $text =~ s/(\.[\.]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g; 118 | 119 | # add breaks for sentences that end with some sort of punctuation inside a quote or parenthetical and are followed by a possible sentence starter punctuation and upper case 120 | $text =~ s/([?!\.][\ ]*[\'\"\)\]\p{IsPf}]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\ ]*[\p{IsUpper}])/$1\n$2/g; 121 | 122 | # add breaks for sentences that end with some sort of punctuation are followed by a sentence starter punctuation and upper case 123 | $text =~ s/([?!\.]) +([\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[\p{IsUpper}])/$1\n$2/g; 124 | 125 | # special punctuation cases are covered. Check all remaining periods. 
126 | my $word; 127 | my $i; 128 | my @words = split(/ /,$text); 129 | $text = ""; 130 | for ($i=0;$i<(scalar(@words)-1);$i++) { 131 | if ($words[$i] =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/) { 132 | #check if $1 is a known honorific and $2 is empty, never break 133 | my $prefix = $1; 134 | my $starting_punct = $2; 135 | if($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 1 && !$starting_punct) { 136 | #not breaking; 137 | } elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) { 138 | #not breaking - upper case acronym 139 | } elsif($words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/) { 140 | #the next word has a bunch of initial quotes, maybe a space, then either upper case or a number 141 | $words[$i] = $words[$i]."\n" unless ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 2 && !$starting_punct && ($words[$i+1] =~ /^[0-9]+/)); 142 | #we always add a return for these unless we have a numeric non-breaker and a number start 143 | } 144 | 145 | } 146 | $text = $text.$words[$i]." "; 147 | } 148 | 149 | #we stopped one token from the end to allow for easy look-ahead. Append it now. 150 | $text = $text.$words[$i]; 151 | 152 | # clean up spaces at head and tail of each line as well as any double-spacing 153 | $text =~ s/ +/ /g; 154 | $text =~ s/\n /\n/g; 155 | $text =~ s/ \n/\n/g; 156 | $text =~ s/^ //g; 157 | $text =~ s/ $//g; 158 | 159 | #add trailing break 160 | $text .= "\n" unless $text =~ /\n$/; 161 | 162 | return $text; 163 | 164 | } 165 | -------------------------------------------------------------------------------- /src/mosestokenizer/sentsplitter.py: -------------------------------------------------------------------------------- 1 | """ 2 | A module for interfacing with ``split-sentences.perl`` from Moses toolkit. 
3 | 4 | Copyright © 2016-2017, Luís Gomes 5 | """ 6 | 7 | usage = """ 8 | Usage: 9 | moses-sentence-splitter [options] <lang> [<inputfile> [<outputfile>]] 10 | moses-sentence-splitter --selftest [--verbose] 11 | 12 | Options: 13 | --selftest, -t Run selftests. 14 | --verbose, -v Be more verbose. 15 | --unwrap, -u Assume that the text is wrapped and try to unwrap it. 16 | Note that this option will cause all consecutive non-empty 17 | lines to be buffered in memory. If you give this option 18 | make sure that you have empty lines separating paragraphs. 19 | When this option is not given, each line is assumed to be 20 | an independent paragraph or sentence and thus will not be 21 | joined with other lines. 22 | --more Also split on colons and semi-colons. 23 | --even-more Also split on extra unicode characters. 24 | 25 | 2016, Luís Gomes 26 | """ 27 | 28 | 29 | import re 30 | import sys 31 | from os import path 32 | 33 | from docopt import docopt 34 | from openfile import openfile 35 | 36 | from toolwrapper import ToolWrapper 37 | from ucenum import ucenum 38 | from ucinfo import ucinfo 39 | 40 | 41 | UNICODE_TERMINATORS = "".join( 42 | c.printable for c in map(ucinfo, ucenum('P')) 43 | if c.printable != '.' and c.name.endswith("FULL STOP") 44 | or "INVERTED" not in c.name and 45 | ("QUESTION MARK" in c.name or "EXCLAMATION MARK" in c.name) 46 | ) 47 | 48 | 49 | class MosesSentenceSplitter(ToolWrapper): 50 | """ 51 | A class for interfacing with ``split-sentences.perl`` from Moses toolkit. 52 | 53 | This class communicates with split-sentences.perl process via pipes. When 54 | the MosesSentenceSplitter object is no longer needed, the close() method 55 | should be called to free system resources. The class supports the context 56 | manager interface. If used in a with statement, the close() method is 57 | invoked automatically. 58 | 59 | When attribute ``more`` is True, colons and semi-colons are considered 60 | sentence separators.
61 | 62 | When attribute ``even_more`` is True, all unicode full stop characters, 63 | exclamation marks and question marks are considered sentence separators. 64 | Note: this option is not available in the original Moses Tokenizer. 65 | 66 | >>> split_sents = MosesSentenceSplitter('en') 67 | >>> split_sents(['Hello World! Hello', 'again.']) 68 | ['Hello World!', 'Hello again.'] 69 | 70 | """ 71 | 72 | def __init__(self, lang="en", more=True, even_more=False): 73 | self.lang = lang 74 | program = path.join( 75 | path.dirname(__file__), 76 | "split-sentences.perl" 77 | ) 78 | argv = ["perl", program, "-q", "-b", "-l", self.lang] 79 | if more: 80 | argv.append("-m") 81 | self.even_more = even_more 82 | super().__init__(argv) 83 | 84 | def __str__(self): 85 | return "MosesSentenceSplitter(lang=\"{lang}\")".format(lang=self.lang) 86 | 87 | def __call__(self, paragraph): 88 | """Splits sentences within a paragraph. 89 | The paragraph is a list of non-empty lines. XML-like tags are not 90 | allowed. 91 | """ 92 | assert isinstance(paragraph, (list, tuple)) 93 | if not paragraph: # empty paragraph is OK 94 | return [] 95 | assert all(isinstance(line, str) for line in paragraph) 96 | paragraph = [line.strip() for line in paragraph] 97 | assert all(paragraph), "blank lines are not allowed" 98 | for line in paragraph: 99 | self.writeline(line) 100 | self.writeline("
<P>
") 101 | sentences = [] 102 | while True: 103 | sentence = self.readline().strip() 104 | if sentence == "
<P>
": 105 | break 106 | sentences.append(sentence) 107 | if self.even_more: 108 | sentences = MosesSentenceSplitter._split_even_more(sentences) 109 | return sentences 110 | 111 | @staticmethod 112 | def _split_even_more(sentences): 113 | result = [] 114 | for s in sentences: 115 | parts = re.split(f'([{UNICODE_TERMINATORS}])', s) 116 | if len(parts) == 1: 117 | result.append(s) 118 | else: 119 | for new_s, term in zip(parts[:-1:2], parts[1::2]): 120 | result.append(new_s + term) 121 | return result 122 | 123 | 124 | def read_paragraphs(inputfile, wrapped=True): 125 | lines = map(str.strip, inputfile) 126 | if wrapped: 127 | paragraph = [] 128 | for line in lines: 129 | if line: 130 | paragraph.append(line) 131 | elif paragraph: 132 | yield paragraph 133 | paragraph = [] 134 | if paragraph: 135 | yield paragraph 136 | else: 137 | for line in lines: 138 | yield [line] if line else [] 139 | 140 | 141 | def write_paragraphs(paragraphs, outputfile, blank_sep=True): 142 | for paragraph in paragraphs: 143 | for sentence in paragraph: 144 | print(sentence, file=outputfile) 145 | if blank_sep or not paragraph: 146 | print(file=outputfile) # paragraph separator 147 | 148 | 149 | def main(): 150 | args = docopt(usage) 151 | if args["--selftest"]: 152 | import doctest 153 | import mosestokenizer.sentsplitter 154 | doctest.testmod(mosestokenizer.sentsplitter) 155 | if not args[""]: 156 | sys.exit(0) 157 | split_sents = MosesSentenceSplitter(args[""], more=args["--more"], 158 | even_more=args["--even-more"]) 159 | inputfile = openfile(args[""]) 160 | outputfile = openfile(args[""], "wt") 161 | with inputfile, outputfile: 162 | paragraphs = read_paragraphs(inputfile, wrapped=args["--unwrap"]) 163 | paragraphs = map(split_sents, paragraphs) 164 | write_paragraphs(paragraphs, outputfile, blank_sep=args["--unwrap"]) 165 | 166 | 167 | if __name__ == "__main__": 168 | main() 169 | -------------------------------------------------------------------------------- 
/src/mosestokenizer/detokenizer.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # $Id: detokenizer.perl 4134 2011-08-08 15:30:54Z bgottesman $ 4 | # Sample De-Tokenizer 5 | # written by Josh Schroeder, based on code by Philipp Koehn 6 | # further modifications by Ondrej Bojar 7 | # 8 | # This file is part of moses. Its use is licensed under the GNU Lesser General 9 | # Public License version 2.1 or, at your option, any later version. 10 | 11 | binmode(STDIN, ":utf8"); 12 | binmode(STDOUT, ":utf8"); 13 | 14 | use warnings; 15 | use strict; 16 | use utf8; # tell perl this script file is in UTF-8 (see all funny punct below) 17 | 18 | my $language = "en"; 19 | my $QUIET = 0; 20 | my $HELP = 0; 21 | my $UPPERCASE_SENT = 0; 22 | my $PENN = 0; 23 | 24 | while (@ARGV) { 25 | $_ = shift; 26 | /^-b$/ && ($| = 1, next); 27 | /^-l$/ && ($language = shift, next); 28 | /^-q$/ && ($QUIET = 1, next); 29 | /^-h$/ && ($HELP = 1, next); 30 | /^-u$/ && ($UPPERCASE_SENT = 1, next); 31 | /^-penn$/ && ($PENN = 1, next); 32 | } 33 | 34 | if ($HELP) { 35 | print "Usage ./detokenizer.perl (-l [en|fr|it|cs|...]) < tokenizedfile > detokenizedfile\n"; 36 | print "Options:\n"; 37 | print " -u ... uppercase the first char in the final sentence.\n"; 38 | print " -q ... don't report detokenizer revision.\n"; 39 | print " -b ... disable Perl buffering.\n"; 40 | print " -penn ... 
assume input is tokenized as per tokenizer.perl's -penn option.\n"; 41 | exit; 42 | } 43 | 44 | if ($language !~ /^(cs|en|fr|it|fi)$/) { 45 | print STDERR "Warning: No built-in rules for language $language.\n" 46 | } 47 | 48 | if ($PENN && $language ne "en") { 49 | print STDERR "Error: -penn option only supported for English text.\n"; 50 | exit; 51 | } 52 | 53 | if (!$QUIET) { 54 | print STDERR "Detokenizer Version ".'$Revision: 4134 $'."\n"; 55 | print STDERR "Language: $language\n"; 56 | } 57 | 58 | while(<STDIN>) { 59 | if (/^<.+>$/ || /^\s*$/) { 60 | #don't try to detokenize XML/HTML tag lines 61 | print $_; 62 | } elsif ($PENN) { 63 | print &detokenize_penn($_); 64 | } else { 65 | print &detokenize($_); 66 | } 67 | } 68 | 69 | 70 | sub ucsecondarg { 71 | # uppercase the second argument 72 | my $arg1 = shift; 73 | my $arg2 = shift; 74 | return $arg1.uc($arg2); 75 | } 76 | 77 | sub deescape { 78 | # de-escape special chars 79 | my ($text) = @_; 80 | $text =~ s/\&bar;/\|/g; # factor separator (legacy) 81 | $text =~ s/\&#124;/\|/g; # factor separator 82 | $text =~ s/\&lt;/\</g; # xml 83 | $text =~ s/\&gt;/\>/g; # xml 84 | $text =~ s/\&bra;/\[/g; # syntax non-terminal (legacy) 85 | $text =~ s/\&ket;/\]/g; # syntax non-terminal (legacy) 86 | $text =~ s/\&quot;/\"/g; # xml 87 | $text =~ s/\&apos;/\'/g; # xml 88 | $text =~ s/\&#91;/\[/g; # syntax non-terminal 89 | $text =~ s/\&#93;/\]/g; # syntax non-terminal 90 | $text =~ s/\&amp;/\&/g; # escape escape 91 | return $text; 92 | } 93 | 94 | sub detokenize { 95 | my($text) = @_; 96 | chomp($text); 97 | $text = " $text "; 98 | $text =~ s/ \@\-\@ /-/g; 99 | $text = &deescape($text); 100 | 101 | my $word; 102 | my $i; 103 | my @words = split(/ /,$text); 104 | $text = ""; 105 | my %quoteCount = ("\'"=>0,"\""=>0); 106 | my $prependSpace = " "; 107 | for ($i=0;$i<(scalar(@words));$i++) { 108 | if (&startsWithCJKChar($words[$i])) { 109 | if ($i > 0 && &endsWithCJKChar($words[$i-1])) { 110 | # perform left shift if this is a second consecutive CJK (Chinese/Japanese/Korean) word 111 |
$text=$text.$words[$i]; 112 | } else { 113 | # ... but do nothing special if this is a CJK word that doesn't follow a CJK word 114 | $text=$text.$prependSpace.$words[$i]; 115 | } 116 | $prependSpace = " "; 117 | } elsif ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) { 118 | #perform right shift on currency and other random punctuation items 119 | $text = $text.$prependSpace.$words[$i]; 120 | $prependSpace = ""; 121 | } elsif ($words[$i] =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/){ 122 | if (($language eq "fr") && ($words[$i] =~ /^[\?\!\:\;\\\%]$/)) { 123 | #these punctuations are prefixed with a non-breakable space in french 124 | $text .= " "; } 125 | #perform left shift on punctuation items 126 | $text=$text.$words[$i]; 127 | $prependSpace = " "; 128 | } elsif (($language eq "en") && ($i>0) && ($words[$i] =~ /^[\'][\p{IsAlpha}]/) && ($words[$i-1] =~ /[\p{IsAlnum}]$/)) { 129 | #left-shift the contraction for English 130 | $text=$text.$words[$i]; 131 | $prependSpace = " "; 132 | } elsif (($language eq "cs") && ($i>1) && ($words[$i-2] =~ /^[0-9]+$/) && ($words[$i-1] =~ /^[.,]$/) && ($words[$i] =~ /^[0-9]+$/)) { 133 | #left-shift floats in Czech 134 | $text=$text.$words[$i]; 135 | $prependSpace = " "; 136 | } elsif ((($language eq "fr") ||($language eq "it")) && ($i<=(scalar(@words)-2)) && ($words[$i] =~ /[\p{IsAlpha}][\']$/) && ($words[$i+1] =~ /^[\p{IsAlpha}]/)) { 137 | #right-shift the contraction for French and Italian 138 | $text = $text.$prependSpace.$words[$i]; 139 | $prependSpace = ""; 140 | } elsif (($language eq "cs") && ($i<(scalar(@words)-3)) 141 | && ($words[$i] =~ /[\p{IsAlpha}]$/) 142 | && ($words[$i+1] =~ /^[-–]$/) 143 | && ($words[$i+2] =~ /^li$|^mail.*/i) 144 | ) { 145 | #right-shift "-li" in Czech and a few Czech dashed words (e-mail) 146 | $text = $text.$prependSpace.$words[$i].$words[$i+1]; 147 | $i++; # advance over the dash 148 | $prependSpace = ""; 149 | } elsif ($words[$i] =~ /^[\'\"„“`]+$/) { 150 | #combine punctuation smartly 151 | my $normalized_quo = 
$words[$i]; 152 | $normalized_quo = '"' if $words[$i] =~ /^[„“”]+$/; 153 | $quoteCount{$normalized_quo} = 0 154 | if !defined $quoteCount{$normalized_quo}; 155 | if ($language eq "cs" && $words[$i] eq "„") { 156 | # this is always the starting quote in Czech 157 | $quoteCount{$normalized_quo} = 0; 158 | } 159 | if ($language eq "cs" && $words[$i] eq "“") { 160 | # this is usually the ending quote in Czech 161 | $quoteCount{$normalized_quo} = 1; 162 | } 163 | if (($quoteCount{$normalized_quo} % 2) eq 0) { 164 | if(($language eq "en") && ($words[$i] eq "'") && ($i > 0) && ($words[$i-1] =~ /[s]$/)) { 165 | #single quote for posesssives ending in s... "The Jones' house" 166 | #left shift 167 | $text=$text.$words[$i]; 168 | $prependSpace = " "; 169 | } else { 170 | #right shift 171 | $text = $text.$prependSpace.$words[$i]; 172 | $prependSpace = ""; 173 | $quoteCount{$normalized_quo} ++; 174 | 175 | } 176 | } else { 177 | #left shift 178 | $text=$text.$words[$i]; 179 | $prependSpace = " "; 180 | $quoteCount{$normalized_quo} ++; 181 | 182 | } 183 | 184 | } elsif (($language eq "fi") && ($words[$i-1] =~ /:$/) && ($words[$i] =~ /^(N|n|A|a|Ä|ä|ssa|Ssa|ssä|Ssä|sta|stä|Sta|Stä|hun|Hun|hyn|Hyn|han|Han|hän|Hän|hön|Hön|un|Un|yn|Yn|an|An|än|Än|ön|Ön|seen|Seen|lla|Lla|llä|Llä|lta|Lta|ltä|Ltä|lle|Lle|ksi|Ksi|kse|Kse|tta|Tta|ine|Ine)(ni|si|mme|nne|nsa)?(ko|kö|han|hän|pa|pä|kaan|kään|kin)?$/)) { 185 | # Finnish : without intervening space if followed by case suffix 186 | # EU:N EU:n EU:ssa EU:sta EU:hun EU:iin ... 187 | $text=$text. 
lc $words[$i]; 188 | $prependSpace = " "; 189 | } else { 190 | $text=$text.$prependSpace.$words[$i]; 191 | $prependSpace = " "; 192 | } 193 | } 194 | 195 | # clean up spaces at head and tail of each line as well as any double-spacing 196 | $text =~ s/ +/ /g; 197 | $text =~ s/\n /\n/g; 198 | $text =~ s/ \n/\n/g; 199 | $text =~ s/^ //g; 200 | $text =~ s/ $//g; 201 | 202 | #add trailing break 203 | $text .= "\n" unless $text =~ /\n$/; 204 | 205 | $text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT; 206 | 207 | return $text; 208 | } 209 | 210 | sub detokenize_penn { 211 | my($text) = @_; 212 | 213 | chomp($text); 214 | $text = " $text "; 215 | $text =~ s/ \@\-\@ /-/g; 216 | $text =~ s/ \@\/\@ /\//g; 217 | $text = &deescape($text); 218 | 219 | # merge de-contracted forms except where the second word begins with an 220 | # apostrophe (those are handled later) 221 | $text =~ s/ n't /n't /g; 222 | $text =~ s/ N'T /N'T /g; 223 | $text =~ s/ ([Cc])an not / $1annot /g; 224 | $text =~ s/ ([Dd])' ye / $1'ye /g; 225 | $text =~ s/ ([Gg])im me / $1imme /g; 226 | $text =~ s/ ([Gg])on na / $1onna /g; 227 | $text =~ s/ ([Gg])ot ta / $1otta /g; 228 | $text =~ s/ ([Ll])em me / $1emme /g; 229 | $text =~ s/ '([Tt]) is / '$1is /g; 230 | $text =~ s/ '([Tt]) was / '$1was /g; 231 | $text =~ s/ ([Ww])an na / $1anna /g; 232 | 233 | # restore brackets 234 | $text =~ s/-LRB-/\(/g; 235 | $text =~ s/-RRB-/\)/g; 236 | $text =~ s/-LSB-/\[/g; 237 | $text =~ s/-RSB-/\]/g; 238 | $text =~ s/-LCB-/{/g; 239 | $text =~ s/-RCB-/}/g; 240 | 241 | my $i; 242 | my @words = split(/ /,$text); 243 | $text = ""; 244 | my $prependSpace = " "; 245 | for ($i=0;$i<(scalar(@words));$i++) { 246 | if ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) { 247 | # perform right shift on currency and other random punctuation items 248 | $text = $text.$prependSpace.$words[$i]; 249 | $prependSpace = ""; 250 | } elsif ($words[$i] =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/){ 251 | # perform left shift on punctuation 
items 252 | $text=$text.$words[$i]; 253 | $prependSpace = " "; 254 | } elsif (($i>0) && ($words[$i] =~ /^[\'][\p{IsAlpha}]/) && ($words[$i-1] =~ /[\p{IsAlnum}]$/)) { 255 | # left-shift the contraction 256 | $text=$text.$words[$i]; 257 | $prependSpace = " "; 258 | } elsif ($words[$i] eq "`") { # Assume that punctuation has been normalized and is one of `, ``, ', '' only 259 | # opening single quote: convert to straight quote and right-shift 260 | $text = $text.$prependSpace."\'"; 261 | $prependSpace = ""; 262 | } elsif ($words[$i] eq "``") { 263 | # opening double quote: convert to straight quote and right-shift 264 | $text = $text.$prependSpace."\""; 265 | $prependSpace = ""; 266 | } elsif ($words[$i] eq "\'") { 267 | # closing single quote: convert to straight quote and left shift 268 | $text = $text."\'"; 269 | $prependSpace = " "; 270 | } elsif ($words[$i] eq "\'\'") { 271 | # closing double quote: convert to straight quote and left shift 272 | $text = $text."\""; 273 | $prependSpace = " "; 274 | } else { 275 | $text = $text.$prependSpace.$words[$i]; 276 | $prependSpace = " "; 277 | } 278 | } 279 | 280 | # clean up spaces at head and tail of each line as well as any double-spacing 281 | $text =~ s/ +/ /g; 282 | $text =~ s/\n /\n/g; 283 | $text =~ s/ \n/\n/g; 284 | $text =~ s/^ //g; 285 | $text =~ s/ $//g; 286 | 287 | # add trailing break 288 | $text .= "\n" unless $text =~ /\n$/; 289 | 290 | $text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT; 291 | 292 | return $text; 293 | } 294 | 295 | sub startsWithCJKChar { 296 | my ($str) = @_; 297 | return 0 if length($str) == 0; 298 | my $firstChar = substr($str, 0, 1); 299 | return &charIsCJK($firstChar); 300 | } 301 | 302 | sub endsWithCJKChar { 303 | my ($str) = @_; 304 | return 0 if length($str) == 0; 305 | my $lastChar = substr($str, length($str)-1, 1); 306 | return &charIsCJK($lastChar); 307 | } 308 | 309 | # Given a string consisting of one character, returns true iff the character 
310 | # is a CJK (Chinese/Japanese/Korean) character 311 | sub charIsCJK { 312 | my ($char) = @_; 313 | # $char should be a string of length 1 314 | my $codepoint = &codepoint_dec($char); 315 | 316 | # The following is based on http://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane 317 | 318 | # Hangul Jamo (1100–11FF) 319 | return 1 if (&between_hexes($codepoint, '1100', '11FF')); 320 | 321 | # CJK Radicals Supplement (2E80–2EFF) 322 | # Kangxi Radicals (2F00–2FDF) 323 | # Ideographic Description Characters (2FF0–2FFF) 324 | # CJK Symbols and Punctuation (3000–303F) 325 | # Hiragana (3040–309F) 326 | # Katakana (30A0–30FF) 327 | # Bopomofo (3100–312F) 328 | # Hangul Compatibility Jamo (3130–318F) 329 | # Kanbun (3190–319F) 330 | # Bopomofo Extended (31A0–31BF) 331 | # CJK Strokes (31C0–31EF) 332 | # Katakana Phonetic Extensions (31F0–31FF) 333 | # Enclosed CJK Letters and Months (3200–32FF) 334 | # CJK Compatibility (3300–33FF) 335 | # CJK Unified Ideographs Extension A (3400–4DBF) 336 | # Yijing Hexagram Symbols (4DC0–4DFF) 337 | # CJK Unified Ideographs (4E00–9FFF) 338 | # Yi Syllables (A000–A48F) 339 | # Yi Radicals (A490–A4CF) 340 | return 1 if (&between_hexes($codepoint, '2E80', 'A4CF')); 341 | 342 | # Phags-pa (A840–A87F) 343 | return 1 if (&between_hexes($codepoint, 'A840', 'A87F')); 344 | 345 | # Hangul Syllables (AC00–D7AF) 346 | return 1 if (&between_hexes($codepoint, 'AC00', 'D7AF')); 347 | 348 | # CJK Compatibility Ideographs (F900–FAFF) 349 | return 1 if (&between_hexes($codepoint, 'F900', 'FAFF')); 350 | 351 | # CJK Compatibility Forms (FE30–FE4F) 352 | return 1 if (&between_hexes($codepoint, 'FE30', 'FE4F')); 353 | 354 | # Range U+FF65–FFDC encodes halfwidth forms, of Katakana and Hangul characters 355 | return 1 if (&between_hexes($codepoint, 'FF65', 'FFDC')); 356 | 357 | # Supplementary Ideographic Plane 20000–2FFFF 358 | return 1 if (&between_hexes($codepoint, '20000', '2FFFF')); 359 | 360 | return 0; 361 | } 362 | 363 | # 
Returns the code point of a Unicode char, represented as a decimal number 364 | sub codepoint_dec { 365 | if (my $char = shift) { 366 | return unpack('U0U*', $char); 367 | } 368 | } 369 | 370 | sub between_hexes { 371 | my ($num, $left, $right) = @_; 372 | return $num >= hex($left) && $num <= hex($right); 373 | } 374 | -------------------------------------------------------------------------------- /src/mosestokenizer/tokenizer-v1.1.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # Sample Tokenizer 4 | ### Version 1.1 5 | # written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn 6 | # Version 1.1 updates: 7 | # (1) add multithreading option "-threads NUM_THREADS" (default is 1); 8 | # (2) add a timing option "-time" to calculate the average speed of this tokenizer; 9 | # (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed); 10 | ### Version 1.0 11 | # $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $ 12 | # written by Josh Schroeder, based on code by Philipp Koehn 13 | 14 | binmode(STDIN, ":utf8"); 15 | binmode(STDOUT, ":utf8"); 16 | 17 | use FindBin qw($RealBin); 18 | use strict; 19 | use Time::HiRes; 20 | use Thread; 21 | 22 | my $mydir = "$RealBin/nonbreaking_prefixes"; 23 | 24 | my %NONBREAKING_PREFIX = (); 25 | my @protected_patterns = (); 26 | my $protected_patterns_file = ""; 27 | my $language = "en"; 28 | my $QUIET = 0; 29 | my $HELP = 0; 30 | my $AGGRESSIVE = 0; 31 | my $SKIP_XML = 0; 32 | my $TIMING = 0; 33 | my $NUM_THREADS = 1; 34 | my $NUM_SENTENCES_PER_THREAD = 2000; 35 | my $PENN = 0; 36 | my $NO_ESCAPING = 0; 37 | while (@ARGV) 38 | { 39 | $_ = shift; 40 | /^-b$/ && ($| = 1, next); 41 | /^-l$/ && ($language = shift, next); 42 | /^-q$/ && ($QUIET = 
1, next); 43 | /^-h$/ && ($HELP = 1, next); 44 | /^-x$/ && ($SKIP_XML = 1, next); 45 | /^-a$/ && ($AGGRESSIVE = 1, next); 46 | /^-time$/ && ($TIMING = 1, next); 47 | # Option to add list of regexps to be protected 48 | /^-protected/ && ($protected_patterns_file = shift, next); 49 | /^-threads$/ && ($NUM_THREADS = int(shift), next); 50 | /^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next); 51 | /^-penn$/ && ($PENN = 1, next); 52 | /^-no-escape/ && ($NO_ESCAPING = 1, next); 53 | } 54 | 55 | # for time calculation 56 | my $start_time; 57 | if ($TIMING) 58 | { 59 | $start_time = [ Time::HiRes::gettimeofday( ) ]; 60 | } 61 | 62 | # print help message 63 | if ($HELP) 64 | { 65 | print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n"; 66 | print "Options:\n"; 67 | print " -q ... quiet.\n"; 68 | print " -a ... aggressive hyphen splitting.\n"; 69 | print " -b ... disable Perl buffering.\n"; 70 | print " -time ... enable processing time calculation.\n"; 71 | print " -penn ... use Penn treebank-like tokenization.\n"; 72 | print " -protected FILE ... specify file with patterns to be protected in tokenisation.\n"; 73 | print " -no-escape ... 
don't perform HTML escaping on apostrophe, quotes, etc.\n"; 74 | exit; 75 | } 76 | 77 | if (!$QUIET) 78 | { 79 | print STDERR "Tokenizer Version 1.1\n"; 80 | print STDERR "Language: $language\n"; 81 | print STDERR "Number of threads: $NUM_THREADS\n"; 82 | } 83 | 84 | # load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes 85 | load_prefixes($language,\%NONBREAKING_PREFIX); 86 | 87 | if (scalar(%NONBREAKING_PREFIX) eq 0) 88 | { 89 | print STDERR "Warning: No known abbreviations for language '$language'\n"; 90 | } 91 | 92 | # Load protected patterns 93 | if ($protected_patterns_file) 94 | { 95 | open(PP,$protected_patterns_file) || die "Unable to open $protected_patterns_file"; 96 | while(<PP>) { 97 | chomp; 98 | push @protected_patterns, $_; 99 | } 100 | } 101 | 102 | my @batch_sentences = (); 103 | my @thread_list = (); 104 | my $count_sentences = 0; 105 | 106 | if ($NUM_THREADS > 1) 107 | {# multi-threading tokenization 108 | while(<STDIN>) 109 | { 110 | $count_sentences = $count_sentences + 1; 111 | push(@batch_sentences, $_); 112 | if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS)) 113 | { 114 | # assign each thread work 115 | for (my $i=0; $i<$NUM_THREADS; $i++) 116 | { 117 | my $start_index = $i*$NUM_SENTENCES_PER_THREAD; 118 | my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1; 119 | my @subbatch_sentences = @batch_sentences[$start_index..$end_index]; 120 | my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences; 121 | push(@thread_list, $new_thread); 122 | } 123 | foreach (@thread_list) 124 | { 125 | my $tokenized_list = $_->join; 126 | foreach (@$tokenized_list) 127 | { 128 | print $_; 129 | } 130 | } 131 | # reset for the new run 132 | @thread_list = (); 133 | @batch_sentences = (); 134 | } 135 | } 136 | # the last batch 137 | if (scalar(@batch_sentences)>0) 138 | { 139 | # assign each thread work 140 | for (my $i=0; $i<$NUM_THREADS; $i++) 141 | { 142 | my $start_index = 
$i*$NUM_SENTENCES_PER_THREAD; 143 | if ($start_index >= scalar(@batch_sentences)) 144 | { 145 | last; 146 | } 147 | my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1; 148 | if ($end_index >= scalar(@batch_sentences)) 149 | { 150 | $end_index = scalar(@batch_sentences)-1; 151 | } 152 | my @subbatch_sentences = @batch_sentences[$start_index..$end_index]; 153 | my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences; 154 | push(@thread_list, $new_thread); 155 | } 156 | foreach (@thread_list) 157 | { 158 | my $tokenized_list = $_->join; 159 | foreach (@$tokenized_list) 160 | { 161 | print $_; 162 | } 163 | } 164 | } 165 | } 166 | else 167 | {# single thread only 168 | while(<STDIN>) 169 | { 170 | if (($SKIP_XML && /^<.+>$/) || /^\s*$/) 171 | { 172 | #don't try to tokenize XML/HTML tag lines 173 | print $_; 174 | } 175 | else 176 | { 177 | print &tokenize($_); 178 | } 179 | } 180 | } 181 | 182 | if ($TIMING) 183 | { 184 | my $duration = Time::HiRes::tv_interval( $start_time ); 185 | print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n"); 186 | print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." 
milliseconds/line\n"); 187 | } 188 | 189 | ##################################################################################### 190 | # subroutines afterward 191 | 192 | # tokenize a batch of texts saved in an array 193 | # input: an array containing a batch of texts 194 | # return: another array containing a batch of tokenized texts for the input array 195 | sub tokenize_batch 196 | { 197 | my(@text_list) = @_; 198 | my(@tokenized_list) = (); 199 | foreach (@text_list) 200 | { 201 | if (($SKIP_XML && /^<.+>$/) || /^\s*$/) 202 | { 203 | #don't try to tokenize XML/HTML tag lines 204 | push(@tokenized_list, $_); 205 | } 206 | else 207 | { 208 | push(@tokenized_list, &tokenize($_)); 209 | } 210 | } 211 | return \@tokenized_list; 212 | } 213 | 214 | # the actual tokenize function which tokenizes one input string 215 | # input: one string 216 | # return: the tokenized string for the input string 217 | sub tokenize 218 | { 219 | my($text) = @_; 220 | 221 | if ($PENN) { 222 | return tokenize_penn($text); 223 | } 224 | 225 | chomp($text); 226 | $text = " $text "; 227 | 228 | # remove ASCII junk 229 | $text =~ s/\s+/ /g; 230 | $text =~ s/[\000-\037]//g; 231 | 232 | # Find protected patterns 233 | my @protected = (); 234 | foreach my $protected_pattern (@protected_patterns) { 235 | my $t = $text; 236 | while ($t =~ /($protected_pattern)(.*)$/) { 237 | push @protected, $1; 238 | $t = $2; 239 | } 240 | } 241 | 242 | for (my $i = 0; $i < scalar(@protected); ++$i) { 243 | my $subst = sprintf("THISISPROTECTED%.3d", $i); 244 | $text =~ s,\Q$protected[$i], $subst ,g; 245 | } 246 | $text =~ s/ +/ /g; 247 | $text =~ s/^ //g; 248 | $text =~ s/ $//g; 249 | 250 | # separate out all "other" special characters 251 | $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g; 252 | 253 | # aggressive hyphen splitting 254 | if ($AGGRESSIVE) 255 | { 256 | $text =~ s/([\p{IsAlnum}])\-(?=[\p{IsAlnum}])/$1 \@-\@ /g; 257 | } 258 | 259 | #multi-dots stay together 260 | $text =~ s/\.([\.]+)/ DOTMULTI$1/g; 
261 | while($text =~ /DOTMULTI\./) 262 | { 263 | $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g; 264 | $text =~ s/DOTMULTI\./DOTDOTMULTI/g; 265 | } 266 | 267 | # separate out "," except if within numbers (5,300) 268 | #$text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; 269 | 270 | # separate out "," except if within numbers (5,300) 271 | # previous "global" application skips some: A,B,C,D,E > A , B,C , D,E 272 | # first application uses up B so rule can't see B,C 273 | # two-step version here may create extra spaces but these are removed later 274 | # will also space digit,letter or letter,digit forms (redundant with next section) 275 | $text =~ s/([^\p{IsN}])[,]/$1 , /g; 276 | $text =~ s/[,]([^\p{IsN}])/ , $1/g; 277 | 278 | # separate , pre and post number 279 | #$text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; 280 | #$text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g; 281 | 282 | # turn ` into ' 283 | #$text =~ s/\`/\'/g; 284 | 285 | #turn '' into " 286 | #$text =~ s/\'\'/ \" /g; 287 | 288 | if ($language eq "en") 289 | { 290 | #split contractions right 291 | $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 292 | $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g; 293 | $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 294 | $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g; 295 | #special case for "1990's" 296 | $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g; 297 | } 298 | elsif (($language eq "fr") or ($language eq "it")) 299 | { 300 | #split contractions left 301 | $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 302 | $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g; 303 | $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 304 | $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g; 305 | } 306 | else 307 | { 308 | $text =~ s/\'/ \' /g; 309 | } 310 | 311 | #word token method 312 | my @words = split(/\s/,$text); 313 | $text = ""; 314 | for (my $i=0;$i<(scalar(@words));$i++) 315 | { 316 | my $word = $words[$i]; 317 | if 
( $word =~ /^(\S+)\.$/) 318 | { 319 | my $pre = $1; 320 | if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/))) 321 | { 322 | #no change 323 | } 324 | elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/))) 325 | { 326 | #no change 327 | } 328 | else 329 | { 330 | $word = $pre." ."; 331 | } 332 | } 333 | $text .= $word." "; 334 | } 335 | 336 | # clean up extraneous spaces 337 | $text =~ s/ +/ /g; 338 | $text =~ s/^ //g; 339 | $text =~ s/ $//g; 340 | 341 | #restore protected 342 | for (my $i = 0; $i < scalar(@protected); ++$i) { 343 | my $subst = sprintf("THISISPROTECTED%.3d", $i); 344 | $text =~ s/$subst/$protected[$i]/g; 345 | } 346 | 347 | #restore multi-dots 348 | while($text =~ /DOTDOTMULTI/) 349 | { 350 | $text =~ s/DOTDOTMULTI/DOTMULTI./g; 351 | } 352 | $text =~ s/DOTMULTI/./g; 353 | 354 | #escape special chars 355 | if ($NO_ESCAPING == 0) 356 | { 357 | $text =~ s/\&/\&amp;/g; # escape escape 358 | $text =~ s/\|/\&#124;/g; # factor separator 359 | $text =~ s/\</\&lt;/g; # xml 360 | $text =~ s/\>/\&gt;/g; # xml 361 | $text =~ s/\'/\&apos;/g; # xml 362 | $text =~ s/\"/\&quot;/g; # xml 363 | $text =~ s/\[/\&#91;/g; # syntax non-terminal 364 | $text =~ s/\]/\&#93;/g; # syntax non-terminal 365 | } 366 | 367 | #ensure final line break 368 | $text .= "\n" unless $text =~ /\n$/; 369 | 370 | return $text; 371 | } 372 | 373 | sub tokenize_penn 374 | { 375 | # Improved compatibility with Penn Treebank tokenization. Useful if 376 | # the text is to later be parsed with a PTB-trained parser. 377 | # 378 | # Adapted from Robert MacIntyre's sed script: 379 | # http://www.cis.upenn.edu/~treebank/tokenizer.sed 380 | 381 | my($text) = @_; 382 | chomp($text); 383 | 384 | # remove ASCII junk 385 | $text =~ s/\s+/ /g; 386 | $text =~ s/[\000-\037]//g; 387 | 388 | # attempt to get correct directional quotes 389 | $text =~ s/^``/`` /g; 390 | $text =~ s/^"/`` /g; 391 | $text =~ s/^`([^`])/` $1/g; 392 | $text =~ s/^'/` /g; 393 | $text =~ s/([ ([{<])"/$1 `` /g; 394 | $text =~ s/([ ([{<])``/$1 `` /g; 395 | $text =~ s/([ ([{<])`([^`])/$1 ` $2/g; 396 | $text =~ s/([ ([{<])'/$1 ` /g; 397 | # close quotes handled at end 398 | 399 | $text =~ s=\.\.\.= _ELLIPSIS_ =g; 400 | 401 | # separate out "," except if within numbers (5,300) 402 | $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; 403 | # separate , pre and post number 404 | $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; 405 | $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g; 406 | 407 | #$text =~ s=([;:@#\$%&\p{IsSc}])= $1 =g; 408 | $text =~ s=([;:@#\$%&\p{IsSc}\p{IsSo}])= $1 =g; 409 | 410 | # Separate out intra-token slashes. PTB tokenization doesn't do this, so 411 | # the tokens should be merged prior to parsing with a PTB-trained parser 412 | # (see syntax-hyphen-splitting.perl). 
413 | $text =~ s/([\p{IsAlnum}])\/([\p{IsAlnum}])/$1 \@\/\@ $2/g; 414 | 415 | # Assume sentence tokenization has been done first, so split FINAL periods 416 | # only. 417 | $text =~ s=([^.])([.])([\]\)}>"']*) ?$=$1 $2$3 =g; 418 | # however, we may as well split ALL question marks and exclamation points, 419 | # since they shouldn't have the abbrev.-marker ambiguity problem 420 | $text =~ s=([?!])= $1 =g; 421 | 422 | # parentheses, brackets, etc. 423 | $text =~ s=([\]\[\(\){}<>])= $1 =g; 424 | $text =~ s/\(/-LRB-/g; 425 | $text =~ s/\)/-RRB-/g; 426 | $text =~ s/\[/-LSB-/g; 427 | $text =~ s/\]/-RSB-/g; 428 | $text =~ s/{/-LCB-/g; 429 | $text =~ s/}/-RCB-/g; 430 | 431 | $text =~ s=--= -- =g; 432 | 433 | # First off, add a space to the beginning and end of each line, to reduce 434 | # necessary number of regexps. 435 | $text =~ s=$= =; 436 | $text =~ s=^= =; 437 | 438 | $text =~ s="= '' =g; 439 | # possessive or close-single-quote 440 | $text =~ s=([^'])' =$1 ' =g; 441 | # as in it's, I'm, we'd 442 | $text =~ s='([sSmMdD]) = '$1 =g; 443 | $text =~ s='ll = 'll =g; 444 | $text =~ s='re = 're =g; 445 | $text =~ s='ve = 've =g; 446 | $text =~ s=n't = n't =g; 447 | $text =~ s='LL = 'LL =g; 448 | $text =~ s='RE = 'RE =g; 449 | $text =~ s='VE = 'VE =g; 450 | $text =~ s=N'T = N'T =g; 451 | 452 | $text =~ s= ([Cc])annot = $1an not =g; 453 | $text =~ s= ([Dd])'ye = $1' ye =g; 454 | $text =~ s= ([Gg])imme = $1im me =g; 455 | $text =~ s= ([Gg])onna = $1on na =g; 456 | $text =~ s= ([Gg])otta = $1ot ta =g; 457 | $text =~ s= ([Ll])emme = $1em me =g; 458 | $text =~ s= ([Mm])ore'n = $1ore 'n =g; 459 | $text =~ s= '([Tt])is = '$1 is =g; 460 | $text =~ s= '([Tt])was = '$1 was =g; 461 | $text =~ s= ([Ww])anna = $1an na =g; 462 | 463 | #word token method 464 | my @words = split(/\s/,$text); 465 | $text = ""; 466 | for (my $i=0;$i<(scalar(@words));$i++) 467 | { 468 | my $word = $words[$i]; 469 | if ( $word =~ /^(\S+)\.$/) 470 | { 471 | my $pre = $1; 472 | if (($pre =~ /\./ && $pre =~ 
/\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/))) 473 | { 474 | #no change 475 | } 476 | elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/))) 477 | { 478 | #no change 479 | } 480 | else 481 | { 482 | $word = $pre." ."; 483 | } 484 | } 485 | $text .= $word." "; 486 | } 487 | 488 | # restore ellipses 489 | $text =~ s=_ELLIPSIS_=\.\.\.=g; 490 | 491 | # clean out extra spaces 492 | $text =~ s/ +/ /g; 493 | $text =~ s/^ //g; 494 | $text =~ s/ $//g; 495 | 496 | #escape special chars 497 | $text =~ s/\&/\&amp;/g; # escape escape 498 | $text =~ s/\|/\&#124;/g; # factor separator 499 | $text =~ s/\</\&lt;/g; # xml 500 | $text =~ s/\>/\&gt;/g; # xml 501 | $text =~ s/\'/\&apos;/g; # xml 502 | $text =~ s/\"/\&quot;/g; # xml 503 | $text =~ s/\[/\&#91;/g; # syntax non-terminal 504 | $text =~ s/\]/\&#93;/g; # syntax non-terminal 505 | 506 | #ensure final line break 507 | $text .= "\n" unless $text =~ /\n$/; 508 | 509 | return $text; 510 | } 511 | 512 | sub load_prefixes 513 | { 514 | my ($language, $PREFIX_REF) = @_; 515 | 516 | my $prefixfile = "$mydir/nonbreaking_prefix.$language"; 517 | 518 | #default back to English if we don't have a language-specific prefix file 519 | if (!(-e $prefixfile)) 520 | { 521 | $prefixfile = "$mydir/nonbreaking_prefix.en"; 522 | print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n"; 523 | die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile); 524 | } 525 | 526 | if (-e "$prefixfile") 527 | { 528 | open(PREFIX, "<:utf8", "$prefixfile"); 529 | while (<PREFIX>) 530 | { 531 | my $item = $_; 532 | chomp($item); 533 | if (($item) && (substr($item,0,1) ne "#")) 534 | { 535 | if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) 536 | { 537 | $PREFIX_REF->{$1} = 2; 538 | } 539 | else 540 | { 541 | $PREFIX_REF->{$item} = 1; 542 | } 543 | } 544 | } 545 | close(PREFIX); 546 | } 547 | } 548 | -------------------------------------------------------------------------------- /src/mosestokenizer/nonbreaking_prefixes/nonbreaking_prefix.el: -------------------------------------------------------------------------------- 1 | # Single letters in upper-case are usually abbreviations of names 2 | Α 3 | Β 4 | Γ 5 | Δ 6 | Ε 7 | Ζ 8 | Η 9 | Θ 10 | Ι 11 | Κ 12 | Λ 13 | Μ 14 | Ν 15 | Ξ 16 | Ο 17 | Π 18 | Ρ 19 | Σ 20 | Τ 21 | Υ 22 | Φ 23 | Χ 24 | Ψ 25 | Ω 26 | 27 | # Includes abbreviations for the Greek language compiled from various sources (Greek grammar books, Greek language related web content). 
28 | Άθαν 29 | Έγχρ 30 | Έκθ 31 | Έσδ 32 | Έφ 33 | Όμ 34 | Α΄Έσδρ 35 | Α΄Έσδ 36 | Α΄Βασ 37 | Α΄Θεσ 38 | Α΄Ιω 39 | Α΄Κορινθ 40 | Α΄Κορ 41 | Α΄Μακκ 42 | Α΄Μακ 43 | Α΄Πέτρ 44 | Α΄Πέτ 45 | Α΄Παραλ 46 | Α΄Πε 47 | Α΄Σαμ 48 | Α΄Τιμ 49 | Α΄Χρον 50 | Α΄Χρ 51 | Α.Β.Α 52 | Α.Β 53 | Α.Ε 54 | Α.Κ.Τ.Ο 55 | Αέθλ 56 | Αέτ 57 | Αίλ.Δ 58 | Αίλ.Τακτ 59 | Αίσ 60 | Αββακ 61 | Αβυδ 62 | Αβ 63 | Αγάκλ 64 | Αγάπ 65 | Αγάπ.Αμαρτ.Σ 66 | Αγάπ.Γεωπ 67 | Αγαθάγγ 68 | Αγαθήμ 69 | Αγαθιν 70 | Αγαθοκλ 71 | Αγαθρχ 72 | Αγαθ 73 | Αγαθ.Ιστ 74 | Αγαλλ 75 | Αγαπητ 76 | Αγγ 77 | Αγησ 78 | Αγλ 79 | Αγορ.Κ 80 | Αγρο.Κωδ 81 | Αγρ.Εξ 82 | Αγρ.Κ 83 | Αγ.Γρ 84 | Αδριαν 85 | Αδρ 86 | Αετ 87 | Αθάν 88 | Αθήν 89 | Αθήν.Επιγρ 90 | Αθήν.Επιτ 91 | Αθήν.Ιατρ 92 | Αθήν.Μηχ 93 | Αθανάσ 94 | Αθαν 95 | Αθηνί 96 | Αθηναγ 97 | Αθηνόδ 98 | Αθ 99 | Αθ.Αρχ 100 | Αιλ 101 | Αιλ.Επιστ 102 | Αιλ.ΖΙ 103 | Αιλ.ΠΙ 104 | Αιλ.απ 105 | Αιμιλ 106 | Αιν.Γαζ 107 | Αιν.Τακτ 108 | Αισχίν 109 | Αισχίν.Επιστ 110 | Αισχ 111 | Αισχ.Αγαμ 112 | Αισχ.Αγ 113 | Αισχ.Αλ 114 | Αισχ.Ελεγ 115 | Αισχ.Επτ.Θ 116 | Αισχ.Ευμ 117 | Αισχ.Ικέτ 118 | Αισχ.Ικ 119 | Αισχ.Περσ 120 | Αισχ.Προμ.Δεσμ 121 | Αισχ.Πρ 122 | Αισχ.Χοηφ 123 | Αισχ.Χο 124 | Αισχ.απ 125 | ΑιτΕ 126 | Αιτ 127 | Αλκ 128 | Αλχιας 129 | Αμ.Π.Ο 130 | Αμβ 131 | Αμμών 132 | Αμ. 
133 | Αν.Πειθ.Συμβ.Δικ 134 | Ανακρ 135 | Ανακ 136 | Αναμν.Τόμ 137 | Αναπλ 138 | Ανδ 139 | Ανθλγος 140 | Ανθστης 141 | Αντισθ 142 | Ανχης 143 | Αν 144 | Αποκ 145 | Απρ 146 | Απόδ 147 | Απόφ 148 | Απόφ.Νομ 149 | Απ 150 | Απ.Δαπ 151 | Απ.Διατ 152 | Απ.Επιστ 153 | Αριθ 154 | Αριστοτ 155 | Αριστοφ 156 | Αριστοφ.Όρν 157 | Αριστοφ.Αχ 158 | Αριστοφ.Βάτρ 159 | Αριστοφ.Ειρ 160 | Αριστοφ.Εκκλ 161 | Αριστοφ.Θεσμ 162 | Αριστοφ.Ιππ 163 | Αριστοφ.Λυσ 164 | Αριστοφ.Νεφ 165 | Αριστοφ.Πλ 166 | Αριστοφ.Σφ 167 | Αριστ 168 | Αριστ.Αθ.Πολ 169 | Αριστ.Αισθ 170 | Αριστ.Αν.Πρ 171 | Αριστ.Ζ.Ι 172 | Αριστ.Ηθ.Ευδ 173 | Αριστ.Ηθ.Νικ 174 | Αριστ.Κατ 175 | Αριστ.Μετ 176 | Αριστ.Πολ 177 | Αριστ.Φυσιογν 178 | Αριστ.Φυσ 179 | Αριστ.Ψυχ 180 | Αριστ.Ρητ 181 | Αρμεν 182 | Αρμ 183 | Αρχ.Εκ.Καν.Δ 184 | Αρχ.Ευβ.Μελ 185 | Αρχ.Ιδ.Δ 186 | Αρχ.Νομ 187 | Αρχ.Ν 188 | Αρχ.Π.Ε 189 | Αρ 190 | Αρ.Φορ.Μητρ 191 | Ασμ 192 | Ασμ.ασμ 193 | Αστ.Δ 194 | Αστ.Χρον 195 | Ασ 196 | Ατομ.Γνωμ 197 | Αυγ 198 | Αφρ 199 | Αχ.Νομ 200 | Α 201 | Α.Εγχ.Π 202 | Α.Κ.΄Υδρας 203 | Β΄Έσδρ 204 | Β΄Έσδ 205 | Β΄Βασ 206 | Β΄Θεσ 207 | Β΄Ιω 208 | Β΄Κορινθ 209 | Β΄Κορ 210 | Β΄Μακκ 211 | Β΄Μακ 212 | Β΄Πέτρ 213 | Β΄Πέτ 214 | Β΄Πέ 215 | Β΄Παραλ 216 | Β΄Σαμ 217 | Β΄Τιμ 218 | Β΄Χρον 219 | Β΄Χρ 220 | Β.Ι.Π.Ε 221 | Β.Κ.Τ 222 | Β.Κ.Ψ.Β 223 | Β.Μ 224 | Β.Ο.Α.Κ 225 | Β.Ο.Α 226 | Β.Ο.Δ 227 | Βίβλ 228 | Βαρ 229 | ΒεΘ 230 | Βι.Περ 231 | Βιπερ 232 | Βιργ 233 | Βλγ 234 | Βούλ 235 | Βρ 236 | Γ΄Βασ 237 | Γ΄Μακκ 238 | ΓΕΝμλ 239 | Γέν 240 | Γαλ 241 | Γεν 242 | Γλ 243 | Γν.Ν.Σ.Κρ 244 | Γνωμ 245 | Γν 246 | Γράμμ 247 | Γρηγ.Ναζ 248 | Γρηγ.Νύσ 249 | Γ Νοσ 250 | Γ' Ογκολ 251 | Γ.Ν 252 | Δ΄Βασ 253 | Δ.Β 254 | Δ.Δίκη 255 | Δ.Δίκ 256 | Δ.Ε.Σ 257 | Δ.Ε.Φ.Α 258 | Δ.Ε.Φ 259 | Δ.Εργ.Ν 260 | Δαμ 261 | Δαμ.μνημ.έργ 262 | Δαν 263 | Δασ.Κ 264 | Δεκ 265 | Δελτ.Δικ.Ε.Τ.Ε 266 | Δελτ.Νομ 267 | Δελτ.Συνδ.Α.Ε 268 | Δερμ 269 | Δευτ 270 | Δεύτ 271 | Δημοσθ 272 | Δημόκρ 273 | Δι.Δικ 274 | Διάτ 275 | Διαιτ.Απ 276 | Διαιτ 277 | Διαρκ.Στρατ 278 | Δικ 279 | Διοίκ.Πρωτ 280 | ΔιοικΔνη 281 | 
Διοικ.Εφ 282 | Διον.Αρ 283 | Διόρθ.Λαθ 284 | Δ.κ.Π 285 | Δνη 286 | Δν 287 | Δογμ.Όρος 288 | Δρ 289 | Δ.τ.Α 290 | Δτ 291 | ΔωδΝομ 292 | Δ.Περ 293 | Δ.Στρ 294 | ΕΔΠολ 295 | ΕΕυρΚ 296 | ΕΙΣ 297 | ΕΝαυτΔ 298 | ΕΣΑμΕΑ 299 | ΕΣΘ 300 | ΕΣυγκΔ 301 | ΕΤρΑξΧρΔ 302 | Ε.Φ.Ε.Τ 303 | Ε.Φ.Ι 304 | Ε.Φ.Ο.Επ.Α 305 | Εβδ 306 | Εβρ 307 | Εγκύκλ.Επιστ 308 | Εγκ 309 | Εε.Αιγ 310 | Εθν.Κ.Τ 311 | Εθν 312 | Ειδ.Δικ.Αγ.Κακ 313 | Εικ 314 | Ειρ.Αθ 315 | Ειρην.Αθ 316 | Ειρην 317 | Έλεγχ 318 | Ειρ 319 | Εισ.Α.Π 320 | Εισ.Ε 321 | Εισ.Ν.Α.Κ 322 | Εισ.Ν.Κ.Πολ.Δ 323 | Εισ.Πρωτ 324 | Εισηγ.Έκθ 325 | Εισ 326 | Εκκλ 327 | Εκκ 328 | Εκ 329 | Ελλ.Δνη 330 | Εν.Ε 331 | Εξ 332 | Επ.Αν 333 | Επ.Εργ.Δ 334 | Επ.Εφ 335 | Επ.Κυπ.Δ 336 | Επ.Μεσ.Αρχ 337 | Επ.Νομ 338 | Επίκτ 339 | Επίκ 340 | Επι.Δ.Ε 341 | Επιθ.Ναυτ.Δικ 342 | Επικ 343 | Επισκ.Ε.Δ 344 | Επισκ.Εμπ.Δικ 345 | Επιστ.Επετ.Αρμ 346 | Επιστ.Επετ 347 | Επιστ.Ιερ 348 | Επιτρ.Προστ.Συνδ.Στελ 349 | Επιφάν 350 | Επτ.Εφ 351 | Επ.Ιρ 352 | Επ.Ι 353 | Εργ.Ασφ.Νομ 354 | Ερμ.Α.Κ 355 | Ερμη.Σ 356 | Εσθ 357 | Εσπερ 358 | Ετρ.Δ 359 | Ευκλ 360 | Ευρ.Δ.Δ.Α 361 | Ευρ.Σ.Δ.Α 362 | Ευρ.ΣτΕ 363 | Ευρατόμ 364 | Ευρ.Άλκ 365 | Ευρ.Ανδρομ 366 | Ευρ.Βάκχ 367 | Ευρ.Εκ 368 | Ευρ.Ελ 369 | Ευρ.Ηλ 370 | Ευρ.Ηρακ 371 | Ευρ.Ηρ 372 | Ευρ.Ηρ.Μαιν 373 | Ευρ.Ικέτ 374 | Ευρ.Ιππόλ 375 | Ευρ.Ιφ.Α 376 | Ευρ.Ιφ.Τ 377 | Ευρ.Ι.Τ 378 | Ευρ.Κύκλ 379 | Ευρ.Μήδ 380 | Ευρ.Ορ 381 | Ευρ.Ρήσ 382 | Ευρ.Τρωάδ 383 | Ευρ.Φοίν 384 | Εφ.Αθ 385 | Εφ.Εν 386 | Εφ.Επ 387 | Εφ.Θρ 388 | Εφ.Θ 389 | Εφ.Ι 390 | Εφ.Κερ 391 | Εφ.Κρ 392 | Εφ.Λ 393 | Εφ.Ν 394 | Εφ.Πατ 395 | Εφ.Πειρ 396 | Εφαρμ.Δ.Δ 397 | Εφαρμ 398 | Εφεσ 399 | Εφημ 400 | Εφ 401 | Ζαχ 402 | Ζιγ 403 | Ζυ 404 | Ζχ 405 | ΗΕ.Δ 406 | Ημερ 407 | Ηράκλ 408 | Ηροδ 409 | Ησίοδ 410 | Ησ 411 | Η.Ε.Γ 412 | ΘΗΣ 413 | ΘΡ 414 | Θαλ 415 | Θεοδ 416 | Θεοφ 417 | Θεσ 418 | Θεόδ.Μοψ 419 | Θεόκρ 420 | Θεόφιλ 421 | Θουκ 422 | Θρ 423 | Θρ.Ε 424 | Θρ.Ιερ 425 | Θρ.Ιρ 426 | Ιακ 427 | Ιαν 428 | Ιβ 429 | Ιδθ 430 | Ιδ 431 | Ιεζ 432 | Ιερ 433 | Ιζ 434 | Ιησ 435 | Ιησ.Ν 436 | Ικ 437 | Ιλ 438 
| Ιν 439 | Ιουδ 440 | Ιουστ 441 | Ιούδα 442 | Ιούλ 443 | Ιούν 444 | Ιπποκρ 445 | Ιππόλ 446 | Ιρ 447 | Ισίδ.Πηλ 448 | Ισοκρ 449 | Ισ.Ν 450 | Ιωβ 451 | Ιωλ 452 | Ιων 453 | Ιω 454 | ΚΟΣ 455 | ΚΟ.ΜΕ.ΚΟΝ 456 | ΚΠοινΔ 457 | ΚΠολΔ 458 | ΚαΒ 459 | Καλ 460 | Καλ.Τέχν 461 | ΚανΒ 462 | Καν.Διαδ 463 | Κατάργ 464 | Κλ 465 | ΚοινΔ 466 | Κολσ 467 | Κολ 468 | Κον 469 | Κορ 470 | Κος 471 | ΚριτΕπιθ 472 | ΚριτΕ 473 | Κριτ 474 | Κρ 475 | ΚτΒ 476 | ΚτΕ 477 | ΚτΠ 478 | Κυβ 479 | Κυπρ 480 | Κύριλ.Αλεξ 481 | Κύριλ.Ιερ 482 | Λεβ 483 | Λεξ.Σουίδα 484 | Λευϊτ 485 | Λευ 486 | Λκ 487 | Λογ 488 | ΛουκΑμ 489 | Λουκιαν 490 | Λουκ.Έρωτ 491 | Λουκ.Ενάλ.Διάλ 492 | Λουκ.Ερμ 493 | Λουκ.Εταιρ.Διάλ 494 | Λουκ.Ε.Δ 495 | Λουκ.Θε.Δ 496 | Λουκ.Ικ. 497 | Λουκ.Ιππ 498 | Λουκ.Λεξιφ 499 | Λουκ.Μεν 500 | Λουκ.Μισθ.Συν 501 | Λουκ.Ορχ 502 | Λουκ.Περ 503 | Λουκ.Συρ 504 | Λουκ.Τοξ 505 | Λουκ.Τυρ 506 | Λουκ.Φιλοψ 507 | Λουκ.Φιλ 508 | Λουκ.Χάρ 509 | Λουκ. 510 | Λουκ.Αλ 511 | Λοχ 512 | Λυδ 513 | Λυκ 514 | Λυσ 515 | Λωζ 516 | Λ1 517 | Λ2 518 | ΜΟΕφ 519 | Μάρκ 520 | Μέν 521 | Μαλ 522 | Ματθ 523 | Μα 524 | Μιχ 525 | Μκ 526 | Μλ 527 | Μμ 528 | Μον.Δ.Π 529 | Μον.Πρωτ 530 | Μον 531 | Μρ 532 | Μτ 533 | Μχ 534 | Μ.Βασ 535 | Μ.Πλ 536 | ΝΑ 537 | Ναυτ.Χρον 538 | Να 539 | Νδικ 540 | Νεεμ 541 | Νε 542 | Νικ 543 | ΝκΦ 544 | Νμ 545 | ΝοΒ 546 | Νομ.Δελτ.Τρ.Ελ 547 | Νομ.Δελτ 548 | Νομ.Σ.Κ 549 | Νομ.Χρ 550 | Νομ 551 | Νομ.Διεύθ 552 | Νοσ 553 | Ντ 554 | Νόσων 555 | Ν1 556 | Ν2 557 | Ν3 558 | Ν4 559 | Νtot 560 | Ξενοφ 561 | Ξεν 562 | Ξεν.Ανάβ 563 | Ξεν.Απολ 564 | Ξεν.Απομν 565 | Ξεν.Απομ 566 | Ξεν.Ελλ 567 | Ξεν.Ιέρ 568 | Ξεν.Ιππαρχ 569 | Ξεν.Ιππ 570 | Ξεν.Κυρ.Αν 571 | Ξεν.Κύρ.Παιδ 572 | Ξεν.Κ.Π 573 | Ξεν.Λακ.Πολ 574 | Ξεν.Οικ 575 | Ξεν.Προσ 576 | Ξεν.Συμπόσ 577 | Ξεν.Συμπ 578 | Ο΄ 579 | Οβδ 580 | Οβ 581 | ΟικΕ 582 | Οικ 583 | Οικ.Πατρ 584 | Οικ.Σύν.Βατ 585 | Ολομ 586 | Ολ 587 | Ολ.Α.Π 588 | Ομ.Ιλ 589 | Ομ.Οδ 590 | ΟπΤοιχ 591 | Οράτ 592 | Ορθ 593 | ΠΡΟ.ΠΟ 594 | Πίνδ 595 | Πίνδ.Ι 596 | Πίνδ.Νεμ 597 | Πίνδ.Ν 598 | Πίνδ.Ολ 599 | Πίνδ.Παθ 600 
| Πίνδ.Πυθ 601 | Πίνδ.Π 602 | ΠαγΝμλγ 603 | Παν 604 | Παρμ 605 | Παροιμ 606 | Παρ 607 | Παυσ 608 | Πειθ.Συμβ 609 | ΠειρΝ 610 | Πελ 611 | ΠεντΣτρ 612 | Πεντ 613 | Πεντ.Εφ 614 | ΠερΔικ 615 | Περ.Γεν.Νοσ 616 | Πετ 617 | Πλάτ 618 | Πλάτ.Αλκ 619 | Πλάτ.Αντ 620 | Πλάτ.Αξίοχ 621 | Πλάτ.Απόλ 622 | Πλάτ.Γοργ 623 | Πλάτ.Ευθ 624 | Πλάτ.Θεαίτ 625 | Πλάτ.Κρατ 626 | Πλάτ.Κριτ 627 | Πλάτ.Λύσ 628 | Πλάτ.Μεν 629 | Πλάτ.Νόμ 630 | Πλάτ.Πολιτ 631 | Πλάτ.Πολ 632 | Πλάτ.Πρωτ 633 | Πλάτ.Σοφ. 634 | Πλάτ.Συμπ 635 | Πλάτ.Τίμ 636 | Πλάτ.Φαίδρ 637 | Πλάτ.Φιλ 638 | Πλημ 639 | Πλούτ 640 | Πλούτ.Άρατ 641 | Πλούτ.Αιμ 642 | Πλούτ.Αλέξ 643 | Πλούτ.Αλκ 644 | Πλούτ.Αντ 645 | Πλούτ.Αρτ 646 | Πλούτ.Ηθ 647 | Πλούτ.Θεμ 648 | Πλούτ.Κάμ 649 | Πλούτ.Καίσ 650 | Πλούτ.Κικ 651 | Πλούτ.Κράσ 652 | Πλούτ.Κ 653 | Πλούτ.Λυκ 654 | Πλούτ.Μάρκ 655 | Πλούτ.Μάρ 656 | Πλούτ.Περ 657 | Πλούτ.Ρωμ 658 | Πλούτ.Σύλλ 659 | Πλούτ.Φλαμ 660 | Πλ 661 | Ποιν.Δικ 662 | Ποιν.Δ 663 | Ποιν.Ν 664 | Ποιν.Χρον 665 | Ποιν.Χρ 666 | Πολ.Δ 667 | Πολ.Πρωτ 668 | Πολ 669 | Πολ.Μηχ 670 | Πολ.Μ 671 | Πρακτ.Αναθ 672 | Πρακτ.Ολ 673 | Πραξ 674 | Πρμ 675 | Πρξ 676 | Πρωτ 677 | Πρ 678 | Πρ.Αν 679 | Πρ.Λογ 680 | Πταισμ 681 | Πυρ.Καλ 682 | Πόλη 683 | Π.Δ 684 | Π.Δ.Άσμ 685 | ΡΜ.Ε 686 | Ρθ 687 | Ρμ 688 | Ρωμ 689 | ΣΠλημ 690 | Σαπφ 691 | Σειρ 692 | Σολ 693 | Σοφ 694 | Σοφ.Αντιγ 695 | Σοφ.Αντ 696 | Σοφ.Αποσ 697 | Σοφ.Απ 698 | Σοφ.Ηλέκ 699 | Σοφ.Ηλ 700 | Σοφ.Οιδ.Κολ 701 | Σοφ.Οιδ.Τύρ 702 | Σοφ.Ο.Τ 703 | Σοφ.Σειρ 704 | Σοφ.Σολ 705 | Σοφ.Τραχ 706 | Σοφ.Φιλοκτ 707 | Σρ 708 | Σ.τ.Ε 709 | Σ.τ.Π 710 | Στρ.Π.Κ 711 | Στ.Ευρ 712 | Συζήτ 713 | Συλλ.Νομολ 714 | Συλ.Νομ 715 | ΣυμβΕπιθ 716 | Συμπ.Ν 717 | Συνθ.Αμ 718 | Συνθ.Ε.Ε 719 | Συνθ.Ε.Κ 720 | Συνθ.Ν 721 | Σφν 722 | Σφ 723 | Σφ.Σλ 724 | Σχ.Πολ.Δ 725 | Σχ.Συντ.Ε 726 | Σωσ 727 | Σύντ 728 | Σ.Πληρ 729 | ΤΘ 730 | ΤΣ.Δ 731 | Τίτ 732 | Τβ 733 | Τελ.Ενημ 734 | Τελ.Κ 735 | Τερτυλ 736 | Τιμ 737 | Τοπ.Α 738 | Τρ.Ο 739 | Τριμ 740 | Τριμ.Πλ 741 | Τρ.Πλημ 742 | Τρ.Π.Δ 743 | Τ.τ.Ε 744 | Ττ 745 | Τωβ 746 | Υγ 747 | Υπερ 748 | Υπ 749 
| Υ.Γ 750 | Φιλήμ 751 | Φιλιπ 752 | Φιλ 753 | Φλμ 754 | Φλ 755 | Φορ.Β 756 | Φορ.Δ.Ε 757 | Φορ.Δνη 758 | Φορ.Δ 759 | Φορ.Επ 760 | Φώτ 761 | Χρ.Ι.Δ 762 | Χρ.Ιδ.Δ 763 | Χρ.Ο 764 | Χρυσ 765 | Ψήφ 766 | Ψαλμ 767 | Ψαλ 768 | Ψλ 769 | Ωριγ 770 | Ωσ 771 | Ω.Ρ.Λ 772 | άγν 773 | άγν.ετυμολ 774 | άγ 775 | άκλ 776 | άνθρ 777 | άπ 778 | άρθρ 779 | άρν 780 | άρ 781 | άτ 782 | άψ 783 | ά 784 | έκδ 785 | έκφρ 786 | έμψ 787 | ένθ.αν 788 | έτ 789 | έ.α 790 | ίδ 791 | αβεστ 792 | αβησσ 793 | αγγλ 794 | αγγ 795 | αδημ 796 | αεροναυτ 797 | αερον 798 | αεροπ 799 | αθλητ 800 | αθλ 801 | αθροιστ 802 | αιγυπτ 803 | αιγ 804 | αιτιολ 805 | αιτ 806 | αι 807 | ακαδ 808 | ακκαδ 809 | αλβ 810 | αλλ 811 | αλφαβητ 812 | αμα 813 | αμερικ 814 | αμερ 815 | αμετάβ 816 | αμτβ 817 | αμφιβ 818 | αμφισβ 819 | αμφ 820 | αμ 821 | ανάλ 822 | ανάπτ 823 | ανάτ 824 | αναβ 825 | αναδαν 826 | αναδιπλασ 827 | αναδιπλ 828 | αναδρ 829 | αναλ 830 | αναν 831 | ανασυλλ 832 | ανατολ 833 | ανατομ 834 | ανατυπ 835 | ανατ 836 | αναφορ 837 | αναφ 838 | ανα.ε 839 | ανδρων 840 | ανθρωπολ 841 | ανθρωπ 842 | ανθ 843 | ανομ 844 | αντίτ 845 | αντδ 846 | αντιγρ 847 | αντιθ 848 | αντικ 849 | αντιμετάθ 850 | αντων 851 | αντ 852 | ανωτ 853 | ανόργ 854 | ανών 855 | αορ 856 | απαρέμφ 857 | απαρφ 858 | απαρχ 859 | απαρ 860 | απλολ 861 | απλοπ 862 | αποβ 863 | αποηχηροπ 864 | αποθ 865 | αποκρυφ 866 | αποφ 867 | απρμφ 868 | απρφ 869 | απρόσ 870 | απόδ 871 | απόλ 872 | απόσπ 873 | απόφ 874 | αραβοτουρκ 875 | αραβ 876 | αραμ 877 | αρβαν 878 | αργκ 879 | αριθμτ 880 | αριθμ 881 | αριθ 882 | αρκτικόλ 883 | αρκ 884 | αρμεν 885 | αρμ 886 | αρνητ 887 | αρσ 888 | αρχαιολ 889 | αρχιτεκτ 890 | αρχιτ 891 | αρχκ 892 | αρχ 893 | αρωμουν 894 | αρωμ 895 | αρ 896 | αρ.μετρ 897 | αρ.φ 898 | ασσυρ 899 | αστρολ 900 | αστροναυτ 901 | αστρον 902 | αττ 903 | αυστραλ 904 | αυτοπ 905 | αυτ 906 | αφγαν 907 | αφηρ 908 | αφομ 909 | αφρικ 910 | αχώρ 911 | αόρ 912 | α.α 913 | α/α 914 | α0 915 | βαθμ 916 | βαθ 917 | βαπτ 918 | βασκ 919 | βεβαιωτ 920 | βεβ 921 | βεδ 
922 | βενετ 923 | βεν 924 | βερβερ 925 | βιβλγρ 926 | βιολ 927 | βιομ 928 | βιοχημ 929 | βιοχ 930 | βλάχ 931 | βλ 932 | βλ.λ 933 | βοταν 934 | βοτ 935 | βουλγαρ 936 | βουλγ 937 | βούλ 938 | βραζιλ 939 | βρετον 940 | βόρ 941 | γαλλ 942 | γενικότ 943 | γενοβ 944 | γεν 945 | γερμαν 946 | γερμ 947 | γεωγρ 948 | γεωλ 949 | γεωμετρ 950 | γεωμ 951 | γεωπ 952 | γεωργ 953 | γλυπτ 954 | γλωσσολ 955 | γλωσσ 956 | γλ 957 | γνμδ 958 | γνμ 959 | γνωμ 960 | γοτθ 961 | γραμμ 962 | γραμ 963 | γρμ 964 | γρ 965 | γυμν 966 | δίδες 967 | δίκ 968 | δίφθ 969 | δαν 970 | δεικτ 971 | δεκατ 972 | δηλ 973 | δημογρ 974 | δημοτ 975 | δημώδ 976 | δημ 977 | διάγρ 978 | διάκρ 979 | διάλεξ 980 | διάλ 981 | διάσπ 982 | διαλεκτ 983 | διατρ 984 | διαφ 985 | διαχ 986 | διδα 987 | διεθν 988 | διεθ 989 | δικον 990 | διστ 991 | δισύλλ 992 | δισ 993 | διφθογγοπ 994 | δογμ 995 | δολ 996 | δοτ 997 | δρμ 998 | δρχ 999 | δρ(α) 1000 | δωρ 1001 | δ 1002 | εβρ 1003 | εγκλπ 1004 | εδ 1005 | εθνολ 1006 | εθν 1007 | ειδικότ 1008 | ειδ 1009 | ειδ.β 1010 | εικ 1011 | ειρ 1012 | εισ 1013 | εκατοστμ 1014 | εκατοστ 1015 | εκατστ.2 1016 | εκατστ.3 1017 | εκατ 1018 | εκδ 1019 | εκκλησ 1020 | εκκλ 1021 | εκ 1022 | ελλην 1023 | ελλ 1024 | ελνστ 1025 | ελπ 1026 | εμβ 1027 | εμφ 1028 | εναλλ 1029 | ενδ 1030 | ενεργ 1031 | ενεστ 1032 | ενικ 1033 | ενν 1034 | εν 1035 | εξέλ 1036 | εξακολ 1037 | εξομάλ 1038 | εξ 1039 | εο 1040 | επέκτ 1041 | επίδρ 1042 | επίθ 1043 | επίρρ 1044 | επίσ 1045 | επαγγελμ 1046 | επανάλ 1047 | επανέκδ 1048 | επιθ 1049 | επικ 1050 | επιμ 1051 | επιρρ 1052 | επιστ 1053 | επιτατ 1054 | επιφ 1055 | επών 1056 | επ 1057 | εργ 1058 | ερμ 1059 | ερρινοπ 1060 | ερωτ 1061 | ετρουσκ 1062 | ετυμ 1063 | ετ 1064 | ευφ 1065 | ευχετ 1066 | εφ 1067 | εύχρ 1068 | ε.α 1069 | ε/υ 1070 | ε0 1071 | ζωγρ 1072 | ζωολ 1073 | ηθικ 1074 | ηθ 1075 | ηλεκτρολ 1076 | ηλεκτρον 1077 | ηλεκτρ 1078 | ημίτ 1079 | ημίφ 1080 | ημιφ 1081 | ηχηροπ 1082 | ηχηρ 1083 | ηχομιμ 1084 | ηχ 1085 | η 1086 | θέατρ 1087 | θεολ 1088 | θετ 1089 | θηλ 
1090 | θρακ 1091 | θρησκειολ 1092 | θρησκ 1093 | θ 1094 | ιαπων 1095 | ιατρ 1096 | ιδιωμ 1097 | ιδ 1098 | ινδ 1099 | ιραν 1100 | ισπαν 1101 | ιστορ 1102 | ιστ 1103 | ισχυροπ 1104 | ιταλ 1105 | ιχθυολ 1106 | ιων 1107 | κάτ 1108 | καθ 1109 | κακοσ 1110 | καν 1111 | καρ 1112 | κατάλ 1113 | κατατ 1114 | κατωτ 1115 | κατ 1116 | κα 1117 | κελτ 1118 | κεφ 1119 | κινεζ 1120 | κινημ 1121 | κλητ 1122 | κλιτ 1123 | κλπ 1124 | κλ 1125 | κν 1126 | κοινωνιολ 1127 | κοινων 1128 | κοπτ 1129 | κουτσοβλαχ 1130 | κουτσοβλ 1131 | κπ 1132 | κρ.γν 1133 | κτγ 1134 | κτην 1135 | κτητ 1136 | κτλ 1137 | κτ 1138 | κυριολ 1139 | κυρ 1140 | κύρ 1141 | κ 1142 | κ.ά 1143 | κ.ά.π 1144 | κ.α 1145 | κ.εξ 1146 | κ.επ 1147 | κ.ε 1148 | κ.λπ 1149 | κ.λ.π 1150 | κ.ού.κ 1151 | κ.ο.κ 1152 | κ.τ.λ 1153 | κ.τ.τ 1154 | κ.τ.ό 1155 | λέξ 1156 | λαογρ 1157 | λαπ 1158 | λατιν 1159 | λατ 1160 | λαϊκότρ 1161 | λαϊκ 1162 | λετ 1163 | λιθ 1164 | λογιστ 1165 | λογοτ 1166 | λογ 1167 | λουβ 1168 | λυδ 1169 | λόγ 1170 | λ 1171 | λ.χ 1172 | μέλλ 1173 | μέσ 1174 | μαθημ 1175 | μαθ 1176 | μαιευτ 1177 | μαλαισ 1178 | μαλτ 1179 | μαμμων 1180 | μεγεθ 1181 | μεε 1182 | μειωτ 1183 | μελ 1184 | μεξ 1185 | μεσν 1186 | μεσογ 1187 | μεσοπαθ 1188 | μεσοφ 1189 | μετάθ 1190 | μεταβτ 1191 | μεταβ 1192 | μετακ 1193 | μεταπλ 1194 | μεταπτωτ 1195 | μεταρ 1196 | μεταφορ 1197 | μετβ 1198 | μετεπιθ 1199 | μετεπιρρ 1200 | μετεωρολ 1201 | μετεωρ 1202 | μετον 1203 | μετουσ 1204 | μετοχ 1205 | μετρ 1206 | μετ 1207 | μητρων 1208 | μηχανολ 1209 | μηχ 1210 | μικροβιολ 1211 | μογγολ 1212 | μορφολ 1213 | μουσ 1214 | μπενελούξ 1215 | μσνλατ 1216 | μσν 1217 | μτβ 1218 | μτγν 1219 | μτγ 1220 | μτφρδ 1221 | μτφρ 1222 | μτφ 1223 | μτχ 1224 | μυθ 1225 | μυκην 1226 | μυκ 1227 | μφ 1228 | μ 1229 | μ.ε 1230 | μ.μ 1231 | μ.π.ε 1232 | μ.π.π 1233 | μ0 1234 | ναυτ 1235 | νεοελλ 1236 | νεολατιν 1237 | νεολατ 1238 | νεολ 1239 | νεότ 1240 | νλατ 1241 | νομ 1242 | νορβ 1243 | νοσ 1244 | νότ 1245 | ν 1246 | ξ.λ 1247 | οικοδ 1248 | οικολ 1249 | οικον 1250 | οικ 1251 | 
ολλανδ 1252 | ολλ 1253 | ομηρ 1254 | ομόρρ 1255 | ονομ 1256 | ον 1257 | οπτ 1258 | ορθογρ 1259 | ορθ 1260 | οριστ 1261 | ορυκτολ 1262 | ορυκτ 1263 | ορ 1264 | οσετ 1265 | οσκ 1266 | ουαλ 1267 | ουγγρ 1268 | ουδ 1269 | ουσιαστικοπ 1270 | ουσιαστ 1271 | ουσ 1272 | πίν 1273 | παθητ 1274 | παθολ 1275 | παθ 1276 | παιδ 1277 | παλαιοντ 1278 | παλαιότ 1279 | παλ 1280 | παππων 1281 | παράγρ 1282 | παράγ 1283 | παράλλ 1284 | παράλ 1285 | παραγ 1286 | παρακ 1287 | παραλ 1288 | παραπ 1289 | παρατ 1290 | παρβ 1291 | παρετυμ 1292 | παροξ 1293 | παρων 1294 | παρωχ 1295 | παρ 1296 | παρ.φρ 1297 | πατριδων 1298 | πατρων 1299 | πβ 1300 | περιθ 1301 | περιλ 1302 | περιφρ 1303 | περσ 1304 | περ 1305 | πιθ 1306 | πληθ 1307 | πληροφ 1308 | ποδ 1309 | ποιητ 1310 | πολιτ 1311 | πολλαπλ 1312 | πολ 1313 | πορτογαλ 1314 | πορτ 1315 | ποσ 1316 | πρακριτ 1317 | πρβλ 1318 | πρβ 1319 | πργ 1320 | πρκμ 1321 | πρκ 1322 | πρλ 1323 | προέλ 1324 | προβηγκ 1325 | προελλ 1326 | προηγ 1327 | προθεμ 1328 | προπαραλ 1329 | προπαροξ 1330 | προπερισπ 1331 | προσαρμ 1332 | προσηγορ 1333 | προσταχτ 1334 | προστ 1335 | προσφών 1336 | προσ 1337 | προτακτ 1338 | προτ.Εισ 1339 | προφ 1340 | προχωρ 1341 | πρτ 1342 | πρόθ 1343 | πρόσθ 1344 | πρόσ 1345 | πρότ 1346 | πρ 1347 | πρ.Εφ 1348 | πτ 1349 | πυ 1350 | π 1351 | π.Χ 1352 | π.μ 1353 | π.χ 1354 | ρήμ 1355 | ρίζ 1356 | ρηματ 1357 | ρητορ 1358 | ριν 1359 | ρουμ 1360 | ρωμ 1361 | ρωσ 1362 | ρ 1363 | σανσκρ 1364 | σαξ 1365 | σελ 1366 | σερβοκρ 1367 | σερβ 1368 | σημασιολ 1369 | σημδ 1370 | σημειολ 1371 | σημερ 1372 | σημιτ 1373 | σημ 1374 | σκανδ 1375 | σκυθ 1376 | σκωπτ 1377 | σλαβ 1378 | σλοβ 1379 | σουηδ 1380 | σουμερ 1381 | σουπ 1382 | σπάν 1383 | σπανιότ 1384 | σπ 1385 | σσ 1386 | στατ 1387 | στερ 1388 | στιγμ 1389 | στιχ 1390 | στρέμ 1391 | στρατιωτ 1392 | στρατ 1393 | στ 1394 | συγγ 1395 | συγκρ 1396 | συγκ 1397 | συμπερ 1398 | συμπλεκτ 1399 | συμπλ 1400 | συμπροφ 1401 | συμφυρ 1402 | συμφ 1403 | συνήθ 1404 | συνίζ 1405 | συναίρ 1406 | συναισθ 1407 | συνδετ 
1408 | συνδ 1409 | συνεκδ 1410 | συνηρ 1411 | συνθετ 1412 | συνθ 1413 | συνοπτ 1414 | συντελ 1415 | συντομογρ 1416 | συντ 1417 | συν 1418 | συρ 1419 | σχημ 1420 | σχ 1421 | σύγκρ 1422 | σύμπλ 1423 | σύμφ 1424 | σύνδ 1425 | σύνθ 1426 | σύντμ 1427 | σύντ 1428 | σ 1429 | σ.π 1430 | σ/β 1431 | τακτ 1432 | τελ 1433 | τετρ 1434 | τετρ.μ 1435 | τεχνλ 1436 | τεχνολ 1437 | τεχν 1438 | τεύχ 1439 | τηλεπικ 1440 | τηλεόρ 1441 | τιμ 1442 | τιμ.τομ 1443 | τοΣ 1444 | τον 1445 | τοπογρ 1446 | τοπων 1447 | τοπ 1448 | τοσκ 1449 | τουρκ 1450 | τοχ 1451 | τριτοπρόσ 1452 | τροποπ 1453 | τροπ 1454 | τσεχ 1455 | τσιγγ 1456 | ττ 1457 | τυπ 1458 | τόμ 1459 | τόνν 1460 | τ 1461 | τ.μ 1462 | τ.χλμ 1463 | υβρ 1464 | υπερθ 1465 | υπερσ 1466 | υπερ 1467 | υπεύθ 1468 | υποθ 1469 | υποκορ 1470 | υποκ 1471 | υποσημ 1472 | υποτ 1473 | υποφ 1474 | υποχωρ 1475 | υπόλ 1476 | υπόχρ 1477 | υπ 1478 | υστλατ 1479 | υψόμ 1480 | υψ 1481 | φάκ 1482 | φαρμακολ 1483 | φαρμ 1484 | φιλολ 1485 | φιλοσ 1486 | φιλοτ 1487 | φινλ 1488 | φοινικ 1489 | φράγκ 1490 | φρανκον 1491 | φριζ 1492 | φρ 1493 | φυλλ 1494 | φυσιολ 1495 | φυσ 1496 | φωνηεντ 1497 | φωνητ 1498 | φωνολ 1499 | φων 1500 | φωτογρ 1501 | φ 1502 | φ.τ.μ 1503 | χαμιτ 1504 | χαρτόσ 1505 | χαρτ 1506 | χασμ 1507 | χαϊδ 1508 | χγφ 1509 | χειλ 1510 | χεττ 1511 | χημ 1512 | χιλ 1513 | χλγρ 1514 | χλγ 1515 | χλμ 1516 | χλμ.2 1517 | χλμ.3 1518 | χλσγρ 1519 | χλστγρ 1520 | χλστμ 1521 | χλστμ.2 1522 | χλστμ.3 1523 | χλ 1524 | χργρ 1525 | χρημ 1526 | χρον 1527 | χρ 1528 | χφ 1529 | χ.ε 1530 | χ.κ 1531 | χ.ο 1532 | χ.σ 1533 | χ.τ 1534 | χ.χ 1535 | ψευδ 1536 | ψυχαν 1537 | ψυχιατρ 1538 | ψυχολ 1539 | ψυχ 1540 | ωκεαν 1541 | όμ 1542 | όν 1543 | όπ.παρ 1544 | όπ.π 1545 | ό.π 1546 | ύψ 1547 | 1Βσ 1548 | 1Εσ 1549 | 1Θσ 1550 | 1Ιν 1551 | 1Κρ 1552 | 1Μκ 1553 | 1Πρ 1554 | 1Πτ 1555 | 1Τμ 1556 | 2Βσ 1557 | 2Εσ 1558 | 2Θσ 1559 | 2Ιν 1560 | 2Κρ 1561 | 2Μκ 1562 | 2Πρ 1563 | 2Πτ 1564 | 2Τμ 1565 | 3Βσ 1566 | 3Ιν 1567 | 3Μκ 1568 | 4Βσ 1569 | 
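The prefix files above all share one simple line-oriented format, consumed by `load_prefixes` in tokenizer-v1.1.perl: blank lines and lines starting with `#` are ignored, a plain entry names a prefix after which a trailing period is never split off, and an entry suffixed with `#NUMERIC_ONLY#` is nonbreaking only when the following word starts with a digit. A minimal Python sketch of that parsing (illustrative only, not part of this package):

```python
import re

# Mirrors the semantics of load_prefixes in tokenizer-v1.1.perl:
# class 1 = always nonbreaking, class 2 = nonbreaking only before a number.
def load_prefixes(lines):
    prefixes = {}
    for line in lines:
        item = line.rstrip("\n")
        if not item or item.startswith("#"):
            continue  # skip blank lines and comments
        m = re.match(r"(.*)\s+#NUMERIC_ONLY#", item)
        if m:
            prefixes[m.group(1)] = 2  # e.g. "No. 5" keeps "No." intact
        else:
            prefixes[item] = 1
    return prefixes

print(load_prefixes(["# comment", "", "Dr", "No #NUMERIC_ONLY#"]))
# {'Dr': 1, 'No': 2}
```

The class-2 entries let the tokenizer distinguish "No. 5" (abbreviation, keep the period attached) from "No." at the end of a sentence.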
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 2.1, February 1999 3 | 4 | Copyright (C) 1991, 1999 Free Software Foundation, Inc. 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | [This is the first released version of the Lesser GPL. It also counts 10 | as the successor of the GNU Library Public License, version 2, hence 11 | the version number 2.1.] 12 | 13 | Preamble 14 | 15 | The licenses for most software are designed to take away your 16 | freedom to share and change it. By contrast, the GNU General Public 17 | Licenses are intended to guarantee your freedom to share and change 18 | free software--to make sure the software is free for all its users. 19 | 20 | This license, the Lesser General Public License, applies to some 21 | specially designated software packages--typically libraries--of the 22 | Free Software Foundation and other authors who decide to use it. You 23 | can use it too, but we suggest you first think carefully about whether 24 | this license or the ordinary General Public License is the better 25 | strategy to use in any particular case, based on the explanations below. 26 | 27 | When we speak of free software, we are referring to freedom of use, 28 | not price. Our General Public Licenses are designed to make sure that 29 | you have the freedom to distribute copies of free software (and charge 30 | for this service if you wish); that you receive source code or can get 31 | it if you want it; that you can change the software and use pieces of 32 | it in new free programs; and that you are informed that you can do 33 | these things. 
34 | 35 | To protect your rights, we need to make restrictions that forbid 36 | distributors to deny you these rights or to ask you to surrender these 37 | rights. These restrictions translate to certain responsibilities for 38 | you if you distribute copies of the library or if you modify it. 39 | 40 | For example, if you distribute copies of the library, whether gratis 41 | or for a fee, you must give the recipients all the rights that we gave 42 | you. You must make sure that they, too, receive or can get the source 43 | code. If you link other code with the library, you must provide 44 | complete object files to the recipients, so that they can relink them 45 | with the library after making changes to the library and recompiling 46 | it. And you must show them these terms so they know their rights. 47 | 48 | We protect your rights with a two-step method: (1) we copyright the 49 | library, and (2) we offer you this license, which gives you legal 50 | permission to copy, distribute and/or modify the library. 51 | 52 | To protect each distributor, we want to make it very clear that 53 | there is no warranty for the free library. Also, if the library is 54 | modified by someone else and passed on, the recipients should know 55 | that what they have is not the original version, so that the original 56 | author's reputation will not be affected by problems that might be 57 | introduced by others. 58 | 59 | Finally, software patents pose a constant threat to the existence of 60 | any free program. We wish to make sure that a company cannot 61 | effectively restrict the users of a free program by obtaining a 62 | restrictive license from a patent holder. Therefore, we insist that 63 | any patent license obtained for a version of the library must be 64 | consistent with the full freedom of use specified in this license. 65 | 66 | Most GNU software, including some libraries, is covered by the 67 | ordinary GNU General Public License. 
This license, the GNU Lesser 68 | General Public License, applies to certain designated libraries, and 69 | is quite different from the ordinary General Public License. We use 70 | this license for certain libraries in order to permit linking those 71 | libraries into non-free programs. 72 | 73 | When a program is linked with a library, whether statically or using 74 | a shared library, the combination of the two is legally speaking a 75 | combined work, a derivative of the original library. The ordinary 76 | General Public License therefore permits such linking only if the 77 | entire combination fits its criteria of freedom. The Lesser General 78 | Public License permits more lax criteria for linking other code with 79 | the library. 80 | 81 | We call this license the "Lesser" General Public License because it 82 | does Less to protect the user's freedom than the ordinary General 83 | Public License. It also provides other free software developers Less 84 | of an advantage over competing non-free programs. These disadvantages 85 | are the reason we use the ordinary General Public License for many 86 | libraries. However, the Lesser license provides advantages in certain 87 | special circumstances. 88 | 89 | For example, on rare occasions, there may be a special need to 90 | encourage the widest possible use of a certain library, so that it becomes 91 | a de-facto standard. To achieve this, non-free programs must be 92 | allowed to use the library. A more frequent case is that a free 93 | library does the same job as widely used non-free libraries. In this 94 | case, there is little to gain by limiting the free library to free 95 | software only, so we use the Lesser General Public License. 96 | 97 | In other cases, permission to use a particular library in non-free 98 | programs enables a greater number of people to use a large body of 99 | free software. 
For example, permission to use the GNU C Library in 100 | non-free programs enables many more people to use the whole GNU 101 | operating system, as well as its variant, the GNU/Linux operating 102 | system. 103 | 104 | Although the Lesser General Public License is Less protective of the 105 | users' freedom, it does ensure that the user of a program that is 106 | linked with the Library has the freedom and the wherewithal to run 107 | that program using a modified version of the Library. 108 | 109 | The precise terms and conditions for copying, distribution and 110 | modification follow. Pay close attention to the difference between a 111 | "work based on the library" and a "work that uses the library". The 112 | former contains code derived from the library, whereas the latter must 113 | be combined with the library in order to run. 114 | 115 | GNU LESSER GENERAL PUBLIC LICENSE 116 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 117 | 118 | 0. This License Agreement applies to any software library or other 119 | program which contains a notice placed by the copyright holder or 120 | other authorized party saying it may be distributed under the terms of 121 | this Lesser General Public License (also called "this License"). 122 | Each licensee is addressed as "you". 123 | 124 | A "library" means a collection of software functions and/or data 125 | prepared so as to be conveniently linked with application programs 126 | (which use some of those functions and data) to form executables. 127 | 128 | The "Library", below, refers to any such software library or work 129 | which has been distributed under these terms. A "work based on the 130 | Library" means either the Library or any derivative work under 131 | copyright law: that is to say, a work containing the Library or a 132 | portion of it, either verbatim or with modifications and/or translated 133 | straightforwardly into another language. 
(Hereinafter, translation is 134 | included without limitation in the term "modification".) 135 | 136 | "Source code" for a work means the preferred form of the work for 137 | making modifications to it. For a library, complete source code means 138 | all the source code for all modules it contains, plus any associated 139 | interface definition files, plus the scripts used to control compilation 140 | and installation of the library. 141 | 142 | Activities other than copying, distribution and modification are not 143 | covered by this License; they are outside its scope. The act of 144 | running a program using the Library is not restricted, and output from 145 | such a program is covered only if its contents constitute a work based 146 | on the Library (independent of the use of the Library in a tool for 147 | writing it). Whether that is true depends on what the Library does 148 | and what the program that uses the Library does. 149 | 150 | 1. You may copy and distribute verbatim copies of the Library's 151 | complete source code as you receive it, in any medium, provided that 152 | you conspicuously and appropriately publish on each copy an 153 | appropriate copyright notice and disclaimer of warranty; keep intact 154 | all the notices that refer to this License and to the absence of any 155 | warranty; and distribute a copy of this License along with the 156 | Library. 157 | 158 | You may charge a fee for the physical act of transferring a copy, 159 | and you may at your option offer warranty protection in exchange for a 160 | fee. 161 | 162 | 2. You may modify your copy or copies of the Library or any portion 163 | of it, thus forming a work based on the Library, and copy and 164 | distribute such modifications or work under the terms of Section 1 165 | above, provided that you also meet all of these conditions: 166 | 167 | a) The modified work must itself be a software library. 
168 | 169 | b) You must cause the files modified to carry prominent notices 170 | stating that you changed the files and the date of any change. 171 | 172 | c) You must cause the whole of the work to be licensed at no 173 | charge to all third parties under the terms of this License. 174 | 175 | d) If a facility in the modified Library refers to a function or a 176 | table of data to be supplied by an application program that uses 177 | the facility, other than as an argument passed when the facility 178 | is invoked, then you must make a good faith effort to ensure that, 179 | in the event an application does not supply such function or 180 | table, the facility still operates, and performs whatever part of 181 | its purpose remains meaningful. 182 | 183 | (For example, a function in a library to compute square roots has 184 | a purpose that is entirely well-defined independent of the 185 | application. Therefore, Subsection 2d requires that any 186 | application-supplied function or table used by this function must 187 | be optional: if the application does not supply it, the square 188 | root function must still compute square roots.) 189 | 190 | These requirements apply to the modified work as a whole. If 191 | identifiable sections of that work are not derived from the Library, 192 | and can be reasonably considered independent and separate works in 193 | themselves, then this License, and its terms, do not apply to those 194 | sections when you distribute them as separate works. But when you 195 | distribute the same sections as part of a whole which is a work based 196 | on the Library, the distribution of the whole must be on the terms of 197 | this License, whose permissions for other licensees extend to the 198 | entire whole, and thus to each and every part regardless of who wrote 199 | it. 
200 | 201 | Thus, it is not the intent of this section to claim rights or contest 202 | your rights to work written entirely by you; rather, the intent is to 203 | exercise the right to control the distribution of derivative or 204 | collective works based on the Library. 205 | 206 | In addition, mere aggregation of another work not based on the Library 207 | with the Library (or with a work based on the Library) on a volume of 208 | a storage or distribution medium does not bring the other work under 209 | the scope of this License. 210 | 211 | 3. You may opt to apply the terms of the ordinary GNU General Public 212 | License instead of this License to a given copy of the Library. To do 213 | this, you must alter all the notices that refer to this License, so 214 | that they refer to the ordinary GNU General Public License, version 2, 215 | instead of to this License. (If a newer version than version 2 of the 216 | ordinary GNU General Public License has appeared, then you can specify 217 | that version instead if you wish.) Do not make any other change in 218 | these notices. 219 | 220 | Once this change is made in a given copy, it is irreversible for 221 | that copy, so the ordinary GNU General Public License applies to all 222 | subsequent copies and derivative works made from that copy. 223 | 224 | This option is useful when you wish to copy part of the code of 225 | the Library into a program that is not a library. 226 | 227 | 4. You may copy and distribute the Library (or a portion or 228 | derivative of it, under Section 2) in object code or executable form 229 | under the terms of Sections 1 and 2 above provided that you accompany 230 | it with the complete corresponding machine-readable source code, which 231 | must be distributed under the terms of Sections 1 and 2 above on a 232 | medium customarily used for software interchange. 
233 | 234 | If distribution of object code is made by offering access to copy 235 | from a designated place, then offering equivalent access to copy the 236 | source code from the same place satisfies the requirement to 237 | distribute the source code, even though third parties are not 238 | compelled to copy the source along with the object code. 239 | 240 | 5. A program that contains no derivative of any portion of the 241 | Library, but is designed to work with the Library by being compiled or 242 | linked with it, is called a "work that uses the Library". Such a 243 | work, in isolation, is not a derivative work of the Library, and 244 | therefore falls outside the scope of this License. 245 | 246 | However, linking a "work that uses the Library" with the Library 247 | creates an executable that is a derivative of the Library (because it 248 | contains portions of the Library), rather than a "work that uses the 249 | library". The executable is therefore covered by this License. 250 | Section 6 states terms for distribution of such executables. 251 | 252 | When a "work that uses the Library" uses material from a header file 253 | that is part of the Library, the object code for the work may be a 254 | derivative work of the Library even though the source code is not. 255 | Whether this is true is especially significant if the work can be 256 | linked without the Library, or if the work is itself a library. The 257 | threshold for this to be true is not precisely defined by law. 258 | 259 | If such an object file uses only numerical parameters, data 260 | structure layouts and accessors, and small macros and small inline 261 | functions (ten lines or less in length), then the use of the object 262 | file is unrestricted, regardless of whether it is legally a derivative 263 | work. (Executables containing this object code plus portions of the 264 | Library will still fall under Section 6.) 
265 | 266 | Otherwise, if the work is a derivative of the Library, you may 267 | distribute the object code for the work under the terms of Section 6. 268 | Any executables containing that work also fall under Section 6, 269 | whether or not they are linked directly with the Library itself. 270 | 271 | 6. As an exception to the Sections above, you may also combine or 272 | link a "work that uses the Library" with the Library to produce a 273 | work containing portions of the Library, and distribute that work 274 | under terms of your choice, provided that the terms permit 275 | modification of the work for the customer's own use and reverse 276 | engineering for debugging such modifications. 277 | 278 | You must give prominent notice with each copy of the work that the 279 | Library is used in it and that the Library and its use are covered by 280 | this License. You must supply a copy of this License. If the work 281 | during execution displays copyright notices, you must include the 282 | copyright notice for the Library among them, as well as a reference 283 | directing the user to the copy of this License. Also, you must do one 284 | of these things: 285 | 286 | a) Accompany the work with the complete corresponding 287 | machine-readable source code for the Library including whatever 288 | changes were used in the work (which must be distributed under 289 | Sections 1 and 2 above); and, if the work is an executable linked 290 | with the Library, with the complete machine-readable "work that 291 | uses the Library", as object code and/or source code, so that the 292 | user can modify the Library and then relink to produce a modified 293 | executable containing the modified Library. (It is understood 294 | that the user who changes the contents of definitions files in the 295 | Library will not necessarily be able to recompile the application 296 | to use the modified definitions.) 
297 | 298 | b) Use a suitable shared library mechanism for linking with the 299 | Library. A suitable mechanism is one that (1) uses at run time a 300 | copy of the library already present on the user's computer system, 301 | rather than copying library functions into the executable, and (2) 302 | will operate properly with a modified version of the library, if 303 | the user installs one, as long as the modified version is 304 | interface-compatible with the version that the work was made with. 305 | 306 | c) Accompany the work with a written offer, valid for at 307 | least three years, to give the same user the materials 308 | specified in Subsection 6a, above, for a charge no more 309 | than the cost of performing this distribution. 310 | 311 | d) If distribution of the work is made by offering access to copy 312 | from a designated place, offer equivalent access to copy the above 313 | specified materials from the same place. 314 | 315 | e) Verify that the user has already received a copy of these 316 | materials or that you have already sent this user a copy. 317 | 318 | For an executable, the required form of the "work that uses the 319 | Library" must include any data and utility programs needed for 320 | reproducing the executable from it. However, as a special exception, 321 | the materials to be distributed need not include anything that is 322 | normally distributed (in either source or binary form) with the major 323 | components (compiler, kernel, and so on) of the operating system on 324 | which the executable runs, unless that component itself accompanies 325 | the executable. 326 | 327 | It may happen that this requirement contradicts the license 328 | restrictions of other proprietary libraries that do not normally 329 | accompany the operating system. Such a contradiction means you cannot 330 | use both them and the Library together in an executable that you 331 | distribute. 332 | 333 | 7. 
You may place library facilities that are a work based on the 334 | Library side-by-side in a single library together with other library 335 | facilities not covered by this License, and distribute such a combined 336 | library, provided that the separate distribution of the work based on 337 | the Library and of the other library facilities is otherwise 338 | permitted, and provided that you do these two things: 339 | 340 | a) Accompany the combined library with a copy of the same work 341 | based on the Library, uncombined with any other library 342 | facilities. This must be distributed under the terms of the 343 | Sections above. 344 | 345 | b) Give prominent notice with the combined library of the fact 346 | that part of it is a work based on the Library, and explaining 347 | where to find the accompanying uncombined form of the same work. 348 | 349 | 8. You may not copy, modify, sublicense, link with, or distribute 350 | the Library except as expressly provided under this License. Any 351 | attempt otherwise to copy, modify, sublicense, link with, or 352 | distribute the Library is void, and will automatically terminate your 353 | rights under this License. However, parties who have received copies, 354 | or rights, from you under this License will not have their licenses 355 | terminated so long as such parties remain in full compliance. 356 | 357 | 9. You are not required to accept this License, since you have not 358 | signed it. However, nothing else grants you permission to modify or 359 | distribute the Library or its derivative works. These actions are 360 | prohibited by law if you do not accept this License. Therefore, by 361 | modifying or distributing the Library (or any work based on the 362 | Library), you indicate your acceptance of this License to do so, and 363 | all its terms and conditions for copying, distributing or modifying 364 | the Library or works based on it. 365 | 366 | 10. 
Each time you redistribute the Library (or any work based on the 367 | Library), the recipient automatically receives a license from the 368 | original licensor to copy, distribute, link with or modify the Library 369 | subject to these terms and conditions. You may not impose any further 370 | restrictions on the recipients' exercise of the rights granted herein. 371 | You are not responsible for enforcing compliance by third parties with 372 | this License. 373 | 374 | 11. If, as a consequence of a court judgment or allegation of patent 375 | infringement or for any other reason (not limited to patent issues), 376 | conditions are imposed on you (whether by court order, agreement or 377 | otherwise) that contradict the conditions of this License, they do not 378 | excuse you from the conditions of this License. If you cannot 379 | distribute so as to satisfy simultaneously your obligations under this 380 | License and any other pertinent obligations, then as a consequence you 381 | may not distribute the Library at all. For example, if a patent 382 | license would not permit royalty-free redistribution of the Library by 383 | all those who receive copies directly or indirectly through you, then 384 | the only way you could satisfy both it and this License would be to 385 | refrain entirely from distribution of the Library. 386 | 387 | If any portion of this section is held invalid or unenforceable under any 388 | particular circumstance, the balance of the section is intended to apply, 389 | and the section as a whole is intended to apply in other circumstances. 390 | 391 | It is not the purpose of this section to induce you to infringe any 392 | patents or other property right claims or to contest validity of any 393 | such claims; this section has the sole purpose of protecting the 394 | integrity of the free software distribution system which is 395 | implemented by public license practices. 
Many people have made 396 | generous contributions to the wide range of software distributed 397 | through that system in reliance on consistent application of that 398 | system; it is up to the author/donor to decide if he or she is willing 399 | to distribute software through any other system and a licensee cannot 400 | impose that choice. 401 | 402 | This section is intended to make thoroughly clear what is believed to 403 | be a consequence of the rest of this License. 404 | 405 | 12. If the distribution and/or use of the Library is restricted in 406 | certain countries either by patents or by copyrighted interfaces, the 407 | original copyright holder who places the Library under this License may add 408 | an explicit geographical distribution limitation excluding those countries, 409 | so that distribution is permitted only in or among countries not thus 410 | excluded. In such case, this License incorporates the limitation as if 411 | written in the body of this License. 412 | 413 | 13. The Free Software Foundation may publish revised and/or new 414 | versions of the Lesser General Public License from time to time. 415 | Such new versions will be similar in spirit to the present version, 416 | but may differ in detail to address new problems or concerns. 417 | 418 | Each version is given a distinguishing version number. If the Library 419 | specifies a version number of this License which applies to it and 420 | "any later version", you have the option of following the terms and 421 | conditions either of that version or of any later version published by 422 | the Free Software Foundation. If the Library does not specify a 423 | license version number, you may choose any version ever published by 424 | the Free Software Foundation. 425 | 426 | 14. If you wish to incorporate parts of the Library into other free 427 | programs whose distribution conditions are incompatible with these, 428 | write to the author to ask for permission. 
For software which is 429 | copyrighted by the Free Software Foundation, write to the Free 430 | Software Foundation; we sometimes make exceptions for this. Our 431 | decision will be guided by the two goals of preserving the free status 432 | of all derivatives of our free software and of promoting the sharing 433 | and reuse of software generally. 434 | 435 | NO WARRANTY 436 | 437 | 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO 438 | WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 439 | EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR 440 | OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY 441 | KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE 442 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 443 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE 444 | LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME 445 | THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 446 | 447 | 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN 448 | WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY 449 | AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU 450 | FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR 451 | CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE 452 | LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING 453 | RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A 454 | FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF 455 | SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH 456 | DAMAGES. 457 | 458 | END OF TERMS AND CONDITIONS 459 | 460 | --------------------------------------------------------------------------------