├── .github
    └── workflows
    │   └── ubuntu.yml
├── .gitignore
├── CMakeLists.txt
├── COPYING
├── COPYING.LESSER
├── FindICU.cmake
├── LICENSE
├── README.md
├── moses
    ├── CMakeLists.txt
    ├── ems
    │   └── support
    │   │   └── split-sentences.perl
    ├── share
    │   └── nonbreaking_prefixes
    │   │   ├── README.txt
    │   │   ├── nonbreaking_prefix.as
    │   │   ├── nonbreaking_prefix.bn
    │   │   ├── nonbreaking_prefix.ca
    │   │   ├── nonbreaking_prefix.cs
    │   │   ├── nonbreaking_prefix.de
    │   │   ├── nonbreaking_prefix.el
    │   │   ├── nonbreaking_prefix.en
    │   │   ├── nonbreaking_prefix.es
    │   │   ├── nonbreaking_prefix.et
    │   │   ├── nonbreaking_prefix.fi
    │   │   ├── nonbreaking_prefix.fr
    │   │   ├── nonbreaking_prefix.ga
    │   │   ├── nonbreaking_prefix.gu
    │   │   ├── nonbreaking_prefix.hi
    │   │   ├── nonbreaking_prefix.hu
    │   │   ├── nonbreaking_prefix.is
    │   │   ├── nonbreaking_prefix.it
    │   │   ├── nonbreaking_prefix.kn
    │   │   ├── nonbreaking_prefix.lt
    │   │   ├── nonbreaking_prefix.lv
    │   │   ├── nonbreaking_prefix.ml
    │   │   ├── nonbreaking_prefix.mni
    │   │   ├── nonbreaking_prefix.mr
    │   │   ├── nonbreaking_prefix.nl
    │   │   ├── nonbreaking_prefix.or
    │   │   ├── nonbreaking_prefix.pa
    │   │   ├── nonbreaking_prefix.pl
    │   │   ├── nonbreaking_prefix.pt
    │   │   ├── nonbreaking_prefix.ro
    │   │   ├── nonbreaking_prefix.ru
    │   │   ├── nonbreaking_prefix.sk
    │   │   ├── nonbreaking_prefix.sl
    │   │   ├── nonbreaking_prefix.sv
    │   │   ├── nonbreaking_prefix.ta
    │   │   ├── nonbreaking_prefix.te
    │   │   ├── nonbreaking_prefix.yue
    │   │   └── nonbreaking_prefix.zh
    └── tokenizer
    │   ├── deescape-special-chars.perl
    │   ├── detokenizer.perl
    │   ├── escape-special-chars.perl
    │   ├── lowercase.perl
    │   ├── normalize-punctuation.perl
    │   └── tokenizer.perl
├── preprocess
    ├── CMakeLists.txt
    ├── apply_case_main.cc
    ├── b64filter_main.cc
    ├── base64.cc
    ├── base64.hh
    ├── base64_number_main.cc
    ├── cache_main.cc
    ├── captive_child.cc
    ├── captive_child.hh
    ├── commoncrawl_dedupe_main.cc
    ├── dedupe_main.cc
    ├── docenc_main.cc
    ├── fields.cc
    ├── fields.hh
    ├── foldfilter_main.cc
    ├── gigaword_extract.sh
    ├── gigaword_unwrap_main.cc
    ├── heuristics.perl
    ├── idf_main.cc
    ├── mmhsum_main.cc
    ├── order_independent_hash_main.cc
    ├── parallel.hh
    ├── process_unicode_main.cc
    ├── remove_invalid_utf8_base64_main.cc
    ├── remove_invalid_utf8_main.cc
    ├── remove_long_lines_main.cc
    ├── resplit.sh
    ├── shard_main.cc
    ├── simple_cleaning_main.cc
    ├── substitute_main.cc
    ├── subtract_lines_main.cc
    ├── tests
    │   ├── cache
    │   │   ├── input
    │   │   ├── run.sh
    │   │   ├── space_expected
    │   │   └── space_ref.py
    │   ├── dedupe
    │   │   ├── columns
    │   │   ├── columns.out
    │   │   ├── expected
    │   │   ├── input
    │   │   ├── ref.py
    │   │   └── run.sh
    │   ├── foldfilter
    │   │   ├── fold10.expected
    │   │   ├── input
    │   │   └── run.sh
    │   ├── run.sh
    │   ├── shard
    │   │   ├── input
    │   │   └── run.sh
    │   └── vars
    ├── text.sh
    ├── train_case_main.cc
    ├── truecase_main.cc
    ├── unescape_html.perl
    ├── vocab_main.cc
    ├── warc.cc
    ├── warc.hh
    └── warc_parallel_main.cc
└── util
    ├── CMakeLists.txt
    ├── buffered_stream.hh
    ├── cat_compressed_main.cc
    ├── compress.cc
    ├── compress.hh
    ├── compress_test.cc
    ├── double-conversion
        ├── CMakeLists.txt
        ├── Jamfile
        ├── LICENSE
        ├── bignum-dtoa.cc
        ├── bignum-dtoa.h
        ├── bignum.cc
        ├── bignum.h
        ├── cached-powers.cc
        ├── cached-powers.h
        ├── diy-fp.cc
        ├── diy-fp.h
        ├── double-conversion.cc
        ├── double-conversion.h
        ├── fast-dtoa.cc
        ├── fast-dtoa.h
        ├── fixed-dtoa.cc
        ├── fixed-dtoa.h
        ├── ieee.h
        ├── strtod.cc
        ├── strtod.h
        └── utils.h
    ├── ersatz_progress.cc
    ├── ersatz_progress.hh
    ├── exception.cc
    ├── exception.hh
    ├── fake_ostream.hh
    ├── file.cc
    ├── file.hh
    ├── file_piece.cc
    ├── file_piece.hh
    ├── file_piece_test.cc
    ├── file_stream.hh
    ├── fixed_array.hh
    ├── float_to_string.cc
    ├── float_to_string.hh
    ├── have.hh
    ├── integer_to_string.cc
    ├── integer_to_string.hh
    ├── integer_to_string_test.cc
    ├── mmap.cc
    ├── mmap.hh
    ├── murmur_hash.cc
    ├── murmur_hash.hh
    ├── mutable_vocab.cc
    ├── mutable_vocab.hh
    ├── mutable_vocab_test.cc
    ├── object_pool.hh
    ├── pcqueue.hh
    ├── pcqueue_test.cc
    ├── pool.cc
    ├── pool.hh
    ├── probing_hash_table.hh
    ├── probing_hash_table_test.cc
    ├── scoped.cc
    ├── scoped.hh
    ├── spaces.cc
    ├── spaces.hh
    ├── string_piece.cc
    ├── string_piece.hh
    ├── string_stream.hh
    ├── string_stream_test.cc
    ├── threaded_buffered_stream.hh
    ├── tokenize_piece.hh
    ├── tokenize_piece_test.cc
    ├── utf8.cc
    ├── utf8.hh
    ├── utf8_icu.cc
    ├── utf8_icu.hh
    └── utf8_test.cc


/.github/workflows/ubuntu.yml:
--------------------------------------------------------------------------------
 1 | # CI for Ubuntu: install dependencies, configure with CMake, build, then run
 2 | # the unit tests (ctest) and the regression tests (preprocess/tests/run.sh).
 3 | name: Ubuntu
 4 | 
 5 | on:
 6 |   push:
 7 |     branches: [master]
 8 |   pull_request:
 9 |     branches: [master]
10 | 
11 | jobs:
12 |   build:
13 |     runs-on: ubuntu-latest
14 | 
15 |     steps:
16 |     - uses: actions/checkout@v2
17 |     - name: dependencies
18 |       # Refresh the package index first so the install does not use a stale list.
19 |       run: |
20 |         sudo apt-get update
21 |         sudo apt-get install -y build-essential libboost-test-dev libboost-program-options-dev cmake zlib1g-dev libbz2-dev liblzma-dev libicu-dev
22 |     - name: cmake
23 |       # COMPILE_TESTS defaults to OFF in the top-level CMakeLists.txt, in which
24 |       # case enable_testing() is skipped and the ctest step has nothing to run.
25 |       run: |
26 |         cmake -E make_directory build
27 |         cd build
28 |         cmake -DCOMPILE_TESTS=ON ..
29 |     - name: Compile
30 |       working-directory: build
31 |       run: cmake --build . -j2
32 |     - name: Unit Tests
33 |       working-directory: build
34 |       # Print the output of failing tests so failures can be diagnosed from the log.
35 |       run: ctest -j2 --output-on-failure
36 |     - name: Regression Tests
37 |       run: preprocess/tests/run.sh
38 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | util/file_piece.cc.gz
3 | *.swp
4 | *.o
5 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.8.12)
 2 | 
 3 | # Define a single cmake project
 4 | project(preprocess)
 5 | 
 6 | # Let find_package(ICU) pick up the FindICU.cmake shipped in the source root
 7 | set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR})
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | 
10 | # Compile all executables into bin/
11 | set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
12 | 
13 | # Compile all libraries into lib/
14 | set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
15 | 
16 | # Default to a Release build when the user did not pick a build type
17 | if (NOT CMAKE_BUILD_TYPE)
18 |   set(CMAKE_BUILD_TYPE Release)
19 | endif()
20 | 
21 | # Unit tests are opt-in: configure with -DCOMPILE_TESTS=ON to enable them
22 | option(COMPILE_TESTS "Compile tests" OFF)
23 | if (COMPILE_TESTS)
24 |   # Tell cmake that we want unit tests to be compiled
25 |   include(CTest)
26 |   enable_testing()
27 | endif()
28 | 
29 | if(MSVC)
30 |   # /w34716 reports MSVC warning C4716 at warning level 3.
31 |   # Each language extends its own flag variable so that existing C flags are
32 |   # preserved and C++-only flags are not applied to C sources.
33 |   set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /w34716")
34 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /w34716")
35 | endif()
36 | 
37 | # Boost components: program_options always, unit_test_framework only for tests
38 | set(BOOST_LIBS program_options)
39 | if (COMPILE_TESTS)
40 |   set(BOOST_LIBS ${BOOST_LIBS} unit_test_framework)
41 | endif()
42 | 
43 | # We need boost for now to do program_options.
44 | find_package(Boost 1.41.0 REQUIRED COMPONENTS ${BOOST_LIBS})
45 | 
46 | # ICU is optional: USE_ICU defaults to ON when ICU was found and is forced OFF otherwise
47 | find_package(ICU COMPONENTS i18n uc data io)
48 | include(CMakeDependentOption)
49 | cmake_dependent_option(USE_ICU "Build programs that use ICU" ON ICU_FOUND OFF)
50 | 
51 | # Define where include files live
52 | include_directories(
53 |   ${PROJECT_SOURCE_DIR}
54 |   ${Boost_INCLUDE_DIRS}
55 |   ${ICU_INCLUDE_DIRS}
56 | )
57 | 
58 | # Process subdirectories
59 | add_subdirectory(util)
60 | add_subdirectory(preprocess)
61 | add_subdirectory(moses)
62 | 
63 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Most of the code here is licensed under the LGPL.  There are exceptions which have their own licenses, listed below.  See comments in those files for more details.  
 2 | 
 3 | util/murmur_hash.cc is under the MIT license.  
 4 | util/string_piece.hh and util/string_piece.cc are Google code.  
 5 | util/file.cc contains a modified implementation of mkstemp under the LGPL.
 6 | FindICU.cmake is under BSD-2 clause license.
 7 | util/utf8.hh contains Google code under Apache-2.0.
 8 | 
 9 | For the rest:
10 | 
11 |     preprocess is free software: you can redistribute it and/or modify
12 |     it under the terms of the GNU Lesser General Public License as published
13 |     by the Free Software Foundation, either version 3 of the License, or
14 |     (at your option) any later version.
15 | 
16 |     preprocess is distributed in the hope that it will be useful,
17 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
18 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 |     GNU Lesser General Public License for more details.
20 | 
21 |     You should have received a copy of the GNU Lesser General Public License
22 |     along with preprocess.  If not, see <http://www.gnu.org/licenses/>.
23 | 


--------------------------------------------------------------------------------
/moses/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | foreach(moses moses/tokenizer/normalize-punctuation.perl moses/tokenizer/escape-special-chars.perl moses/tokenizer/tokenizer.perl moses/tokenizer/lowercase.perl moses/tokenizer/detokenizer.perl moses/tokenizer/deescape-special-chars.perl moses/share/nonbreaking_prefixes/nonbreaking_prefix.ro moses/share/nonbreaking_prefixes/nonbreaking_prefix.sk moses/share/nonbreaking_prefixes/nonbreaking_prefix.it moses/share/nonbreaking_prefixes/nonbreaking_prefix.ru moses/share/nonbreaking_prefixes/nonbreaking_prefix.cs moses/share/nonbreaking_prefixes/nonbreaking_prefix.ca moses/share/nonbreaking_prefixes/nonbreaking_prefix.es moses/share/nonbreaking_prefixes/nonbreaking_prefix.is moses/share/nonbreaking_prefixes/README.txt moses/share/nonbreaking_prefixes/nonbreaking_prefix.pt moses/share/nonbreaking_prefixes/nonbreaking_prefix.sl moses/share/nonbreaking_prefixes/nonbreaking_prefix.pl moses/share/nonbreaking_prefixes/nonbreaking_prefix.nl moses/share/nonbreaking_prefixes/nonbreaking_prefix.sv moses/share/nonbreaking_prefixes/nonbreaking_prefix.el moses/share/nonbreaking_prefixes/nonbreaking_prefix.fr moses/share/nonbreaking_prefixes/nonbreaking_prefix.en moses/share/nonbreaking_prefixes/nonbreaking_prefix.de moses/ems/support/split-sentences.perl)
2 |   configure_file(../${moses} ../${moses} COPYONLY)
3 | endforeach()
4 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/README.txt:
--------------------------------------------------------------------------------
1 | The language suffix can be found here:
2 | 
3 | http://www.loc.gov/standards/iso639-2/php/code_list.php
4 | 
5 | This code includes data from Daniel Naber's Language Tools (Czech abbreviations).
6 | This code includes data from Czech Wiktionary (also Czech abbreviations).
7 | 
8 | 
9 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.as:
--------------------------------------------------------------------------------
 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
 2 | 
 3 | #common exceptions
 4 | # Dr
 5 | ড
 6 | 
 7 | #others
 8 | 
 9 | 
10 | #phonetics
11 | # A
12 | এ
13 | # B
14 | বি
15 | # C
16 | সি
17 | # D
18 | ডি
19 | # E
20 | ই
21 | # F
22 | এফ
23 | # G
24 | জি
25 | # H
26 | এইচ
27 | # I
28 | আম
29 | # J
30 | জে
31 | # K
32 | কে
33 | # L
34 | এল
35 | # M
36 | এম
37 | # N
38 | এন
39 | # O
40 | হে
41 | # P
42 | পি
43 | # Q
44 | কিউ
45 | # R
46 | আর
47 | # S
48 | এস
49 | # T
50 | টি
51 | # U
52 | ইউ
53 | # V
54 | ভি 
55 | # W
56 | ডব্লু
57 | # X
58 | এক্স
59 | # Y
60 | ওয়াই
61 | # Z
62 | জেড
63 | 
64 | #consonants
65 | 
66 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.bn:
--------------------------------------------------------------------------------
 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
 2 | 
 3 | #common exceptions
 4 | # Dr
 5 | ড
 6 | 
 7 | #others
 8 | 
 9 | 
10 | #phonetics
11 | # A
12 | এ
13 | # B
14 | বি
15 | # C
16 | সি
17 | # D
18 | ডি
19 | # E
20 | ই
21 | # F
22 | এফ
23 | # G
24 | জি
25 | # H
26 | এইচ
27 | # I
28 | আম
29 | # J
30 | জে
31 | # K
32 | কে
33 | # L
34 | এল
35 | # M
36 | এম
37 | # N
38 | এন
39 | # O
40 | হে
41 | # P
42 | পি
43 | # Q
44 | কিউ
45 | # R
46 | আর
47 | # S
48 | এস
49 | # T
50 | টি
51 | # U
52 | ইউ
53 | # V
54 | ভি 
55 | # W
56 | ডব্লু
57 | # X
58 | এক্স
59 | # Y
60 | ওয়াই
61 | # Z
62 | জেড
63 | 
64 | #consonants
65 | 
66 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.ca:
--------------------------------------------------------------------------------
 1 | Dr
 2 | Dra
 3 | pàg
 4 | p
 5 | c
 6 | av
 7 | Sr
 8 | Sra
 9 | adm
10 | esq
11 | Prof
12 | S.A
13 | S.L
14 | p.e
15 | ptes
16 | Sta
17 | St
18 | pl
19 | màx
20 | cast
21 | dir
22 | nre
23 | fra
24 | admdora
25 | Emm
26 | Excma
27 | espf
28 | dc
29 | admdor
30 | tel
31 | angl
32 | aprox
33 | ca
34 | dept
35 | dj
36 | dl
37 | dt
38 | ds
39 | dg
40 | dv
41 | ed
42 | entl
43 | al
44 | i.e
45 | maj
46 | smin
47 | n
48 | núm
49 | pta
50 | A
51 | B
52 | C
53 | D
54 | E
55 | F
56 | G
57 | H
58 | I
59 | J
60 | K
61 | L
62 | M
63 | N
64 | O
65 | P
66 | Q
67 | R
68 | S
69 | T
70 | U
71 | V
72 | W
73 | X
74 | Y
75 | Z
76 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.cs:
--------------------------------------------------------------------------------
  1 | Bc
  2 | BcA
  3 | Ing
  4 | Ing.arch
  5 | MUDr
  6 | MVDr
  7 | MgA
  8 | Mgr
  9 | JUDr
 10 | PhDr
 11 | RNDr
 12 | PharmDr
 13 | ThLic
 14 | ThDr
 15 | Ph.D
 16 | Th.D
 17 | prof
 18 | doc
 19 | CSc
 20 | DrSc
 21 | dr. h. c
 22 | PaedDr
 23 | Dr
 24 | PhMr
 25 | DiS
 26 | abt
 27 | ad
 28 | a.i
 29 | aj
 30 | angl
 31 | anon
 32 | apod
 33 | atd
 34 | atp
 35 | aut
 36 | bd
 37 | biogr
 38 | b.m
 39 | b.p
 40 | b.r
 41 | cca
 42 | cit
 43 | cizojaz
 44 | c.k
 45 | col
 46 | čes
 47 | čín
 48 | čj
 49 | ed
 50 | facs
 51 | fasc
 52 | fol
 53 | fot
 54 | franc
 55 | h.c
 56 | hist
 57 | hl
 58 | hrsg
 59 | ibid
 60 | il
 61 | ind
 62 | inv.č
 63 | jap
 64 | jhdt
 65 | jv
 66 | koed
 67 | kol
 68 | korej
 69 | kl
 70 | krit
 71 | lat
 72 | lit
 73 | m.a
 74 | maď
 75 | mj
 76 | mp
 77 | násl
 78 | např
 79 | nepubl
 80 | něm
 81 | no
 82 | nr
 83 | n.s
 84 | okr
 85 | odd
 86 | odp
 87 | obr
 88 | opr
 89 | orig
 90 | phil
 91 | pl
 92 | pokrač
 93 | pol
 94 | port
 95 | pozn
 96 | př.kr
 97 | př.n.l
 98 | přel
 99 | přeprac
100 | příl
101 | pseud
102 | pt
103 | red
104 | repr
105 | resp
106 | revid
107 | rkp
108 | roč
109 | roz
110 | rozš
111 | samost
112 | sect
113 | sest
114 | seš
115 | sign
116 | sl
117 | srv
118 | stol
119 | sv
120 | šk
121 | šk.ro
122 | špan
123 | tab
124 | t.č
125 | tis
126 | tj
127 | tř
128 | tzv
129 | univ
130 | uspoř
131 | vol
132 | vl.jm
133 | vs
134 | vyd
135 | vyobr
136 | zal
137 | zejm
138 | zkr
139 | zprac
140 | zvl
141 | n.p
142 | např
143 | než
144 | MUDr
145 | abl
146 | absol
147 | adj
148 | adv
149 | ak
150 | ak. sl
151 | akt
152 | alch
153 | amer
154 | anat
155 | angl
156 | anglosas
157 | arab
158 | arch
159 | archit
160 | arg
161 | astr
162 | astrol
163 | att
164 | bás
165 | belg
166 | bibl
167 | biol
168 | boh
169 | bot
170 | bulh
171 | círk
172 | csl
173 | č
174 | čas
175 | čes
176 | dat
177 | děj
178 | dep
179 | dět
180 | dial
181 | dór
182 | dopr
183 | dosl
184 | ekon
185 | epic
186 | etnonym
187 | eufem
188 | f
189 | fam
190 | fem
191 | fil
192 | film
193 | form
194 | fot
195 | fr
196 | fut
197 | fyz
198 | gen
199 | geogr
200 | geol
201 | geom
202 | germ
203 | gram
204 | hebr
205 | herald
206 | hist
207 | hl
208 | hovor
209 | hud
210 | hut
211 | chcsl
212 | chem
213 | ie
214 | imp
215 | impf
216 | ind
217 | indoevr
218 | inf
219 | instr
220 | interj
221 | ión
222 | iron
223 | it
224 | kanad
225 | katalán
226 | klas
227 | kniž
228 | komp
229 | konj
230 |  
231 | konkr
232 | kř
233 | kuch
234 | lat
235 | lék
236 | les
237 | lid
238 | lit
239 | liturg
240 | lok
241 | log
242 | m
243 | mat
244 | meteor
245 | metr
246 | mod
247 | ms
248 | mysl
249 | n
250 | náb
251 | námoř
252 | neklas
253 | něm
254 | nesklon
255 | nom
256 | ob
257 | obch
258 | obyč
259 | ojed
260 | opt
261 | part
262 | pas
263 | pejor
264 | pers
265 | pf
266 | pl
267 | plpf
268 |  
269 | práv
270 | prep
271 | předl
272 | přivl
273 | r
274 | rcsl
275 | refl
276 | reg
277 | rkp
278 | ř
279 | řec
280 | s
281 | samohl
282 | sg
283 | sl
284 | souhl
285 | spec
286 | srov
287 | stfr
288 | střv
289 | stsl
290 | subj
291 | subst
292 | superl
293 | sv
294 | sz
295 | táz
296 | tech
297 | telev
298 | teol
299 | trans
300 | typogr
301 | var
302 | vedl
303 | verb
304 | vl. jm
305 | voj
306 | vok
307 | vůb
308 | vulg
309 | výtv
310 | vztaž
311 | zahr
312 | zájm
313 | zast
314 | zejm
315 |  
316 | zeměd
317 | zkr
318 | zř
319 | mj
320 | dl
321 | atp
322 | sport
323 | Mgr
324 | horn
325 | MVDr
326 | JUDr
327 | RSDr
328 | Bc
329 | PhDr
330 | ThDr
331 | Ing
332 | aj
333 | apod
334 | PharmDr
335 | pomn
336 | ev
337 | slang
338 | nprap
339 | odp
340 | dop
341 | pol
342 | st
343 | stol
344 | p. n. l
345 | před n. l
346 | n. l
347 | př. Kr
348 | po Kr
349 | př. n. l
350 | odd
351 | RNDr
352 | tzv
353 | atd
354 | tzn
355 | resp
356 | tj
357 | p
358 | br
359 | č. j
360 | čj
361 | č. p
362 | čp
363 | a. s
364 | s. r. o
365 | spol. s r. o
366 | p. o
367 | s. p
368 | v. o. s
369 | k. s
370 | o. p. s
371 | o. s
372 | v. r
373 | v z
374 | ml
375 | vč
376 | kr
377 | mld
378 | hod
379 | popř
380 | ap
381 | event
382 | rus
383 | slov
384 | rum
385 | švýc
386 | P. T
387 | zvl
388 | hor
389 | dol
390 | S.O.S


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.de:
--------------------------------------------------------------------------------
  1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
  2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
  3 | 
  4 | #any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
  5 | #usually upper case letters are initials in a name
  6 | #no german words end in single lower-case letters, so we throw those in too.
  7 | A
  8 | B
  9 | C
 10 | D
 11 | E
 12 | F
 13 | G
 14 | H
 15 | I
 16 | J
 17 | K
 18 | L
 19 | M
 20 | N
 21 | O
 22 | P
 23 | Q
 24 | R
 25 | S
 26 | T
 27 | U
 28 | V
 29 | W
 30 | X
 31 | Y
 32 | Z
 33 | a
 34 | b
 35 | c
 36 | d
 37 | e
 38 | f
 39 | g
 40 | h
 41 | i
 42 | j
 43 | k
 44 | l
 45 | m
 46 | n
 47 | o
 48 | p
 49 | q
 50 | r
 51 | s
 52 | t
 53 | u
 54 | v
 55 | w
 56 | x
 57 | y
 58 | z
 59 | 
 60 | 
 61 | #Roman Numerals. A dot after one of these is not a sentence break in German.
 62 | I
 63 | II
 64 | III
 65 | IV
 66 | V
 67 | VI
 68 | VII
 69 | VIII
 70 | IX
 71 | X
 72 | XI
 73 | XII
 74 | XIII
 75 | XIV
 76 | XV
 77 | XVI
 78 | XVII
 79 | XVIII
 80 | XIX
 81 | XX
 82 | i
 83 | ii
 84 | iii
 85 | iv
 86 | v
 87 | vi
 88 | vii
 89 | viii
 90 | ix
 91 | x
 92 | xi
 93 | xii
 94 | xiii
 95 | xiv
 96 | xv
 97 | xvi
 98 | xvii
 99 | xviii
100 | xix
101 | xx
102 | 
103 | #Titles and Honorifics
104 | Adj
105 | Adm
106 | Adv
107 | Asst
108 | Bart
109 | Bldg
110 | Brig
111 | Bros
112 | Capt
113 | Cmdr
114 | Col
115 | Comdr
116 | Con
117 | Corp
118 | Cpl
119 | DR
120 | Dr
121 | Ens
122 | Gen
123 | Gov
124 | Hon
125 | Hosp
126 | Insp
127 | Lt
128 | MM
129 | MR
130 | MRS
131 | MS
132 | Maj
133 | Messrs
134 | Mlle
135 | Mme
136 | Mr
137 | Mrs
138 | Ms
139 | Msgr
140 | Op
141 | Ord
142 | Pfc
143 | Ph
144 | Prof
145 | Pvt
146 | Rep
147 | Reps
148 | Res
149 | Rev
150 | Rt
151 | Sen
152 | Sens
153 | Sfc
154 | Sgt
155 | Sr
156 | St
157 | Supt
158 | Surg
159 | 
160 | #Misc symbols
161 | Mio
162 | Mrd
163 | bzw
164 | v
165 | vs
166 | usw
167 | d.h
168 | z.B
169 | u.a
170 | etc
171 | Mrd
172 | MwSt
173 | ggf
174 | d.J
175 | D.h
176 | m.E
177 | vgl
178 | I.F
179 | z.T
180 | sogen
181 | ff
182 | u.E
183 | g.U
184 | g.g.A
185 | c.-à-d
186 | Buchst
187 | u.s.w
188 | sog
189 | u.ä
190 | Std
191 | evtl
192 | Zt
193 | Chr
194 | u.U
195 | o.ä
196 | Ltd
197 | b.A
198 | z.Zt
199 | spp
200 | sen
201 | SA
202 | k.o
203 | jun
204 | i.H.v
205 | dgl
206 | dergl
207 | Co
208 | zzt
209 | usf
210 | s.p.a
211 | Dkr
212 | Corp
213 | bzgl
214 | BSE
215 | 
216 | #Number indicators
217 | # add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it
218 | No
219 | Nos
220 | Art
221 | Nr
222 | pp
223 | ca
224 | Ca
225 | 
226 | #Ordinals are done with . in German - "1." = "1st" in English
227 | 1
228 | 2
229 | 3
230 | 4
231 | 5
232 | 6
233 | 7
234 | 8
235 | 9
236 | 10
237 | 11
238 | 12
239 | 13
240 | 14
241 | 15
242 | 16
243 | 17
244 | 18
245 | 19
246 | 20
247 | 21
248 | 22
249 | 23
250 | 24
251 | 25
252 | 26
253 | 27
254 | 28
255 | 29
256 | 30
257 | 31
258 | 32
259 | 33
260 | 34
261 | 35
262 | 36
263 | 37
264 | 38
265 | 39
266 | 40
267 | 41
268 | 42
269 | 43
270 | 44
271 | 45
272 | 46
273 | 47
274 | 48
275 | 49
276 | 50
277 | 51
278 | 52
279 | 53
280 | 54
281 | 55
282 | 56
283 | 57
284 | 58
285 | 59
286 | 60
287 | 61
288 | 62
289 | 63
290 | 64
291 | 65
292 | 66
293 | 67
294 | 68
295 | 69
296 | 70
297 | 71
298 | 72
299 | 73
300 | 74
301 | 75
302 | 76
303 | 77
304 | 78
305 | 79
306 | 80
307 | 81
308 | 82
309 | 83
310 | 84
311 | 85
312 | 86
313 | 87
314 | 88
315 | 89
316 | 90
317 | 91
318 | 92
319 | 93
320 | 94
321 | 95
322 | 96
323 | 97
324 | 98
325 | 99
326 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.en:
--------------------------------------------------------------------------------
  1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
  2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
  3 | 
  4 | #any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
  5 | #usually upper case letters are initials in a name
  6 | A
  7 | B
  8 | C
  9 | D
 10 | E
 11 | F
 12 | G
 13 | H
 14 | I
 15 | J
 16 | K
 17 | L
 18 | M
 19 | N
 20 | O
 21 | P
 22 | Q
 23 | R
 24 | S
 25 | T
 26 | U
 27 | V
 28 | W
 29 | X
 30 | Y
 31 | Z
 32 | 
 33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
 34 | Adj
 35 | Adm
 36 | Adv
 37 | Asst
 38 | Bart
 39 | Bldg
 40 | Brig
 41 | Bros
 42 | Capt
 43 | Cmdr
 44 | Col
 45 | Comdr
 46 | Con
 47 | Corp
 48 | Cpl
 49 | DR
 50 | Dr
 51 | Drs
 52 | Ens
 53 | Gen
 54 | Gov
 55 | Hon
 56 | Hr
 57 | Hosp
 58 | Insp
 59 | Lt
 60 | MM
 61 | MR
 62 | MRS
 63 | MS
 64 | Maj
 65 | Messrs
 66 | Mlle
 67 | Mme
 68 | Mr
 69 | Mrs
 70 | Ms
 71 | Msgr
 72 | Op
 73 | Ord
 74 | Pfc
 75 | Ph
 76 | Prof
 77 | Pvt
 78 | Rep
 79 | Reps
 80 | Res
 81 | Rev
 82 | Rt
 83 | Sen
 84 | Sens
 85 | Sfc
 86 | Sgt
 87 | Sr
 88 | St
 89 | Supt
 90 | Surg
 91 | 
 92 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
 93 | v
 94 | vs
 95 | i.e
 96 | rev
 97 | e.g
 98 | # rupees
 99 | Rs
100 | 
101 | #Numbers only. These should only be treated as non-breaking when followed by a numeric sequence
102 | # add #NUMERIC_ONLY# after the word for this function
103 | #This case is mostly for the english "No." which can either be a sentence of its own, or
104 | #if followed by a number, a non-breaking prefix
105 | No #NUMERIC_ONLY# 
106 | Nos
107 | Art #NUMERIC_ONLY#
108 | Nr
109 | pp #NUMERIC_ONLY#
110 | 
111 | #month abbreviations
112 | Jan
113 | Feb
114 | Mar
115 | Apr
116 | #May is a full word
117 | Jun
118 | Jul
119 | Aug
120 | Sep
121 | Oct
122 | Nov
123 | Dec
124 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.es:
--------------------------------------------------------------------------------
  1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
  2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
  3 | 
  4 | #any single upper case letter  followed by a period is not a sentence ender
  5 | #usually upper case letters are initials in a name
  6 | A
  7 | B
  8 | C
  9 | D
 10 | E
 11 | F
 12 | G
 13 | H
 14 | I
 15 | J
 16 | K
 17 | L
 18 | M
 19 | N
 20 | O
 21 | P
 22 | Q
 23 | R
 24 | S
 25 | T
 26 | U
 27 | V
 28 | W
 29 | X
 30 | Y
 31 | Z
 32 | 
 33 | # Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm
 34 | 
 35 | A.C
 36 | Apdo
 37 | Av
 38 | Bco
 39 | CC.AA
 40 | Da
 41 | Dep
 42 | Dn
 43 | Dr
 44 | Dra
 45 | EE.UU
 46 | Excmo
 47 | FF.CC
 48 | Fil 
 49 | Gral
 50 | J.C
 51 | Let
 52 | Lic
 53 | N.B
 54 | P.D
 55 | P.V.P
 56 | Prof
 57 | Pts
 58 | Rte
 59 | S.A
 60 | S.A.R
 61 | S.E
 62 | S.L
 63 | S.R.C
 64 | Sr
 65 | Sra
 66 | Srta
 67 | Sta
 68 | Sto
 69 | T.V.E
 70 | Tel
 71 | Ud
 72 | Uds
 73 | V.B
 74 | V.E
 75 | Vd
 76 | Vds
 77 | a/c
 78 | adj
 79 | admón
 80 | afmo
 81 | apdo
 82 | av
 83 | c
 84 | c.f
 85 | c.g
 86 | cap
 87 | cm
 88 | cta
 89 | dcha
 90 | doc
 91 | ej
 92 | entlo
 93 | esq
 94 | etc
 95 | f.c
 96 | gr 
 97 | grs
 98 | izq
 99 | kg
100 | km
101 | mg
102 | mm
103 | nÃºm
104 | núm
105 | p
106 | p.a
107 | p.ej
108 | ptas
109 | pÃ¡g 
110 | pÃ¡gs
111 | pág
112 | págs
113 | q.e.g.e
114 | q.e.s.m
115 | s
116 | s.s.s
117 | vid
118 | vol
119 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.et:
--------------------------------------------------------------------------------
1 | nonbreaking_prefix.fi


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.fi:
--------------------------------------------------------------------------------
  1 | #Anything in this file, followed by a period (and an upper-case word), does NOT
  2 | #indicate an end-of-sentence marker.  Special cases are included for prefixes
  3 | #that ONLY appear before 0-9 numbers.
  4 | 
  5 | #This list is compiled from omorfi <http://code.google.com/p/omorfi> database
  6 | #by Tommi A Pirinen.
  7 | 
  8 | 
  9 | #any single upper case letter  followed by a period is not a sentence ender
 10 | A
 11 | B
 12 | C
 13 | D
 14 | E
 15 | F
 16 | G
 17 | H
 18 | I
 19 | J
 20 | K
 21 | L
 22 | M
 23 | N
 24 | O
 25 | P
 26 | Q
 27 | R
 28 | S
 29 | T
 30 | U
 31 | V
 32 | W
 33 | X
 34 | Y
 35 | Z
 36 | Å
 37 | Ä
 38 | Ö
 39 | 
 40 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
 41 | alik
 42 | alil
 43 | amir
 44 | apul
 45 | apul.prof
 46 | arkkit
 47 | ass
 48 | assist
 49 | dipl
 50 | dipl.arkkit
 51 | dipl.ekon
 52 | dipl.ins
 53 | dipl.kielenk
 54 | dipl.kirjeenv
 55 | dipl.kosm
 56 | dipl.urk
 57 | dos
 58 | erikoiseläinl
 59 | erikoishammasl
 60 | erikoisl
 61 | erikoist
 62 | ev.luutn
 63 | evp
 64 | fil
 65 | ft
 66 | hallinton
 67 | hallintot
 68 | hammaslääket
 69 | jatk
 70 | jääk
 71 | kansaned
 72 | kapt
 73 | kapt.luutn
 74 | kenr
 75 | kenr.luutn
 76 | kenr.maj
 77 | kers
 78 | kirjeenv
 79 | kom
 80 | kom.kapt
 81 | komm
 82 | konst
 83 | korpr
 84 | luutn
 85 | maist
 86 | maj
 87 | Mr
 88 | Mrs
 89 | Ms
 90 | M.Sc
 91 | neuv
 92 | nimim
 93 | Ph.D
 94 | prof
 95 | puh.joht
 96 | pääll
 97 | res
 98 | san
 99 | siht
100 | suom
101 | sähköp
102 | säv
103 | toht
104 | toim
105 | toim.apul
106 | toim.joht
107 | toim.siht
108 | tuom
109 | ups
110 | vänr
111 | vääp
112 | ye.ups
113 | ylik
114 | ylil
115 | ylim
116 | ylimatr
117 | yliop
118 | yliopp
119 | ylip
120 | yliv
121 | 
122 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall
123 | #into this category - it sometimes ends a sentence)
124 | e.g
125 | ent
126 | esim
127 | huom
128 | i.e
129 | ilm
130 | l
131 | mm
132 | myöh
133 | nk
134 | nyk
135 | par
136 | po
137 | t
138 | v
139 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.fr:
--------------------------------------------------------------------------------
  1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
  2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
  3 | #
  4 | #any single upper case letter  followed by a period is not a sentence ender
  5 | #usually upper case letters are initials in a name
  6 | #no French words end in single lower-case letters, so we throw those in too?
  7 | A
  8 | B
  9 | C
 10 | D
 11 | E
 12 | F
 13 | G
 14 | H
 15 | I
 16 | J
 17 | K
 18 | L
 19 | M
 20 | N
 21 | O
 22 | P
 23 | Q
 24 | R
 25 | S
 26 | T
 27 | U
 28 | V
 29 | W
 30 | X
 31 | Y
 32 | Z
 33 | #a
 34 | b
 35 | c
 36 | d
 37 | e
 38 | f
 39 | g
 40 | h
 41 | i
 42 | j
 43 | k
 44 | l
 45 | m
 46 | n
 47 | o
 48 | p
 49 | q
 50 | r
 51 | s
 52 | t
 53 | u
 54 | v
 55 | w
 56 | x
 57 | y
 58 | z
 59 | 
 60 | # Period-final abbreviation list for French
 61 | A.C.N
 62 | A.M
 63 | art
 64 | ann
 65 | apr
 66 | av
 67 | auj
 68 | lib
 69 | B.P
 70 | boul
 71 | ca
 72 | c.-à-d
 73 | cf
 74 | ch.-l
 75 | chap
 76 | contr
 77 | C.P.I
 78 | C.Q.F.D
 79 | C.N
 80 | C.N.S
 81 | C.S
 82 | dir
 83 | éd
 84 | e.g
 85 | env
 86 | al
 87 | etc
 88 | E.V
 89 | ex
 90 | fasc
 91 | fém
 92 | fig
 93 | fr
 94 | hab
 95 | ibid
 96 | id
 97 | i.e
 98 | inf
 99 | LL.AA
100 | LL.AA.II
101 | LL.AA.RR
102 | LL.AA.SS
103 | L.D
104 | LL.EE
105 | LL.MM
106 | LL.MM.II.RR
107 | loc.cit
108 | masc
109 | MM
110 | ms
111 | N.B
112 | N.D.A
113 | N.D.L.R
114 | N.D.T
115 | n/réf
116 | NN.SS
117 | N.S
118 | N.D
119 | N.P.A.I
120 | p.c.c
121 | pl
122 | pp
123 | p.ex
124 | p.j
125 | P.S
126 | R.A.S
127 | R.-V
128 | R.P
129 | R.I.P
130 | SS
131 | S.S
132 | S.A
133 | S.A.I
134 | S.A.R
135 | S.A.S
136 | S.E
137 | sec
138 | sect
139 | sing
140 | S.M
141 | S.M.I.R
142 | sq
143 | sqq
144 | suiv
145 | sup
146 | suppl
147 | tél
148 | T.S.V.P
149 | vb
150 | vol
151 | vs
152 | X.O
153 | Z.I
154 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.ga:
--------------------------------------------------------------------------------
 1 | 
 2 | A
 3 | B
 4 | C
 5 | D
 6 | E
 7 | F
 8 | G
 9 | H
10 | I
11 | J
12 | K
13 | L
14 | M
15 | N
16 | O
17 | P
18 | Q
19 | R
20 | S
21 | T
22 | U
23 | V
24 | W
25 | X
26 | Y
27 | Z
28 | Á
29 | É
30 | Í
31 | Ó
32 | Ú
33 | 
34 | Uacht
35 | Dr
36 | B.Arch
37 | 
38 | m.sh
39 | .i
40 | Co
41 | Cf
42 | cf
43 | i.e
44 | r
45 | Chr
46 | lch #NUMERIC_ONLY#
47 | lgh #NUMERIC_ONLY#
48 | uimh #NUMERIC_ONLY#
49 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.gu:
--------------------------------------------------------------------------------
  1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
  2 | 
  3 | #common exceptions
  4 | # Rs
  5 | રૂ
  6 | # Dr
  7 | ડો
  8 | # Dr
  9 | ડૉ
 10 | # Mr
 11 | શ્રી
 12 | 
 13 | #others
 14 | 
 15 | 
 16 | #phonetics
 17 | # A
 18 | એ
 19 | # B
 20 | બી
 21 | # C
 22 | સી
 23 | # D
 24 | ડી
 25 | # E
 26 | ઇ
 27 | # F
 28 | એફ
 29 | # G
 30 | જી
 31 | # H
 32 | એચ
 33 | # I
 34 | આઈ
 35 | # J
 36 | જે
 37 | # K
 38 | કે
 39 | # L
 40 | એલ
 41 | # M
 42 | એમ
 43 | # N
 44 | એન
 45 | # O
 46 | ઓ
 47 | # P
 48 | પી
 49 | # Q
 50 | ક્યૂ
 51 | # R
 52 | આર
 53 | # S
 54 | એસ
 55 | # T
 56 | ટી
 57 | # U
 58 | યુ
 59 | # V
 60 | વી
 61 | # W
 62 | ડબલ્યુ
 63 | # X
 64 | એક્સ
 65 | # Y
 66 | વાય
 67 | # Z
 68 | ઝેડ
 69 | 
 70 | #consonants
 71 | ક
 72 | ખ
 73 | ગ
 74 | ઘ
 75 | ઙ
 76 | ચ
 77 | છ
 78 | જ
 79 | ઝ
 80 | ઞ
 81 | ટ
 82 | ઠ
 83 | ડ
 84 | ઢ
 85 | ણ
 86 | ત
 87 | થ
 88 | દ
 89 | ધ
 90 | ન
 91 | પ
 92 | ફ
 93 | બ
 94 | ભ
 95 | મ
 96 | ય
 97 | ર
 98 | લ
 99 | ળ
100 | વ
101 | શ
102 | ષ
103 | સ
104 | હ
105 | 
106 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.hi:
--------------------------------------------------------------------------------
  1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
  2 | 
  3 | #common exceptions
  4 | # Rs
  5 | रु
  6 | # Dr
  7 | डॉ
  8 | # Dr
  9 | डा
 10 | # Mr
 11 | श्री
 12 | 
 13 | #others
 14 | टीवी
 15 | 
 16 | #phonetics
 17 | # A
 18 | ए
 19 | ऐ
 20 | # B
 21 | बी
 22 | # C
 23 | सी
 24 | # D
 25 | डी
 26 | # E
 27 | ई
 28 | # F
 29 | ऐफ
 30 | एफ
 31 | # G
 32 | जी
 33 | # H
 34 | ऐच
 35 | एच
 36 | # I
 37 | आइ
 38 | # J
 39 | जे
 40 | # K
 41 | के
 42 | # L
 43 | ऐल
 44 | एल
 45 | # M
 46 | ऐम
 47 | एम
 48 | # N
 49 | ऐन
 50 | एन
 51 | # O
 52 | ओ
 53 | # P
 54 | पी
 55 | # Q
 56 | क्यू
 57 | # R
 58 | आर
 59 | # S
 60 | ऐस
 61 | एस
 62 | # T
 63 | टी
 64 | # U
 65 | यू
 66 | # V
 67 | वी
 68 | # W
 69 | डब्ल्यू
 70 | # X
 71 | ऐक्स
 72 | एक्स
 73 | # Y
 74 | वाय
 75 | वाई
 76 | # Z
 77 | ज़ैड
 78 | 
 79 | #consonants
 80 | क
 81 | ख
 82 | ग
 83 | घ
 84 | ङ
 85 | च
 86 | छ
 87 | ज
 88 | झ
 89 | ञ
 90 | ट
 91 | ठ
 92 | ड
 93 | ढ
 94 | ण
 95 | त
 96 | थ
 97 | द
 98 | ध
 99 | न
100 | प
101 | फ
102 | ब
103 | भ
104 | म
105 | य
106 | र
107 | ल
108 | व
109 | श
110 | ष
111 | स
112 | ह
113 | 
114 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.hu:
--------------------------------------------------------------------------------
  1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
  2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
  3 | 
  4 | #any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
  5 | #usually upper case letters are initials in a name
  6 | A
  7 | B
  8 | C
  9 | D
 10 | E
 11 | F
 12 | G
 13 | H
 14 | I
 15 | J
 16 | K
 17 | L
 18 | M
 19 | N
 20 | O
 21 | P
 22 | Q
 23 | R
 24 | S
 25 | T
 26 | U
 27 | V
 28 | W
 29 | X
 30 | Y
 31 | Z
 32 | Á
 33 | É
 34 | Í
 35 | Ó
 36 | Ö
 37 | Ő
 38 | Ú
 39 | Ü
 40 | Ű
 41 | 
 42 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
 43 | Dr
 44 | dr
 45 | kb
 46 | Kb
 47 | vö
 48 | Vö
 49 | pl
 50 | Pl
 51 | ca
 52 | Ca
 53 | min
 54 | Min
 55 | max
 56 | Max
 57 | ún
 58 | Ún
 59 | prof
 60 | Prof
 61 | de
 62 | De
 63 | du
 64 | Du
 65 | Szt
 66 | St
 67 | 
 68 | #Numbers only. These are treated as non-breaking prefixes only when followed by a numeric sequence
 69 | # add #NUMERIC_ONLY# after the word to enable this behaviour
 70 | #This case is mostly for the English "No." which can either be a sentence of its own, or
 71 | #if followed by a number, a non-breaking prefix
 72 | 
 73 | # Month name abbreviations
 74 | jan #NUMERIC_ONLY#
 75 | Jan #NUMERIC_ONLY#
 76 | Feb #NUMERIC_ONLY#
 77 | feb #NUMERIC_ONLY#
 78 | márc #NUMERIC_ONLY#
 79 | Márc #NUMERIC_ONLY#
 80 | ápr #NUMERIC_ONLY#
 81 | Ápr #NUMERIC_ONLY#
 82 | máj #NUMERIC_ONLY#
 83 | Máj #NUMERIC_ONLY#
 84 | jún #NUMERIC_ONLY#
 85 | Jún #NUMERIC_ONLY#
 86 | Júl #NUMERIC_ONLY#
 87 | júl #NUMERIC_ONLY#
 88 | aug #NUMERIC_ONLY#
 89 | Aug #NUMERIC_ONLY#
 90 | Szept #NUMERIC_ONLY#
 91 | szept #NUMERIC_ONLY#
 92 | okt #NUMERIC_ONLY#
 93 | Okt #NUMERIC_ONLY#
 94 | nov #NUMERIC_ONLY#
 95 | Nov #NUMERIC_ONLY#
 96 | dec #NUMERIC_ONLY#
 97 | Dec #NUMERIC_ONLY#
 98 | 
 99 | # Other abbreviations
100 | tel #NUMERIC_ONLY#
101 | Tel #NUMERIC_ONLY#
102 | Fax #NUMERIC_ONLY#
103 | fax #NUMERIC_ONLY#
104 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.is:
--------------------------------------------------------------------------------
  1 | no #NUMERIC_ONLY#
  2 | No #NUMERIC_ONLY#
  3 | nr #NUMERIC_ONLY#
  4 | Nr #NUMERIC_ONLY#
  5 | nR #NUMERIC_ONLY#
  6 | NR #NUMERIC_ONLY#
  7 | a
  8 | b
  9 | c
 10 | d
 11 | e
 12 | f
 13 | g
 14 | h
 15 | i
 16 | j
 17 | k
 18 | l
 19 | m
 20 | n
 21 | o
 22 | p
 23 | q
 24 | r
 25 | s
 26 | t
 27 | u
 28 | v
 29 | w
 30 | x
 31 | y
 32 | z
 33 | ^
 34 | í
 35 | á
 36 | ó
 37 | æ
 38 | A
 39 | B
 40 | C
 41 | D
 42 | E
 43 | F
 44 | G
 45 | H
 46 | I
 47 | J
 48 | K
 49 | L
 50 | M
 51 | N
 52 | O
 53 | P
 54 | Q
 55 | R
 56 | S
 57 | T
 58 | U
 59 | V
 60 | W
 61 | X
 62 | Y
 63 | Z
 64 | ab.fn
 65 | a.fn
 66 | afs
 67 | al
 68 | alm
 69 | alg
 70 | andh
 71 | ath
 72 | aths
 73 | atr
 74 | ao
 75 | au
 76 | aukaf
 77 | áfn
 78 | áhrl.s
 79 | áhrs
 80 | ákv.gr
 81 | ákv
 82 | bh
 83 | bls
 84 | dr
 85 | e.Kr
 86 | et
 87 | ef
 88 | efn
 89 | ennfr
 90 | eink
 91 | end
 92 | e.st
 93 | erl
 94 | fél
 95 | fskj
 96 | fh
 97 | f.hl
 98 | físl
 99 | fl
100 | fn
101 | fo
102 | forl
103 | frb
104 | frl
105 | frh
106 | frt
107 | fsl
108 | fsh
109 | fs
110 | fsk
111 | fst
112 | f.Kr
113 | ft
114 | fv
115 | fyrrn
116 | fyrrv
117 | germ
118 | gm
119 | gr
120 | hdl
121 | hdr
122 | hf
123 | hl
124 | hlsk
125 | hljsk
126 | hljv
127 | hljóðv
128 | hr
129 | hv
130 | hvk
131 | holl
132 | Hos
133 | höf
134 | hk
135 | hrl
136 | ísl
137 | kaf
138 | kap
139 | Khöfn
140 | kk
141 | kg
142 | kk
143 | km
144 | kl
145 | klst
146 | kr
147 | kt
148 | kgúrsk
149 | kvk
150 | leturbr
151 | lh
152 | lh.nt
153 | lh.þt
154 | lo
155 | ltr
156 | mlja
157 | mljó
158 | millj
159 | mm
160 | mms
161 | m.fl
162 | miðm
163 | mgr
164 | mst
165 | mín
166 | nf
167 | nh
168 | nhm
169 | nl
170 | nk
171 | nmgr
172 | no
173 | núv
174 | nt
175 | o.áfr
176 | o.m.fl
177 | ohf
178 | o.fl
179 | o.s.frv
180 | ófn
181 | ób
182 | óákv.gr
183 | óákv
184 | pfn
185 | PR
186 | pr
187 | Ritstj
188 | Rvík
189 | Rvk
190 | samb
191 | samhlj
192 | samn
193 | samn
194 | sbr
195 | sek
196 | sérn
197 | sf
198 | sfn
199 | sh
200 | sfn
201 | sh
202 | s.hl
203 | sk
204 | skv
205 | sl
206 | sn
207 | so
208 | ss.us
209 | s.st
210 | samþ
211 | sbr
212 | shlj
213 | sign
214 | skál
215 | st
216 | st.s
217 | stk
218 | sþ
219 | teg
220 | tbl
221 | tfn
222 | tl
223 | tvíhlj
224 | tvt
225 | till
226 | to
227 | umr
228 | uh
229 | us
230 | uppl
231 | útg
232 | vb
233 | Vf
234 | vh
235 | vkf
236 | Vl
237 | vl
238 | vlf
239 | vmf
240 | 8vo
241 | vsk
242 | vth
243 | þt
244 | þf
245 | þjs
246 | þgf
247 | þlt
248 | þolm
249 | þm
250 | þml
251 | þýð
252 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.it:
--------------------------------------------------------------------------------
  1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
  2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
  3 | 
  4 | #any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
  5 | #usually upper case letters are initials in a name
  6 | A
  7 | B
  8 | C
  9 | D
 10 | E
 11 | F
 12 | G
 13 | H
 14 | I
 15 | J
 16 | K
 17 | L
 18 | M
 19 | N
 20 | O
 21 | P
 22 | Q
 23 | R
 24 | S
 25 | T
 26 | U
 27 | V
 28 | W
 29 | X
 30 | Y
 31 | Z
 32 | 
 33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
 34 | Adj
 35 | Adm
 36 | Adv
 37 | Amn 
 38 | Arch 
 39 | Asst
 40 | Avv
 41 | Bart
 42 | Bcc
 43 | Bldg
 44 | Brig
 45 | Bros
 46 | C.A.P
 47 | C.P
 48 | Capt
 49 | Cc
 50 | Cmdr
 51 | Co
 52 | Col
 53 | Comdr
 54 | Con
 55 | Corp
 56 | Cpl
 57 | DR
 58 | Dott
 59 | Dr
 60 | Drs
 61 | Egr
 62 | Ens
 63 | Gen
 64 | Geom
 65 | Gov
 66 | Hon
 67 | Hosp
 68 | Hr
 69 | Id
 70 | Ing
 71 | Insp
 72 | Lt
 73 | MM
 74 | MR
 75 | MRS
 76 | MS
 77 | Maj
 78 | Messrs
 79 | Mlle
 80 | Mme
 81 | Mo
 82 | Mons
 83 | Mr
 84 | Mrs
 85 | Ms
 86 | Msgr
 87 | N.B
 88 | Op
 89 | Ord
 90 | P.S
 91 | P.T
 92 | Pfc
 93 | Ph
 94 | Prof
 95 | Pvt
 96 | RP
 97 | RSVP
 98 | Rag
 99 | Rep
100 | Reps
101 | Res
102 | Rev
103 | Rif
104 | Rt
105 | S.A
106 | S.B.F
107 | S.P.M
108 | S.p.A
109 | S.r.l
110 | Sen
111 | Sens
112 | Sfc
113 | Sgt
114 | Sig
115 | Sigg
116 | Soc
117 | Spett
118 | Sr
119 | St
120 | Supt
121 | Surg
122 | V.P
123 | 
124 | # other
125 | a.c 
126 | acc
127 | all 
128 | banc
129 | c.a
130 | c.c.p
131 | c.m
132 | c.p
133 | c.s
134 | c.v
135 | corr
136 | dott
137 | e.p.c
138 | ecc
139 | es 
140 | fatt
141 | gg
142 | int
143 | lett
144 | ogg
145 | on
146 | p.c
147 | p.c.c
148 | p.es
149 | p.f
150 | p.r
151 | p.v
152 | post
153 | pp
154 | racc
155 | ric
156 | s.n.c
157 | seg
158 | sgg
159 | ss
160 | tel
161 | u.s
162 | v.r
163 | v.s
164 | 
165 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
166 | v
167 | vs
168 | i.e
169 | rev
170 | e.g
171 | 
172 | #Numbers only. These are treated as non-breaking prefixes only when followed by a numeric sequence
173 | # add #NUMERIC_ONLY# after the word to enable this behaviour
174 | #This case is mostly for the English "No." which can either be a sentence of its own, or
175 | #if followed by a number, a non-breaking prefix
176 | No #NUMERIC_ONLY# 
177 | Nos
178 | Art #NUMERIC_ONLY#
179 | Nr
180 | pp #NUMERIC_ONLY#
181 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.kn:
--------------------------------------------------------------------------------
 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
 2 | 
 3 | #common exceptions
 4 | # Rs
 5 | ರೂ
 6 | # Dr
 7 | ಡಾ
 8 | # Mr
 9 | ಶ್ರೀ
10 | 
11 | #others
12 | 
13 | 
14 | #phonetics
15 | # A
16 | ಎ
17 | # B
18 | ಬಿ
19 | # C
20 | ಸಿ
21 | # D
22 | ಡಿ
23 | # E
24 | ಇ
25 | # F
26 | ಎಫ್
27 | # G
28 | ಜಿ
29 | # H
30 | ಹೆಚ್
31 | ಎಚ್‌
32 | # I
33 | ಐ
34 | # J
35 | ಜೆ
36 | # K
37 | ಕೆ
38 | # L
39 | ಎಲ್
40 | # M
41 | ಎಂ
42 | # N
43 | ಎನ್
44 | # O
45 | ಒ
46 | # P
47 | ಪಿ
48 | # Q
49 | ಕ್ಯೂ
50 | # R
51 | ಆರ್
52 | # S
53 | ಎಸ್
54 | # T
55 | ಟಿ
56 | # U
57 | ಯು
58 | # V
59 | ವಿ
60 | # W
61 | ಡಬ್ಲ್ಯೂ
62 | # X
63 | ಎಕ್ಸ್
64 | # Y
65 | ವೈ
66 | # Z
67 | ಜೆಡ್
68 | 
69 | #consonants
70 | 
71 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.lv:
--------------------------------------------------------------------------------
  1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
  2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
  3 | 
  4 | #any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
  5 | #usually upper case letters are initials in a name
  6 | A
  7 | Ā
  8 | B
  9 | C
 10 | Č
 11 | D
 12 | E
 13 | Ē
 14 | F
 15 | G
 16 | Ģ
 17 | H
 18 | I
 19 | Ī
 20 | J
 21 | K
 22 | Ķ
 23 | L
 24 | Ļ
 25 | M
 26 | N
 27 | Ņ
 28 | O
 29 | P
 30 | Q
 31 | R
 32 | S
 33 | Š
 34 | T
 35 | U
 36 | Ū
 37 | V
 38 | W
 39 | X
 40 | Y
 41 | Z
 42 | Ž
 43 | 
 44 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
 45 | dr
 46 | Dr
 47 | med
 48 | prof
 49 | Prof
 50 | inž
 51 | Inž
 52 | ist.loc
 53 | Ist.loc
 54 | kor.loc
 55 | Kor.loc
 56 | v.i
 57 | vietn
 58 | Vietn
 59 | 
 60 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
 61 | a.l
 62 | t.p
 63 | pārb
 64 | Pārb
 65 | vec
 66 | Vec
 67 | inv
 68 | Inv
 69 | sk
 70 | Sk
 71 | spec
 72 | Spec
 73 | vienk
 74 | Vienk
 75 | virz
 76 | Virz
 77 | māksl
 78 | Māksl
 79 | mūz
 80 | Mūz
 81 | akad
 82 | Akad
 83 | soc
 84 | Soc
 85 | galv
 86 | Galv
 87 | vad
 88 | Vad
 89 | sertif
 90 | Sertif
 91 | folkl
 92 | Folkl
 93 | hum
 94 | Hum
 95 | 
 96 | #Numbers only. These are treated as non-breaking prefixes only when followed by a numeric sequence
 97 | # add #NUMERIC_ONLY# after the word to enable this behaviour
 98 | #This case is mostly for the English "No." which can either be a sentence of its own, or
 99 | #if followed by a number, a non-breaking prefix
100 | Nr #NUMERIC_ONLY# 
101 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.ml:
--------------------------------------------------------------------------------
 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
 2 | 
 3 | #common exceptions
 4 | # Dr
 5 | ഡോ
 6 | # Mr
 7 | ശ്രീ
 8 | 
 9 | #others
10 | 
11 | 
12 | #phonetics
13 | # A
14 | എ
15 | # B
16 | ബി
17 | # C
18 | സി
19 | # D
20 | ഡി
21 | # E
22 | ഇ
23 | # F
24 | എഫ്
25 | # G
26 | ജി
27 | # H
28 | എച്ച്
29 | # I
30 | ഐ
31 | # J
32 | ജെ
33 | # K
34 | കെ
35 | # L
36 | എൽ
37 | # M
38 | എം
39 | # N
40 | എൻ
41 | # O
42 | ഒ
43 | # P
44 | പി 
45 | # Q
46 | ക്യൂ
47 | # R
48 | ആർ
49 | # S
50 | എസ്
51 | # T
52 | ടി
53 | # U
54 | യു
55 | # V
56 | വി
57 | # W
58 | ഡബ്ല്യു
59 | # X
60 | എക്സ്
61 | # Y
62 | വൈ
63 | # Z
64 | സെഡ്
65 | 
66 | #consonants
67 | 
68 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.mni:
--------------------------------------------------------------------------------
 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
 2 | 
 3 | #common exceptions
 4 | # Dr
 5 | দা
 6 | 
 7 | #others
 8 | 
 9 | 
10 | #phonetics
11 | # A
12 | এ
13 | # B
14 | বি
15 | # C
16 | সি
17 | # D
18 | ডি
19 | # E
20 | ই
21 | # F
22 | এফ
23 | # G
24 | জি
25 | # H
26 | এইচ
27 | # I
28 | আম
29 | # J
30 | জে
31 | # K
32 | কে
33 | # L
34 | এল
35 | # M
36 | এম
37 | # N
38 | এন
39 | # O
40 | হে
41 | # P
42 | পি
43 | # Q
44 | কিউ
45 | # R
46 | আর
47 | # S
48 | এস
49 | # T
50 | টি
51 | # U
52 | ইউ
53 | # V
54 | ভি 
55 | # W
56 | ডব্লু
57 | # X
58 | এক্স
59 | # Y
60 | ওয়াই
61 | # Z
62 | জেড
63 | 
64 | #consonants
65 | 
66 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.mr:
--------------------------------------------------------------------------------
  1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
  2 | 
  3 | #common exceptions
  4 | # Rs
  5 | रु
  6 | # Dr
  7 | डॉ
  8 | # Dr
  9 | डा
 10 | # Mr
 11 | श्री
 12 | 
 13 | #others
 14 | 
 15 | 
 16 | #phonetics
 17 | # A
 18 | ए
 19 | ऐ
 20 | # B
 21 | बी
 22 | # C
 23 | सी
 24 | # D
 25 | डी
 26 | # E
 27 | ई
 28 | # F
 29 | ऐफ
 30 | एफ
 31 | # G
 32 | जी
 33 | # H
 34 | ऐच
 35 | एच
 36 | # I
 37 | आइ
 38 | # J
 39 | जे
 40 | # K
 41 | के
 42 | # L
 43 | ऐल
 44 | एल
 45 | # M
 46 | ऐम
 47 | एम
 48 | # N
 49 | ऐन
 50 | एन
 51 | # O
 52 | ओ
 53 | # P
 54 | पी
 55 | # Q
 56 | क्यू
 57 | # R
 58 | आर
 59 | # S
 60 | ऐस
 61 | एस
 62 | # T
 63 | टी
 64 | # U
 65 | यू
 66 | # V
 67 | वी
 68 | # W
 69 | डब्ल्यू
 70 | # X
 71 | ऐक्स
 72 | एक्स
 73 | # Y
 74 | वाय
 75 | वाई
 76 | # Z
 77 | ज़ैड
 78 | 
 79 | #consonants
 80 | क
 81 | ख
 82 | ग
 83 | घ
 84 | ङ
 85 | च
 86 | छ
 87 | ज
 88 | झ
 89 | ञ
 90 | ट
 91 | ठ
 92 | ड
 93 | ढ
 94 | ण
 95 | त
 96 | थ
 97 | द
 98 | ध
 99 | न
100 | प
101 | फ
102 | ब
103 | भ
104 | म
105 | य
106 | र
107 | ल
108 | व
109 | श
110 | ष
111 | स
112 | ह
113 | 
114 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.nl:
--------------------------------------------------------------------------------
  1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
  2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
  3 | #Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen 
  4 | #         http://nl.wikipedia.org/wiki/Aanspreekvorm
  5 | #         http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs
  6 | #any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
  7 | #usually upper case letters are initials in a name
  8 | A
  9 | B
 10 | C
 11 | D
 12 | E
 13 | F
 14 | G
 15 | H
 16 | I
 17 | J
 18 | K
 19 | L
 20 | M
 21 | N
 22 | O
 23 | P
 24 | Q
 25 | R
 26 | S
 27 | T
 28 | U
 29 | V
 30 | W
 31 | X
 32 | Y
 33 | Z
 34 | 
 35 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
 36 | bacc
 37 | bc
 38 | bgen
 39 | c.i
 40 | dhr
 41 | dr
 42 | dr.h.c
 43 | drs
 44 | drs
 45 | ds
 46 | eint
 47 | fa
 48 | Fa
 49 | fam
 50 | gen
 51 | genm
 52 | ing
 53 | ir
 54 | jhr
 55 | jkvr
 56 | jr
 57 | kand
 58 | kol
 59 | lgen
 60 | lkol
 61 | Lt
 62 | maj
 63 | Mej
 64 | mevr
 65 | Mme
 66 | mr
 67 | mr
 68 | Mw
 69 | o.b.s
 70 | plv
 71 | prof
 72 | ritm
 73 | tint
 74 | Vz
 75 | Z.D
 76 | Z.D.H
 77 | Z.E
 78 | Z.Em
 79 | Z.H
 80 | Z.K.H
 81 | Z.K.M
 82 | Z.M
 83 | z.v
 84 | 
 85 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
 86 | #we seem to have a lot of these in Dutch, e.g. i.p.v - in plaats van (instead of) - which never ends a sentence
 87 | a.g.v
 88 | bijv
 89 | bijz
 90 | bv
 91 | d.w.z
 92 | e.c
 93 | e.g
 94 | e.k
 95 | ev
 96 | i.p.v
 97 | i.s.m
 98 | i.t.t
 99 | i.v.m
100 | m.a.w
101 | m.b.t
102 | m.b.v
103 | m.h.o
104 | m.i
105 | m.i.v
106 | v.w.t
107 | 
108 | #Numbers only. These are treated as non-breaking prefixes only when followed by a numeric sequence
109 | # add #NUMERIC_ONLY# after the word to enable this behaviour
110 | #This case is mostly for the English "No." which can either be a sentence of its own, or
111 | #if followed by a number, a non-breaking prefix
112 | Nr #NUMERIC_ONLY# 
113 | Nrs 
114 | nrs
115 | nr #NUMERIC_ONLY#
116 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.or:
--------------------------------------------------------------------------------
  1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
  2 | 
  3 | #common exceptions
  4 | # Mr
  5 | ରୀ
  6 | 
  7 | #others
  8 | 
  9 | 
 10 | #phonetics
 11 | # A
 12 | 
 13 | # B
 14 | 
 15 | # C
 16 | 
 17 | # D
 18 | 
 19 | # E
 20 | 
 21 | # F
 22 | 
 23 | # G
 24 | 
 25 | # H
 26 | 
 27 | # I
 28 | 
 29 | # J
 30 | 
 31 | # K
 32 | 
 33 | # L
 34 | 
 35 | # M
 36 | 
 37 | # N
 38 | 
 39 | # O
 40 | 
 41 | # P
 42 | 
 43 | # Q
 44 | 
 45 | # R
 46 | 
 47 | # S
 48 | 
 49 | # T
 50 | 
 51 | # U
 52 | 
 53 | # V
 54 | 
 55 | # W
 56 | 
 57 | # X
 58 | 
 59 | # Y
 60 | 
 61 | # Z
 62 | 
 63 | 
 64 | #consonants
 65 | କ
 66 | ଖ
 67 | ଗ
 68 | ଘ
 69 | ଙ
 70 | ଚ
 71 | ଛ
 72 | ଜ
 73 | ଝ
 74 | ଞ
 75 | ଟ
 76 | ଠ
 77 | ଡ
 78 | ଢ
 79 | ଣ
 80 | ତ
 81 | ଥ
 82 | ଦ
 83 | ଧ
 84 | ନ
 85 | ପ
 86 | ଫ
 87 | ବ
 88 | ଵ
 89 | ଭ
 90 | ମ
 91 | ଯ
 92 | ୟ
 93 | ର
 94 | ଲ
 95 | ଳ
 96 | ୱ
 97 | ଶ
 98 | ଷ
 99 | ସ
100 | ହ
101 | 
102 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.pa:
--------------------------------------------------------------------------------
  1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
  2 | 
  3 | #common exceptions
  4 | # Dr
  5 | ਡਾ
  6 | # Prof
  7 | ਪ੍ਰੋ
  8 | # Mr
  9 | ਸ੍ਰੀ
 10 | 
 11 | #others
 12 | 
 13 | 
 14 | #phonetics
 15 | # A
 16 | ਏ
 17 | # B
 18 | ਬੀ
 19 | # C
 20 | ਸੀ
 21 | # D
 22 | ਡੀ
 23 | # E
 24 | ਈ
 25 | # F
 26 | ਐੱਫ
 27 | # G
 28 | ਜੀ
 29 | # H
 30 | ਐਚ
 31 | # I
 32 | ਆਈ
 33 | # J
 34 | ਜੇ
 35 | # K
 36 | ਕੇ
 37 | # L
 38 | ਐਲ
 39 | # M
 40 | ਐੱਮ
 41 | # N
 42 | ਐੱਨ
 43 | # O
 44 | ਓ
 45 | # P
 46 | ਪੀ
 47 | # Q
 48 | ਕੀਓ
 49 | # R
 50 | ਆਰ
 51 | # S
 52 | ਐੱਸ
 53 | ਸ
 54 | # T
 55 | ਟੀ
 56 | # U
 57 | ਯੂ
 58 | # V
 59 | ਵੀ
 60 | # W
 61 | ਡਬਲਿਊ
 62 | # X
 63 | ਐਕ੍ਸ
 64 | # Y
 65 | ਵਾਈ
 66 | # Z
 67 | ਜ਼ੈਡ
 68 | 
 69 | #consonants
 70 | ਕ
 71 | ਖ
 72 | ਗ
 73 | ਘ
 74 | ਙ
 75 | ਚ
 76 | ਛ
 77 | ਜ
 78 | ਝ
 79 | ਞ
 80 | ਟ
 81 | ਠ
 82 | ਡ
 83 | ਢ
 84 | ਣ
 85 | ਤ
 86 | ਥ
 87 | ਦ
 88 | ਧ
 89 | ਨ
 90 | ਪ
 91 | ਫ
 92 | ਬ
 93 | ਭ
 94 | ਮ
 95 | ਯ
 96 | ਰ
 97 | ਲ
 98 | ਵ
 99 | ੜ
100 | ਸ
101 | ਹ
102 | 
103 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.pl:
--------------------------------------------------------------------------------
  1 | adw
  2 | afr
  3 | akad
  4 | al
  5 | Al
  6 | am
  7 | amer
  8 | arch
  9 | art
 10 | Art
 11 | artyst
 12 | astr
 13 | austr
 14 | bałt
 15 | bdb
 16 | bł
 17 | bm
 18 | br
 19 | bryg
 20 | bryt
 21 | centr
 22 | ces
 23 | chem
 24 | chiń
 25 | chir
 26 | c.k
 27 | c.o
 28 | cyg
 29 | cyw
 30 | cyt
 31 | czes
 32 | czw
 33 | cd
 34 | Cd
 35 | czyt
 36 | ćw
 37 | ćwicz
 38 | daw
 39 | dcn
 40 | dekl
 41 | demokr
 42 | det
 43 | diec
 44 | dł
 45 | dn
 46 | dot
 47 | dol
 48 | dop
 49 | dost
 50 | dosł
 51 | h.c
 52 | ds
 53 | dst
 54 | duszp
 55 | dypl
 56 | egz
 57 | ekol
 58 | ekon
 59 | elektr
 60 | em
 61 | ew
 62 | fab
 63 | farm
 64 | fot
 65 | fr
 66 | gat
 67 | gastr
 68 | geogr
 69 | geol
 70 | gimn
 71 | głęb
 72 | gm
 73 | godz
 74 | górn
 75 | gosp
 76 | gr
 77 | gram
 78 | hist
 79 | hiszp
 80 | hr
 81 | Hr
 82 | hot
 83 | id
 84 | in
 85 | im
 86 | iron
 87 | jn
 88 | kard
 89 | kat
 90 | katol
 91 | k.k
 92 | kk
 93 | kol
 94 | kl
 95 | k.p.a
 96 | kpc
 97 | k.p.c
 98 | kpt
 99 | kr
100 | k.r
101 | krak
102 | k.r.o
103 | kryt
104 | kult
105 | laic
106 | łac
107 | niem
108 | woj
109 | nb
110 | np
111 | Nb
112 | Np
113 | pol
114 | pow
115 | m.in
116 | pt
117 | ps
118 | Pt
119 | Ps
120 | cdn
121 | jw
122 | ryc
123 | rys
124 | Ryc
125 | Rys
126 | tj
127 | tzw
128 | Tzw
129 | tzn
130 | zob
131 | ang
132 | ub
133 | ul
134 | pw
135 | pn
136 | pl
137 | al
138 | k
139 | n
140 | nr #NUMERIC_ONLY#
141 | Nr #NUMERIC_ONLY#
142 | ww
143 | wł
144 | ur
145 | zm
146 | żyd
147 | żarg
148 | żyw
149 | wył
150 | bp
151 | bp
152 | wyst
153 | tow
154 | Tow
155 | o
156 | sp
157 | Sp
158 | st
159 | spółdz
160 | Spółdz
161 | społ
162 | spółgł
163 | stoł
164 | stow
165 | Stoł
166 | Stow
167 | zn
168 | zew
169 | zewn
170 | zdr
171 | zazw
172 | zast
173 | zaw
174 | zał
175 | zal
176 | zam
177 | zak
178 | zakł
179 | zagr
180 | zach
181 | adw
182 | Adw
183 | lek
184 | Lek
185 | med
186 | mec
187 | Mec
188 | doc
189 | Doc
190 | dyw
191 | dyr
192 | Dyw
193 | Dyr
194 | inż
195 | Inż
196 | mgr
197 | Mgr
198 | dh
199 | dr
200 | Dh
201 | Dr
202 | p
203 | P
204 | red
205 | Red
206 | prof
207 | prok
208 | Prof
209 | Prok
210 | hab
211 | płk
212 | Płk
213 | nadkom
214 | Nadkom
215 | podkom
216 | Podkom
217 | ks
218 | Ks
219 | gen
220 | Gen
221 | por
222 | Por
223 | reż
224 | Reż
225 | przyp
226 | Przyp
227 | śp
228 | św
229 | śW
230 | Śp
231 | Św
232 | ŚW
233 | szer
234 | Szer
235 | pkt #NUMERIC_ONLY#
236 | str #NUMERIC_ONLY#
237 | tab #NUMERIC_ONLY#
238 | Tab #NUMERIC_ONLY#
239 | tel
240 | ust #NUMERIC_ONLY#
241 | par #NUMERIC_ONLY#
242 | poz
243 | pok
244 | oo
245 | oO
246 | Oo
247 | OO
248 | r #NUMERIC_ONLY#
249 | l #NUMERIC_ONLY#
250 | s #NUMERIC_ONLY#
251 | najśw
252 | Najśw
253 | A
254 | B
255 | C
256 | D
257 | E
258 | F
259 | G
260 | H
261 | I
262 | J
263 | K
264 | L
265 | M
266 | N
267 | O
268 | P
269 | Q
270 | R
271 | S
272 | T
273 | U
274 | V
275 | W
276 | X
277 | Y
278 | Z
279 | Ś
280 | Ć
281 | Ż
282 | Ź
283 | Dz
284 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.pt:
--------------------------------------------------------------------------------
  1 | #File adapted for PT by H. Leal Fontes from the EN & DE versions published with moses-2009-04-13. Last update: 10.11.2009.
  2 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
  3 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
  4 | 
  5 | #any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
  6 | #usually upper case letters are initials in a name
  7 | A
  8 | B
  9 | C
 10 | D
 11 | E
 12 | F
 13 | G
 14 | H
 15 | I
 16 | J
 17 | K
 18 | L
 19 | M
 20 | N
 21 | O
 22 | P
 23 | Q
 24 | R
 25 | S
 26 | T
 27 | U
 28 | V
 29 | W
 30 | X
 31 | Y
 32 | Z
 33 | a
 34 | b
 35 | c
 36 | d
 37 | e
 38 | f
 39 | g
 40 | h
 41 | i
 42 | j
 43 | k
 44 | l
 45 | m
 46 | n
 47 | o
 48 | p
 49 | q
 50 | r
 51 | s
 52 | t
 53 | u
 54 | v
 55 | w
 56 | x
 57 | y
 58 | z
 59 | 
 60 | 
 61 | #Roman Numerals. A dot after one of these is not a sentence break in Portuguese.
 62 | I
 63 | II
 64 | III
 65 | IV
 66 | V
 67 | VI
 68 | VII
 69 | VIII
 70 | IX
 71 | X
 72 | XI
 73 | XII
 74 | XIII
 75 | XIV
 76 | XV
 77 | XVI
 78 | XVII
 79 | XVIII
 80 | XIX
 81 | XX
 82 | i
 83 | ii
 84 | iii
 85 | iv
 86 | v
 87 | vi
 88 | vii
 89 | viii
 90 | ix
 91 | x
 92 | xi
 93 | xii
 94 | xiii
 95 | xiv
 96 | xv
 97 | xvi
 98 | xvii
 99 | xviii
100 | xix
101 | xx
102 | 
103 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
104 | Adj
105 | Adm
106 | Adv
107 | Art
108 | Ca
109 | Capt
110 | Cmdr
111 | Col
112 | Comdr
113 | Con
114 | Corp
115 | Cpl
116 | DR
117 | DRA
118 | Dr
119 | Dra
120 | Dras
121 | Drs
122 | Eng
123 | Enga
124 | Engas
125 | Engos
126 | Ex
127 | Exo
128 | Exmo
129 | Fig
130 | Gen
131 | Hosp
132 | Insp
133 | Lda
134 | MM
135 | MR
136 | MRS
137 | MS
138 | Maj
139 | Mrs
140 | Ms
141 | Msgr
142 | Op
143 | Ord
144 | Pfc
145 | Ph
146 | Prof
147 | Pvt
148 | Rep
149 | Reps
150 | Res
151 | Rev
152 | Rt
153 | Sen
154 | Sens
155 | Sfc
156 | Sgt
157 | Sr
158 | Sra
159 | Sras
160 | Srs
161 | Sto
162 | Supt
163 | Surg
164 | adj
165 | adm
166 | adv
167 | art
168 | cit
169 | col
170 | con
171 | corp
172 | cpl
173 | dr
174 | dra
175 | dras
176 | drs
177 | eng
178 | enga
179 | engas
180 | engos
181 | ex
182 | exo
183 | exmo
184 | fig
185 | op
186 | prof
187 | sr
188 | sra
189 | sras
190 | srs
191 | sto
192 | 
193 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
194 | v
195 | vs
196 | i.e
197 | rev
198 | e.g
199 | 
200 | #Numbers only. These are treated as non-breaking prefixes only when followed by a numeric sequence
201 | # add #NUMERIC_ONLY# after the word to enable this behaviour
202 | #This case is mostly for the English "No." which can either be a sentence of its own, or
203 | #if followed by a number, a non-breaking prefix
204 | No #NUMERIC_ONLY# 
205 | Nos
206 | Art #NUMERIC_ONLY#
207 | Nr
208 | p #NUMERIC_ONLY#
209 | pp #NUMERIC_ONLY#
210 | 
211 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.ro:
--------------------------------------------------------------------------------
 1 | A
 2 | B
 3 | C
 4 | D
 5 | E
 6 | F
 7 | G
 8 | H
 9 | I
10 | J
11 | K
12 | L
13 | M
14 | N
15 | O
16 | P
17 | Q
18 | R
19 | S
20 | T
21 | U
22 | V
23 | W
24 | X
25 | Y
26 | Z
27 | dpdv
28 | etc
29 | șamd
30 | M.Ap.N
31 | dl
32 | Dl
33 | d-na
34 | D-na
35 | dvs
36 | Dvs
37 | pt
38 | Pt
39 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.ru:
--------------------------------------------------------------------------------
  1 | # added Cyrillic uppercase letters [А-Я]
  2 | # removed 000D carriage return (this is not removed by chomp in tokenizer.perl, and prevents recognition of the prefixes)
  3 | # edited by Kate Young (nspaceanalysis@earthlink.net) 21 May 2013
  4 | А
  5 | Б
  6 | В
  7 | Г
  8 | Д
  9 | Е
 10 | Ж
 11 | З
 12 | И
 13 | Й
 14 | К
 15 | Л
 16 | М
 17 | Н
 18 | О
 19 | П
 20 | Р
 21 | С
 22 | Т
 23 | У
 24 | Ф
 25 | Х
 26 | Ц
 27 | Ч
 28 | Ш
 29 | Щ
 30 | Ъ
 31 | Ы
 32 | Ь
 33 | Э
 34 | Ю
 35 | Я
 36 | A
 37 | B
 38 | C
 39 | D
 40 | E
 41 | F
 42 | G
 43 | H
 44 | I
 45 | J
 46 | K
 47 | L
 48 | M
 49 | N
 50 | O
 51 | P
 52 | Q
 53 | R
 54 | S
 55 | T
 56 | U
 57 | V
 58 | W
 59 | X
 60 | Y
 61 | Z
 62 | 0гг
 63 | 1гг
 64 | 2гг
 65 | 3гг
 66 | 4гг
 67 | 5гг
 68 | 6гг
 69 | 7гг
 70 | 8гг
 71 | 9гг
 72 | 0г
 73 | 1г
 74 | 2г
 75 | 3г
 76 | 4г
 77 | 5г
 78 | 6г
 79 | 7г
 80 | 8г
 81 | 9г
 82 | Xвв
 83 | Vвв
 84 | Iвв
 85 | Lвв
 86 | Mвв
 87 | Cвв
 88 | Xв
 89 | Vв
 90 | Iв
 91 | Lв
 92 | Mв
 93 | Cв
 94 | 0м
 95 | 1м
 96 | 2м
 97 | 3м
 98 | 4м
 99 | 5м
100 | 6м
101 | 7м
102 | 8м
103 | 9м
104 | 0мм
105 | 1мм
106 | 2мм
107 | 3мм
108 | 4мм
109 | 5мм
110 | 6мм
111 | 7мм
112 | 8мм
113 | 9мм
114 | 0см
115 | 1см
116 | 2см
117 | 3см
118 | 4см
119 | 5см
120 | 6см
121 | 7см
122 | 8см
123 | 9см
124 | 0дм
125 | 1дм
126 | 2дм
127 | 3дм
128 | 4дм
129 | 5дм
130 | 6дм
131 | 7дм
132 | 8дм
133 | 9дм
134 | 0л
135 | 1л
136 | 2л
137 | 3л
138 | 4л
139 | 5л
140 | 6л
141 | 7л
142 | 8л
143 | 9л
144 | 0км
145 | 1км
146 | 2км
147 | 3км
148 | 4км
149 | 5км
150 | 6км
151 | 7км
152 | 8км
153 | 9км
154 | 0га
155 | 1га
156 | 2га
157 | 3га
158 | 4га
159 | 5га
160 | 6га
161 | 7га
162 | 8га
163 | 9га
164 | 0кг
165 | 1кг
166 | 2кг
167 | 3кг
168 | 4кг
169 | 5кг
170 | 6кг
171 | 7кг
172 | 8кг
173 | 9кг
174 | 0т
175 | 1т
176 | 2т
177 | 3т
178 | 4т
179 | 5т
180 | 6т
181 | 7т
182 | 8т
183 | 9т
184 | 0г
185 | 1г
186 | 2г
187 | 3г
188 | 4г
189 | 5г
190 | 6г
191 | 7г
192 | 8г
193 | 9г
194 | 0мг
195 | 1мг
196 | 2мг
197 | 3мг
198 | 4мг
199 | 5мг
200 | 6мг
201 | 7мг
202 | 8мг
203 | 9мг
204 | бульв
205 | в
206 | вв
207 | г
208 | га
209 | гг
210 | гл
211 | гос
212 | д
213 | дм
214 | доп
215 | др
216 | е
217 | ед
218 | ед
219 | зам
220 | и
221 | инд
222 | исп
223 | Исп
224 | к
225 | кап
226 | кг
227 | кв
228 | кл
229 | км
230 | кол
231 | комн
232 | коп
233 | куб
234 | л
235 | лиц
236 | лл
237 | м
238 | макс
239 | мг
240 | мин
241 | мл
242 | млн
243 | млрд
244 | мм
245 | н
246 | наб
247 | нач
248 | неуд
249 | ном
250 | о
251 | обл
252 | обр
253 | общ
254 | ок
255 | ост
256 | отл
257 | п
258 | пер
259 | перераб
260 | пл
261 | пос
262 | пр
263 | просп
264 | проф
265 | р
266 | ред
267 | руб
268 | с
269 | сб
270 | св
271 | см
272 | соч
273 | ср
274 | ст
275 | стр
276 | т
277 | тел
278 | Тел
279 | тех
280 | тт
281 | туп
282 | тыс
283 | уд
284 | ул
285 | уч
286 | физ
287 | х
288 | хор
289 | ч
290 | чел
291 | шт
292 | экз
293 | э
294 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.sl:
--------------------------------------------------------------------------------
 1 | dr
 2 | Dr
 3 | itd
 4 | itn
 5 | št #NUMERIC_ONLY#
 6 | Št #NUMERIC_ONLY#
 7 | d
 8 | jan
 9 | Jan
10 | feb
11 | Feb
12 | mar
13 | Mar
14 | apr
15 | Apr
16 | jun
17 | Jun
18 | jul
19 | Jul
20 | avg
21 | Avg
22 | sept
23 | Sept
24 | sep
25 | Sep
26 | okt
27 | Okt
28 | nov
29 | Nov
30 | dec
31 | Dec
32 | tj
33 | Tj
34 | npr
35 | Npr
36 | sl
37 | Sl
38 | op
39 | Op
40 | gl
41 | Gl
42 | oz
43 | Oz
44 | prev
45 | dipl
46 | ing
47 | prim
48 | Prim
49 | cf
50 | Cf
51 | gl
52 | Gl
53 | A
54 | B
55 | C
56 | D
57 | E
58 | F
59 | G
60 | H
61 | I
62 | J
63 | K
64 | L
65 | M
66 | N
67 | O
68 | P
69 | Q
70 | R
71 | S
72 | T
73 | U
74 | V
75 | W
76 | X
77 | Y
78 | Z
79 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.sv:
--------------------------------------------------------------------------------
 1 | #single upper-case letters are usually initials
 2 | A
 3 | B
 4 | C
 5 | D
 6 | E
 7 | F
 8 | G
 9 | H
10 | I
11 | J
12 | K
13 | L
14 | M
15 | N
16 | O
17 | P
18 | Q
19 | R
20 | S
21 | T
22 | U
23 | V
24 | W
25 | X
26 | Y
27 | Z
28 | Å
29 | Ä
30 | Ö  
31 | #misc abbreviations
32 | #If all words in the text are in lower case, then tex, mao, tom and maj may be confused with names, and iaf, etc with named entities.
33 | AB
34 | VG
35 | dvs
36 | d.v.s
37 | d. v. s
38 | etc
39 | from
40 | fr.o.m
41 | fr. o. m
42 | iaf
43 | i.a.f
44 | i. a. f
45 | jfr
46 | kl
47 | kr
48 | mao
49 | m.a.o
50 | m. a. o
51 | mfl
52 | m.fl
53 | m. fl
54 | mm
55 | m.m
56 | m. m.
57 | osv
58 | o.s.v
59 | o. s. v
60 | pga
61 | p.g.a
62 | p. g. a
63 | tex
64 | t.ex
65 | t. ex
66 | #tom. is risky, as tom is a word, and can be at end of sentence. One recent text has 9 tom., and 52 tom not at end of sentence. 
67 | tom
68 | t.o.m
69 | t. o. m
70 | vs
71 | adv
72 | jur
73 | kand
74 | mag
75 | fil
76 | lic
77 | prop
78 | d
79 | f
80 | s
81 | mha
82 | m.h.a
83 | m. h. a
84 | vol
85 | #months
86 | jan
87 | feb
88 | mar
89 | apr
90 | #maj is a full word
91 | jun
92 | jul
93 | aug
94 | sep
95 | okt
96 | nov
97 | dec
98 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.ta:
--------------------------------------------------------------------------------
 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
 2 | 
 3 | #common exceptions
 4 | # Rs
 5 | ர
 6 | # Rs
 7 | ூ
 8 | # Mr
 9 | திரு
10 | 
11 | #others
12 | 
13 | 
14 | #phonetics
15 | # A
16 | ஏ
17 | # B
18 | பீ
19 | # C
20 | சீ
21 | # D
22 | டீ
23 | # E
24 | ஈ
25 | # F
26 | எஃப்
27 | # G
28 | ஜீ
29 | # H
30 | எச்
31 | ஹெச்
32 | # I
33 | ஐ
34 | # J
35 | ஜே
36 | ஜை
37 | # K
38 | கே
39 | # L
40 | எல்
41 | # M
42 | எம்
43 | # N
44 | என்
45 | # O
46 | ஓ
47 | # P
48 | ப்பீ
49 | # Q
50 | கியூ
51 | # R
52 | ஆர்
53 | # S
54 | எஸ்
55 | # T
56 | ட்டீ
57 | # U
58 | யூ
59 | # V
60 | வீ
61 | # W
62 | டபிள்-யூ
63 | # X
64 | எக்ஸ்
65 | # Y
66 | வை
67 | # Z
68 | செட்
69 | 
70 | #consonants
71 | 
72 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.te:
--------------------------------------------------------------------------------
 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
 2 | 
 3 | #common exceptions
 4 | # Rs
 5 | ర
 6 | # Rs
 7 | ూ
 8 | # Mr
 9 | శ్రీ
10 | 
11 | #others
12 | 
13 | 
14 | #phonetics
15 | # A
16 | ఎ
17 | # B
18 | బి
19 | # C
20 | సి
21 | # D
22 | డి
23 | # E
24 | ఇ
25 | # F
26 | ఎఫ్
27 | # G
28 | జి
29 | # H
30 | హెచ్‌
31 | # I
32 | ఐ
33 | # J
34 | జె
35 | # K
36 | కె
37 | # L
38 | ఎల్
39 | # M
40 | ఎం
41 | ఎమ్
42 | # N
43 | ఎన్
44 | # O
45 | ఓ
46 | # P
47 | పి
48 | # Q
49 | క్యూ
50 | # R
51 | ఆర్
52 | # S
53 | ఎస్
54 | # T
55 | టి
56 | # U
57 | యు
58 | # V
59 | వి
60 | # W
61 | డబ్ల్యూ
62 | # X
63 | ఎక్స్
64 | # Y
65 | వై
66 | # Z
67 | జెడ్
68 | 
69 | #consonants
70 | 
71 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.yue:
--------------------------------------------------------------------------------
 1 | #
 2 | # Cantonese (Chinese)
 3 | #
 4 | # Anything in this file, followed by a period, 
 5 | # does NOT indicate an end-of-sentence marker.
 6 | #
 7 | # English/Euro-language given-name initials (appearing in
 8 | # news, periodicals, etc.)
 9 | A
10 | Ā
11 | B
12 | C
13 | Č
14 | D
15 | E
16 | Ē
17 | F
18 | G
19 | Ģ
20 | H
21 | I
22 | Ī
23 | J
24 | K
25 | Ķ
26 | L
27 | Ļ
28 | M
29 | N
30 | Ņ
31 | O
32 | P
33 | Q
34 | R
35 | S
36 | Š
37 | T
38 | U
39 | Ū
40 | V
41 | W
42 | X
43 | Y
44 | Z
45 | Ž
46 | 
47 | # Numbers only. These should only suppress a sentence break when
48 | # followed by a numeric sequence.
49 | # Add #NUMERIC_ONLY# after the word for this function. This case is
50 | # mostly for the English "No." which can either be a sentence of its
51 | # own, or, if followed by a number, a non-breaking prefix.
52 | No #NUMERIC_ONLY#
53 | Nr #NUMERIC_ONLY#
54 | 


--------------------------------------------------------------------------------
/moses/share/nonbreaking_prefixes/nonbreaking_prefix.zh:
--------------------------------------------------------------------------------
 1 | #
 2 | # Mandarin (Chinese)
 3 | #
 4 | # Anything in this file, followed by a period, 
 5 | # does NOT indicate an end-of-sentence marker.
 6 | #
 7 | # English/Euro-language given-name initials (appearing in
 8 | # news, periodicals, etc.)
 9 | A
10 | Ā
11 | B
12 | C
13 | Č
14 | D
15 | E
16 | Ē
17 | F
18 | G
19 | Ģ
20 | H
21 | I
22 | Ī
23 | J
24 | K
25 | Ķ
26 | L
27 | Ļ
28 | M
29 | N
30 | Ņ
31 | O
32 | P
33 | Q
34 | R
35 | S
36 | Š
37 | T
38 | U
39 | Ū
40 | V
41 | W
42 | X
43 | Y
44 | Z
45 | Ž
46 | 
47 | # Numbers only. These should only suppress a sentence break when
48 | # followed by a numeric sequence.
49 | # Add #NUMERIC_ONLY# after the word for this function. This case is
50 | # mostly for the English "No." which can either be a sentence of its
51 | # own, or, if followed by a number, a non-breaking prefix.
52 | No #NUMERIC_ONLY#
53 | Nr #NUMERIC_ONLY#
54 | 


--------------------------------------------------------------------------------
/moses/tokenizer/deescape-special-chars.perl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | #
 3 | # This file is part of moses.  Its use is licensed under the GNU Lesser General
 4 | # Public License version 2.1 or, at your option, any later version.
 5 | 
 6 | use warnings;
 7 | use strict;
 8 | 
 9 | # Entity name (the text between '&' and ';') -> the literal character it stands for.
10 | my %LITERAL_FOR = (
11 |   'bar'  => '|',  '#124' => '|',   # factor separator (legacy / current)
12 |   'lt'   => '<',  'gt'   => '>',   # xml
13 |   'bra'  => '[',  'ket'  => ']',   # syntax non-terminal (legacy)
14 |   '#91'  => '[',  '#93'  => ']',   # syntax non-terminal
15 |   'quot' => '"',  'apos' => "'",   # xml
16 |   'amp'  => '&',                   # escape escape
17 | );
18 | # No replacement contains '&' or ';', so a single pass equals the chained substitutions.
19 | while (my $line = <STDIN>) {
20 |   $line =~ s/&(bar|#124|lt|gt|bra|ket|quot|apos|#91|#93|amp);/$LITERAL_FOR{$1}/g;
21 |   print $line;
22 | }
23 | 


--------------------------------------------------------------------------------
/moses/tokenizer/escape-special-chars.perl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | #
 3 | # This file is part of moses.  Its use is licensed under the GNU Lesser General
 4 | # Public License version 2.1 or, at your option, any later version.
 5 | 
 6 | use warnings;
 7 | use strict;
 8 | 
 9 | while(<STDIN>) {
10 |   # chomp, not chop: a last line without a line terminator must keep its final character
11 |   chomp;
12 |   # avoid general madness: drop control characters, then collapse and trim whitespace
13 |   s/[\000-\037]//g;
14 |   s/\s+/ /g;
15 |   s/^ //g;
16 |   s/ $//g;
17 | 
18 |   # special characters in moses ('&' goes first so the '&' of the entities added below survives)
19 |   s/\&/\&amp;/g;   # escape escape
20 |   s/\|/\&#124;/g;  # factor separator
21 |   s/\</\&lt;/g;    # xml
22 |   s/\>/\&gt;/g;    # xml
23 |   s/\'/\&apos;/g;  # xml
24 |   s/\"/\&quot;/g;  # xml
25 |   s/\[/\&#91;/g;   # syntax non-terminal
26 |   s/\]/\&#93;/g;   # syntax non-terminal
27 | 
28 |   # restore xml instructions: <tag translation="..."> text </tag> spans are turned back into markup
29 |   s/\&lt;(\S+) translation=&quot;(.+?)&quot;&gt; (.+?) &lt;\/(\S+)&gt;/\<$1 translation=\"$2\"> $3 <\/$4>/g;
30 |   print $_."\n";
31 | }
32 | 


--------------------------------------------------------------------------------
/moses/tokenizer/lowercase.perl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | #
 3 | # This file is part of moses.  Its use is licensed under the GNU Lesser General
 4 | # Public License version 2.1 or, at your option, any later version.
 5 | 
 6 | use warnings;
 7 | use strict;
 8 | 
 9 | # UTF-8 layers on both ends, so that lc() sees characters rather than bytes.
10 | binmode(STDIN,  ":utf8");
11 | binmode(STDOUT, ":utf8");
12 | 
13 | # Emit a lower-cased copy of every input line.
14 | print lc($_) while <STDIN>;
15 | 


--------------------------------------------------------------------------------
/moses/tokenizer/normalize-punctuation.perl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | #
 3 | # This file is part of moses.  Its use is licensed under the GNU Lesser General
 4 | # Public License version 2.1 or, at your option, any later version.
 5 | 
 6 | use warnings;
 7 | use strict;
 8 | 
 9 | my $language = "en";  # language whose quotation and number conventions are applied below
10 | my $PENN = 0;         # switched on by -penn
11 | # Options: -b (unbuffered), -l LANG or a bare LANG, -penn; any other argument is ignored.
12 | while (@ARGV) {
13 |     my $arg = shift @ARGV;
14 |     if    ($arg =~ /^-b$/)    { $| = 1; }                   # not buffered (flush each line)
15 |     elsif ($arg =~ /^-l$/)    { $language = shift @ARGV; }  # value is the following argument
16 |     elsif ($arg =~ /^[^\-]/)  { $language = $arg; }         # bare word: the language itself
17 |     elsif ($arg =~ /^-penn$/) { $PENN = 1; }
18 | }
19 | 
20 | while(<STDIN>) {
21 |     s/\r//g;
22 |     # remove extra spaces
23 |     s/\(/ \(/g;
24 |     s/\)/\) /g; s/ +/ /g;
25 |     s/\) ([\.\!\:\?\;\,])/\)$1/g;
26 |     s/\( /\(/g;
27 |     s/ \)/\)/g;
28 |     s/(\d) \%/$1\%/g;
29 |     s/ :/:/g;
30 |     s/ ;/;/g;
31 |     # normalize unicode punctuation
32 |     if ($PENN == 0) {
33 |       s/\`/\'/g;
34 |       s/\'\'/ \" /g;
35 |     }
36 | 
37 |     s/„/\"/g;
38 |     s/“/\"/g;
39 |     s/”/\"/g;
40 |     s/–/-/g;
41 |     s/—/ - /g; s/ +/ /g;
42 |     s/´/\'/g;
43 |     s/([a-z])‘([a-z])/$1\'$2/gi;
44 |     s/([a-z])’([a-z])/$1\'$2/gi;
45 |     s/‘/\"/g;
46 |     s/‚/\"/g;
47 |     s/’/\"/g;
48 |     s/''/\"/g;
49 |     s/´´/\"/g;
50 |     s/…/.../g;
51 |     # French quotes
52 |     s/ « / \"/g;
53 |     s/« /\"/g;
54 |     s/«/\"/g;
55 |     s/ » /\" /g;
56 |     s/ »/\"/g;
57 |     s/»/\"/g;
58 |     # handle pseudo-spaces
59 |     s/ \%/\%/g;
60 |     s/nº /nº /g;
61 |     s/ :/:/g;
62 |     s/ ºC/ ºC/g;
63 |     s/ cm/ cm/g;
64 |     s/ \?/\?/g;
65 |     s/ \!/\!/g;
66 |     s/ ;/;/g;
67 |     s/, /, /g; s/ +/ /g;
68 | 
69 |     # English "quotation," followed by comma, style
70 |     if ($language eq "en") {
71 | 	s/\"([,\.]+)/$1\"/g;
72 |     }
73 |     # Czech is confused
74 |     elsif ($language eq "cs" || $language eq "cz") {
75 |     }
76 |     # German/Spanish/French "quotation", followed by comma, style
77 |     else {
78 | 	s/,\"/\",/g;	
79 | 	s/(\.+)\"(\s*[^<])/\"$1$2/g; # don't fix period at end of sentence
80 |     }
81 | 
82 | 
83 |     if ($language eq "de" || $language eq "es" || $language eq "cz" || $language eq "cs" || $language eq "fr") {
84 | 	s/(\d) (\d)/$1,$2/g;
85 |     }
86 |     else {
87 | 	s/(\d) (\d)/$1.$2/g;
88 |     }
89 |     print $_;
90 | }
91 | 


--------------------------------------------------------------------------------
/preprocess/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | if(NOT MSVC)
 2 |   set(THREADS pthread)  # pthread is named explicitly everywhere except under MSVC
 3 | endif()
 4 | 
 5 | # Helper libraries, each built from the source file of the same name.
 6 | foreach(helper fields captive_child warc base64)
 7 |   add_library(${helper} STATIC ${helper}.cc)
 8 | endforeach()
 9 | 
10 | # Explicitly list the executable files to be compiled
11 | set(EXE_LIST
12 |   b64filter
13 |   base64_number
14 |   cache
15 |   commoncrawl_dedupe
16 |   dedupe
17 |   docenc
18 |   foldfilter
19 |   gigaword_unwrap
20 |   idf
21 |   mmhsum
22 |   order_independent_hash
23 |   remove_invalid_utf8
24 |   remove_invalid_utf8_base64
25 |   remove_long_lines
26 |   shard
27 |   substitute
28 |   subtract_lines
29 |   vocab
30 |   warc_parallel
31 | )
32 | 
33 | # Executables that are only added to the build when USE_ICU is set.
34 | set(ICU_EXE_LIST
35 |   apply_case
36 |   truecase
37 |   train_case
38 |   process_unicode
39 |   simple_cleaning
40 | )
41 | if(USE_ICU)
42 |   list(APPEND EXE_LIST ${ICU_EXE_LIST})
43 | endif()
44 | set(PREPROCESS_LIBS preprocess_util ${Boost_LIBRARIES} ${THREADS})
45 | # Every executable <name> is built from <name>_main.cc and linked against the common libraries.
46 | foreach(exe IN LISTS EXE_LIST)
47 |   add_executable(${exe} ${exe}_main.cc)
48 |   target_link_libraries(${exe} ${PREPROCESS_LIBS})
49 |   set_target_properties(${exe} PROPERTIES FOLDER executables)
50 | endforeach()
51 | 
52 | target_link_libraries(b64filter ${PREPROCESS_LIBS} base64 captive_child)
53 | target_link_libraries(base64_number ${PREPROCESS_LIBS} base64 captive_child)
54 | target_link_libraries(cache ${PREPROCESS_LIBS} fields captive_child)
55 | target_link_libraries(dedupe ${PREPROCESS_LIBS} fields)
56 | target_link_libraries(docenc ${PREPROCESS_LIBS} base64)
57 | target_link_libraries(foldfilter ${PREPROCESS_LIBS} captive_child)
58 | target_link_libraries(remove_invalid_utf8_base64 ${PREPROCESS_LIBS} base64)
59 | target_link_libraries(shard ${PREPROCESS_LIBS} fields)
60 | target_link_libraries(substitute ${PREPROCESS_LIBS} fields)
61 | target_link_libraries(warc_parallel ${PREPROCESS_LIBS} warc captive_child)
62 | # ICU executables are targets only when USE_ICU is on; naming one outside this guard fails at configure time.
63 | if(USE_ICU)
64 |   target_link_libraries(simple_cleaning ${PREPROCESS_LIBS} fields)
65 |   foreach(exe ${ICU_EXE_LIST})
66 |     target_link_libraries(${exe} preprocess_icu)
67 |   endforeach(exe)
68 | endif(USE_ICU)
69 | 
70 | foreach(script text.sh gigaword_extract.sh resplit.sh unescape_html.perl heuristics.perl)
71 |   configure_file(${script} ../bin/${script} COPYONLY)
72 | endforeach()
73 | 


--------------------------------------------------------------------------------
/preprocess/apply_case_main.cc:
--------------------------------------------------------------------------------
  1 | #include "util/file_stream.hh"
  2 | #include "util/file_piece.hh"
  3 | #include "util/murmur_hash.hh"
  4 | #include "util/mutable_vocab.hh"
  5 | #include "util/tokenize_piece.hh"
  6 | #include "util/utf8.hh"
  7 | #include "util/utf8_icu.hh"
  8 | 
  9 | #include <unordered_map>
 10 | 
 11 | #define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE
 12 | #include <boost/lexical_cast.hpp>
 13 | 
 14 | namespace {
 15 | // Replaces the contents of `to` with the space-separated tokens of the next line of `from` (the `true` template argument presumably skips empty tokens -- confirm in tokenize_piece.hh).
 16 | void SplitLine(util::FilePiece &from, std::vector<util::StringPiece> &to) {
 17 |   typedef util::TokenIter<util::SingleCharacter, true> Words;
 18 |   to.clear();
 19 |   for (Words word(from.ReadLine(), ' '); word; ++word) to.push_back(*word);
 20 | }
 21 | 
 22 | // Skips spaces and tabs in f.  Returns false after consuming a newline, or
 23 | // true as soon as any other character is next (that character stays unread).
 24 | bool SameLine(util::FilePiece &f) {
 25 |   for (;;) {
 26 |     const auto next = f.peek();
 27 |     if (next == '\n') {
 28 |       // End of the current line: eat the newline and report it.
 29 |       f.get();
 30 |       return false;
 31 |     }
 32 |     if (next != ' ' && next != '\t') return true;
 33 |     // Blank: discard it and look at the following character.
 34 |     f.get();
 35 |   }
 36 | }
 37 | } // namespace
 38 | 
 39 | // Usage: apply_case alignment source target model
 40 | //
 41 | // Loads the model (per line: a 64-bit key followed by tab-separated
 42 | // "word count" pairs) and remembers the highest-count word of every key.
 43 | // Then, for each sentence, every aligned (source word, target word) pair is
 44 | // looked up under the hash of the source word combined with the lowercased
 45 | // target word; on a hit the target word is replaced by the model's word.
 46 | // The resulting target lines are written to stdout.
 47 | int main(int argc, char *argv[]) {
 48 |   if (argc != 5) {
 49 |     std::cerr << argv[0] << " alignment source target model" << std::endl;
 50 |     return 1;
 51 |   }
 52 |   util::FilePiece align(argv[1]), source_file(argv[2]), target_file(argv[3]), model(argv[4]);
 53 | 
 54 |   util::MutableVocab vocab;
 55 |   std::unordered_map<uint64_t, uint32_t> best;
 56 |   while (true) {
 57 |     uint64_t key;
 58 |     try {
 59 |       key = model.ReadULong();
 60 |     } catch (const util::EndOfFileException &e) { break; }
 61 |     uint64_t max_count = 0;
 62 |     util::StringPiece best_word;
 63 |     bool have_pair = false;
 64 |     for (util::TokenIter<util::SingleCharacter, true> pair(model.ReadLine(), '\t'); pair; ++pair) {
 65 |       util::TokenIter<util::SingleCharacter> spaces(*pair, ' ');
 66 |       util::StringPiece word(*spaces);
 67 |       uint64_t count = boost::lexical_cast<uint64_t>(*++spaces);
 68 |       if (count > max_count) {
 69 |         max_count = count;
 70 |         best_word = word;
 71 |       }
 72 |       have_pair = true;
 73 |     }
 74 |     // Record the winner once per key instead of interning every running
 75 |     // candidate.  best_word still points into the line just read, which stays
 76 |     // valid because nothing else has been read from the model since.
 77 |     // A key without any pair gets no entry, as before.
 78 |     if (have_pair) best[key] = vocab.FindOrInsert(best_word);
 79 |   }
 80 | 
 81 |   std::cerr << "Read model." << std::endl;
 82 | 
 83 |   std::vector<util::StringPiece> source_words, target_words;
 84 |   std::string lowered;
 85 |   util::FileStream out(1);
 86 |   for (std::size_t line = 0; ; ++line) {
 87 |     // The source file decides when to stop; running out of it ends the program.
 88 |     try {
 89 |       SplitLine(source_file, source_words);
 90 |     } catch (const util::EndOfFileException &e) { break; }
 91 |     SplitLine(target_file, target_words);
 92 |     // Each alignment line starts with a number and "|||" before the i-j pairs.
 93 |     align.ReadULong();
 94 |     UTIL_THROW_IF2("|||" != align.ReadDelimited(), "Expected |||");
 95 |     while (SameLine(align)) {
 96 |       unsigned long first = align.ReadULong();
 97 |       UTIL_THROW_IF2(align.get() != '-', "Bad alignment");
 98 |       UTIL_THROW_IF2(align.peek() < '0' || align.peek() > '9', "Expected number for alignment, not " << align.peek());
 99 |       unsigned long second = align.ReadULong();
100 |       UTIL_THROW_IF2(first >= source_words.size(), "Index " << first << " too high for source text at line " << line << " which has size " << source_words.size());
101 |       UTIL_THROW_IF2(second >= target_words.size(), "Index " << second << " too high for target text at line " << line << " which has size " << target_words.size());
102 |       util::ToLower(target_words[second], lowered);
103 |       util::StringPiece source(source_words[first]);
104 |       // Key = hash of the lowercased target word, seeded with the hash of the source word.
105 |       uint64_t key = util::MurmurHash64A(lowered.data(), lowered.size(), util::MurmurHash64A(source.data(), source.size()));
106 |       std::unordered_map<uint64_t, uint32_t>::const_iterator found = best.find(key);
107 |       if (found != best.end()) {
108 |         target_words[second] = vocab.String(found->second);
109 |       }
110 |     }
111 |     // Index-based join: an empty target line must print just the newline.
112 |     // (Advancing an iterator that already equals end() is undefined.)
113 |     for (std::size_t w = 0; w < target_words.size(); ++w) {
114 |       if (w != 0) out << ' ';
115 |       out << target_words[w];
116 |     }
117 |     out << '\n';
118 |   }
119 | }
102 | 


--------------------------------------------------------------------------------
/preprocess/b64filter_main.cc:
--------------------------------------------------------------------------------
  1 | #include <thread>
  2 | #include <unistd.h>
  3 | #include "preprocess/base64.hh"
  4 | #include "preprocess/captive_child.hh"
  5 | #include "util/exception.hh"
  6 | #include "util/file_stream.hh"
  7 | #include "util/file_piece.hh"
  8 | #include "util/pcqueue.hh"
  9 | 
 10 | 
 11 | namespace {
 12 | 
 13 | struct Document { // Per-document bookkeeping handed from the feeder thread to the reader thread.
 14 | 	size_t line_cnt; // Number of '\n' in the document as sent to the child; 0 is the "stop" sentinel.
 15 | 	bool has_trailing_newline; // Whether the decoded document already ended in '\n' before one was appended.
 16 | };
 17 | 
 18 | } // namespace
 19 | 
 20 | // Usage: b64filter command [command-args...]
 21 | //
 22 | // Reads base64-encoded documents from stdin (one per line), pipes the decoded
 23 | // text through the given command, and writes what the command produced for
 24 | // each document to stdout, base64-encoded again, one document per line.  The
 25 | // command is expected to emit exactly one output line per input line.
 26 | // Returns the command's exit status, or 1 if it did not exit normally.
 27 | int main(int argc, char **argv) {
 28 | 	if (argc < 2) {
 29 | 		std::cerr << "usage: " << argv[0] << " command [command-args...]\n";
 30 | 		return 1;
 31 | 	}
 32 | 
 33 | 	util::UnboundedSingleQueue<Document> line_cnt_queue;
 34 | 
 35 | 	util::scoped_fd child_in_fd, child_out_fd;
 36 | 
 37 | 	pid_t child = preprocess::Launch(argv + 1, child_in_fd, child_out_fd);
 38 | 
 39 | 	std::thread feeder([&child_in_fd, &line_cnt_queue]() {
 40 | 		util::FilePiece in(STDIN_FILENO);
 41 | 		util::FileStream child_in(child_in_fd.release());
 42 | 
 43 | 		// Decoded document buffer
 44 | 		std::string doc;
 45 | 
 46 | 		for (util::StringPiece line : in) {
 47 | 			preprocess::base64_decode(line, doc);
 48 | 
 49 | 			// Description of the document.  An empty document has no last
 50 | 			// character, so check for that before calling back().
 51 | 			Document doc_desc{
 52 | 				.line_cnt = 0,
 53 | 				.has_trailing_newline = !doc.empty() && doc.back() == '\n',
 54 | 			};
 55 | 
 56 | 			// Make the document end with a new line. This is to make sure
 57 | 			// the next doc we send to the child will be on its own line and the
 58 | 			// line_cnt is correct.
 59 | 			if (!doc_desc.has_trailing_newline)
 60 | 				doc.push_back('\n');
 61 | 
 62 | 			doc_desc.line_cnt = std::count(doc.cbegin(), doc.cend(), '\n');
 63 | 
 64 | 			// Send line count first to the reader, so it can start reading as
 65 | 			// soon as we start feeding the document to the child.
 66 | 			line_cnt_queue.Produce(std::move(doc_desc));
 67 | 
 68 | 			// Feed the document to the child.
 69 | 			// Might block because it can cause a flush.
 70 | 			child_in << doc;
 71 | 		}
 72 | 
 73 | 		// Tell the reader to stop.  A real document always has at least one
 74 | 		// line, so line_cnt == 0 is unambiguous.
 75 | 		line_cnt_queue.Produce(Document{
 76 | 			.line_cnt = 0,
 77 | 			.has_trailing_newline = false
 78 | 		});
 79 | 
 80 | 		// Flush (blocks).  The FileStream destructor closes.
 81 | 		child_in.flush();
 82 | 	});
 83 | 
 84 | 	std::thread reader([&child_out_fd, &line_cnt_queue]() {
 85 | 		util::FileStream out(STDOUT_FILENO);
 86 | 		util::FilePiece child_out(child_out_fd.release());
 87 | 
 88 | 		size_t doc_cnt = 0;
 89 | 		Document document;
 90 | 		std::string doc;
 91 | 
 92 | 		while (line_cnt_queue.Consume(document).line_cnt > 0) {
 93 | 			++doc_cnt;
 94 | 
 95 | 			doc.clear();
 96 | 			doc.reserve(document.line_cnt * 4096); // 4096 is not a typical line length
 97 | 
 98 | 			try {
 99 | 				while (document.line_cnt-- > 0) {
100 | 					util::StringPiece line(child_out.ReadLine());
101 | 					doc.append(line.data(), line.length());
102 | 
103 | 					// ReadLine eats line endings. Between lines we definitely
104 | 					// need to add them back. Whether we add the last one depends
105 | 					// on whether the original document had a trailing newline.
106 | 					if (document.line_cnt > 0 || document.has_trailing_newline)
107 | 						doc.push_back('\n');
108 | 				}
109 | 			} catch (util::EndOfFileException &e) {
110 | 				UTIL_THROW(util::Exception, "Sub-process stopped producing while expecting more lines while processing document " << doc_cnt);
111 | 			}
112 | 
113 | 			std::string encoded_doc;
114 | 			preprocess::base64_encode(doc, encoded_doc);
115 | 			out << encoded_doc << '\n';
116 | 		}
117 | 
118 | 		// Assert that we have consumed all the output of the child program.
119 | 		try {
120 | 			// peek() should now fail on an end of file, the loop above should
121 | 			// already have consumed all output that's there.
122 | 			child_out.peek();
123 | 
124 | 			UTIL_THROW(util::Exception, "sub-process is producing more output than it was given input");
125 | 		} catch (util::EndOfFileException &e) {
126 | 			// Good!
127 | 		}
128 | 	});
129 | 
130 | 	int retval = preprocess::Wait(child);
131 | 
132 | 	feeder.join();
133 | 	reader.join();
134 | 
135 | 	// Wait() reports 256 when the child did not exit normally.  Only the low
136 | 	// eight bits of our return value reach the invoking process, so 256 would
137 | 	// be seen as success; report a plain failure instead.
138 | 	return retval > 255 ? 1 : retval;
139 | }
128 | 


--------------------------------------------------------------------------------
/preprocess/base64.cc:
--------------------------------------------------------------------------------
 1 | #include "base64.hh"
 2 | #include <vector>
 3 | #include <cmath>
 4 | #include "util/exception.hh"
 5 | 
 6 | namespace preprocess {
 7 | 
 8 | namespace {
 9 | 
10 | char const *TABLE = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; // 6-bit value -> base64 character
11 | // Inverse of TABLE, indexed by byte value (16 entries per row); -1 marks every byte outside the alphabet, '=' included.
12 | int const INV_TABLE[256] = {
13 | 	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
14 | 	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
15 | 	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63,
16 | 	52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
17 | 	-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
18 | 	15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
19 | 	-1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
20 | 	41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1,
21 | 	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
22 | 	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
23 | 	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
24 | 	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
25 | 	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
26 | 	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
27 | 	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
28 | 	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
29 | };
30 | 
31 | // Number of '=' characters at the very end of in (in.size() if it consists of nothing else).
32 | size_t count_padding(const util::StringPiece &in) {
33 | 	const char *const first = in.data();
34 | 	const char *const end = first + in.size();
35 | 	const char *last = end;
36 | 	while (last != first && last[-1] == '=')
37 | 		--last;
38 | 	return end - last;
39 | }
40 | 
41 | } // namespace
42 | 
43 | // Replaces the contents of out with the base64 encoding of in, padded with
44 | // '=' to a multiple of four characters.
45 | void base64_encode(const util::StringPiece &in, std::string &out) {
46 | 	out.clear();
47 | 	out.reserve(4 * ((in.size() + 2) / 3));
48 | 
49 | 	// The accumulator is shifted left for every input byte and never trimmed.
50 | 	// It is unsigned so that the high bits falling off is well defined (a
51 | 	// signed int would overflow after a few bytes); only the low bits are read.
52 | 	unsigned int val = 0;
53 | 	// Number of pending bits in val, minus 6.
54 | 	int valb = -6;
55 | 
56 | 	for (const unsigned char *c = reinterpret_cast<const unsigned char*>(in.data()); c != reinterpret_cast<const unsigned char*>(in.data()) + in.size(); ++c) {
57 | 		val = (val << 8) + *c;
58 | 		valb += 8;
59 | 		while (valb >= 0) {
60 | 			out.push_back(TABLE[(val >> valb) & 0x3F]);
61 | 			valb -= 6;
62 | 		}
63 | 	}
64 | 
65 | 	// Left-over bits (fewer than six): emit them padded with zero bits.
66 | 	if (valb > -6)
67 | 		out.push_back(TABLE[((val << 8) >> (valb + 8)) & 0x3F]);
68 | 
69 | 	while (out.size() % 4)
70 | 		out.push_back('=');
71 | }
64 | 
65 | // Replaces the contents of out with the bytes encoded by the base64 text in.
66 | // Decoding stops at the first '='.  Throws util::Exception on any other byte
67 | // that is not part of the base64 alphabet.
68 | void base64_decode(const util::StringPiece &in, std::string &out) {
69 | 	out.clear();
70 | 
71 | 	// Reserve worst case scenario memory.  The subtraction is guarded: for
72 | 	// inputs such as "=" the padding exceeds size * 3 / 4, and the unsigned
73 | 	// difference would wrap around to an enormous request.
74 | 	const size_t upper_bound = in.size() * 3 / 4;
75 | 	const size_t padding = count_padding(in);
76 | 	out.reserve(upper_bound > padding ? upper_bound - padding : 0);
77 | 
78 | 	// Unsigned accumulator: it is shifted left for every character and never
79 | 	// trimmed, which would overflow a signed int; only its low bits are read.
80 | 	unsigned int val = 0;
81 | 	// Number of pending bits in val, minus 8.
82 | 	int valb = -8;
83 | 	for (const unsigned char *c = reinterpret_cast<const unsigned char*>(in.data()); c != reinterpret_cast<const unsigned char*>(in.data()) + in.size(); ++c) {
84 | 		// Padding reached
85 | 		if (*c == '=')
86 | 			break;
87 | 
88 | 		// Any negative entry means "not a base64 character".
89 | 		const int digit = INV_TABLE[*c];
90 | 		UTIL_THROW_IF(digit < 0, util::Exception, "Cannot interpret character '" << *c << "' as part of base64");
91 | 
92 | 		val = (val << 6) + static_cast<unsigned int>(digit);
93 | 		valb += 6;
94 | 		if (valb >= 0) {
95 | 			out.push_back(char((val >> valb) & 0xFF));
96 | 			valb -= 8;
97 | 		}
98 | 	}
99 | }
87 | 
88 | } // namespace preprocess
89 | 


--------------------------------------------------------------------------------
/preprocess/base64.hh:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <string>
 3 | #include "util/string_piece.hh"
 4 | 
 5 | namespace preprocess {
 6 | 
 7 | void base64_encode(const util::StringPiece &in, std::string &out); // Replaces out with the base64 encoding of in, '='-padded to a multiple of four characters.
 8 | 
 9 | void base64_decode(const util::StringPiece &in, std::string &out); // Replaces out with the bytes decoded from in, stopping at the first '='; throws util::Exception on a byte that is not base64.
10 | 
11 | } // namespace preprocess
12 | 


--------------------------------------------------------------------------------
/preprocess/base64_number_main.cc:
--------------------------------------------------------------------------------
 1 | #include <string.h>
 2 | #include "util/file_piece.hh"
 3 | #include "util/file_stream.hh"
 4 | #include "util/tokenize_piece.hh"
 5 | #include "preprocess/base64.hh"
 6 | 
 7 | #include <algorithm>
 8 | #include <string>
 9 | 
10 | // Reads base64-encoded documents from stdin, one per input line.  Each line
11 | // of a decoded document is written to stdout followed by a tab and the
12 | // zero-based number of the input line the document came from.  Tabs inside
13 | // a document are replaced by spaces first.
14 | int main(int argc, char *argv[]) {
15 |   typedef util::TokenIter<util::SingleCharacter, true> Lines;
16 |   std::string document;
17 |   util::FileStream numbered(1);
18 |   uint64_t document_index = 0;
19 |   util::FilePiece encoded(0);
20 |   for (util::StringPiece encoded_line : encoded) {
21 |     preprocess::base64_decode(encoded_line, document);
22 |     std::replace(document.begin(), document.end(), '\t', ' ');
23 |     for (Lines text(document, '\n'); text; ++text) {
24 |       numbered << *text << '\t' << document_index << '\n';
25 |     }
26 |     ++document_index;
27 |   }
28 | }
23 | 


--------------------------------------------------------------------------------
/preprocess/captive_child.cc:
--------------------------------------------------------------------------------
 1 | #include "preprocess/captive_child.hh"
 2 | 
 3 | #include "util/exception.hh"
 4 | #include "util/file.hh"
 5 | 
 6 | #include <signal.h>
 7 | #ifdef __linux__
 8 | #include <sys/prctl.h>
 9 | #endif
10 | #include <fcntl.h>
11 | #include <sys/types.h>
12 | #include <sys/wait.h>
13 | #include <unistd.h>
14 | 
15 | #include <iostream>
16 | 
17 | namespace preprocess {
18 | 
19 | namespace {
20 | // Creates a pipe: first takes ownership of the read end, second of the write end.
21 | void Pipe(util::scoped_fd &first, util::scoped_fd &second) {
22 |   int ends[2];
23 |   const int failed = pipe(ends);
24 |   UTIL_THROW_IF(failed, util::ErrnoException, "Creating pipe failed");
25 |   first.reset(ends[0]);
26 |   second.reset(ends[1]);
27 | }
26 | } // namespace
27 | 
28 | // Starts argv[0] (resolved by execvp) with arguments argv as a child process.
29 | // On return `in` is the write end of a pipe connected to the child's stdin
30 | // and `out` the read end of a pipe connected to its stdout.  Returns the
31 | // child's pid.  Throws if a pipe, fcntl or fork fails, or if the child could
32 | // not redirect its streams or exec the program.
33 | pid_t Launch(char *argv[], util::scoped_fd &in, util::scoped_fd &out) {
34 |   util::scoped_fd process_in, process_out;
35 |   Pipe(process_in, in);
36 |   Pipe(out, process_out);
37 | 
38 |   // Using self-pipe trick to check whether the child started: Set up a pipe
39 |   // with FD_CLOEXEC (close on successful exec). In case of failure, the child
40 |   // writes its errno to the pipe. Then, in the parent we can wait till the
41 |   // pipe is closed: either execvp succeeded and we read nothing or we read
42 |   // the error code and throw an exception in the parent.
43 |   // (See https://stackoverflow.com/a/1586277)
44 |   util::scoped_fd status_in, status_out;
45 |   Pipe(status_in, status_out);
46 |   UTIL_THROW_IF(fcntl(status_out.get(), F_SETFD, fcntl(status_out.get(), F_GETFD) | FD_CLOEXEC), util::ErrnoException, "fcntl failed");
47 | 
48 |   pid_t pid = fork();
49 |   UTIL_THROW_IF(pid == -1, util::ErrnoException, "Fork failed");
50 |   if (pid == 0) {
51 |     // Inside child process.  Nothing here may throw: an exception would
52 |     // unwind into the caller's code inside the child.  Failures are reported
53 |     // through the status pipe instead.
54 |     #ifdef __linux__
55 |     prctl(PR_SET_PDEATHSIG, SIGTERM);
56 |     #endif
57 |     if (-1 != dup2(process_in.get(), STDIN_FILENO) && -1 != dup2(process_out.get(), STDOUT_FILENO)) {
58 |       in.reset();
59 |       out.reset();
60 |       status_in.reset();
61 |       execvp(argv[0], argv);
62 |     }
63 |     // Oh no, dup2 or execvp failed, write error to parent
64 |     const int child_error = errno;
65 |     if (write(status_out.get(), &child_error, sizeof(child_error)) == -1) {
66 |       // Nothing more can be done; the parent will see the pipe close.
67 |     }
68 |     std::abort();
69 |   }
70 |   status_out.reset();
71 | 
72 |   // Wait on child to signal successful execvp or error
73 |   int err = 0;
74 |   ssize_t count;
75 |   while ((count = read(status_in.get(), &err, sizeof(err))) == -1)
76 |     if (errno != EAGAIN && errno != EINTR)
77 |       break;
78 | 
79 |   // A failed read says nothing about the child, so report the read itself.
80 |   UTIL_THROW_IF(count == -1, util::ErrnoException, "Reading the child's launch status failed");
81 |   if (count != 0) {
82 |     // The child aborts right after reporting; reap it so no zombie is left.
83 |     int ignored_status;
84 |     waitpid(pid, &ignored_status, 0);
85 |     UTIL_THROW(util::Exception, "child's dup2 or execvp failed: " << strerror(err));
86 |   }
87 | 
88 |   // Parent closes parts it doesn't need in destructors.
89 |   return pid;
90 | }
73 | 
74 | // Blocks until child terminates.  Returns its exit status if it exited
75 | // normally and 256 otherwise; throws util::ErrnoException if waitpid fails.
76 | int Wait(pid_t child) {
77 |   int status;
78 |   const pid_t waited = waitpid(child, &status, 0);
79 |   UTIL_THROW_IF(-1 == waited, util::ErrnoException, "waitpid for child failed");
80 |   return WIFEXITED(status) ? WEXITSTATUS(status) : 256;
81 | }
83 | 
84 | } // namespace preprocess
85 | 
86 | 


--------------------------------------------------------------------------------
/preprocess/captive_child.hh:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <sys/types.h>
 4 | 
 5 | namespace util { class scoped_fd; }
 6 | 
 7 | namespace preprocess {
 8 | 
 9 | // Launch argv[0] (resolved with execvp) as a child process with arguments argv.  On return, in is the write end of a pipe feeding the child's stdin and out is the read end of a pipe carrying the child's stdout.  Returns the child's pid; throws if the child could not be started.
10 | pid_t Launch(char *argv[], util::scoped_fd &in, util::scoped_fd &out);
11 | 
12 | // Wait for a child to finish.  Returns its exit status if it exited normally and 256 otherwise; throws if waitpid fails.
13 | int Wait(pid_t child);
14 | 
15 | } // namespace preprocess
16 | 


--------------------------------------------------------------------------------
/preprocess/commoncrawl_dedupe_main.cc:
--------------------------------------------------------------------------------
 1 | // Tool to convert raw CommonCrawl files into deduplicated files.
 2 | // Strips leading and trailing spaces.
 3 | // Removes document delimiter lines (those that begin with df6fa1abb58549287111ba8d776733e9).
 4 | // Removes duplicate lines.
 5 | // Removes any line that contains invalid UTF-8.
 6 | //
 7 | #include "util/file_stream.hh"
 8 | #include "util/file_piece.hh"
 9 | #include "util/murmur_hash.hh"
10 | #include "util/probing_hash_table.hh"
11 | #include "util/scoped.hh"
12 | #include "util/utf8.hh"
13 | 
14 | #include <iostream>
15 | 
16 | #include <stdint.h>
17 | 
18 | namespace {
19 | 
20 | // Hash table with 64-bit keys.
21 | struct Entry {
22 |   typedef uint64_t Key;
23 |   uint64_t key;
24 |   uint64_t GetKey() const { return key; }
25 |   void SetKey(uint64_t to) { key = to; }
26 | };
27 | 
28 | typedef util::AutoProbing<Entry, util::IdentityHash> Table;
29 | 
30 | // Use 64-bit MurmurHash in the hash table.  
31 | bool IsNewLine(Table &table, util::StringPiece l) {
32 |   Table::MutableIterator it;
33 |   Entry entry;
34 |   entry.key = util::MurmurHashNative(l.data(), l.size(), 1);
35 |   return !table.FindOrInsert(entry, it);
36 | }
37 | 
38 | // Remove leading and trailing space characters.
39 | util::StringPiece StripSpaces(util::StringPiece ret) {
40 |   while (ret.size() && util::kSpaces[static_cast<unsigned char>(*ret.data())]) {
41 |     ret = util::StringPiece(ret.data() + 1, ret.size() - 1);
42 |   }
43 |   while (ret.size() && util::kSpaces[static_cast<unsigned char>(ret.data()[ret.size() - 1])]) {
44 |     ret = util::StringPiece(ret.data(), ret.size() - 1);
45 |   }
46 |   return ret;
47 | }
48 | 
49 | 
50 | } // namespace
51 | 
52 | int main(int argc, char *argv[]) {
53 |   if (argc > 2 || (argc == 2 && (!strcmp("-h", argv[1]) || !strcmp("--help", argv[1])))) {
54 |     std::cerr << "Usage: " << argv[0] << " file_to_remove\nLines that appear in file_to_remove will be excluded from the output.\n" << std::endl;
55 |     return 1;
56 |   }
57 |   try {
58 |     Table table;
59 |     util::StringPiece l;
60 | 
61 |     // If there's a file to remove lines from, add it to the hash table of lines.
62 |     if (argc == 2) {
63 |       util::FilePiece removing(argv[1]);
64 |       while (removing.ReadLineOrEOF(l)) {
65 |         IsNewLine(table, StripSpaces(l));
66 |       }
67 |     }
68 | 
69 |     // This is the beginning of a line that delimits documents in the raw files.
70 |     const util::StringPiece remove_line("df6fa1abb58549287111ba8d776733e9");
71 |     util::FileStream out(1);
72 |     util::FilePiece in(0, "stdin", &std::cerr);
73 |     while (in.ReadLineOrEOF(l)) {
74 |       l = StripSpaces(l);
75 |       // A line passes if:
76 |       // It does not begin with the magic document delimiter.
77 |       // Its 64-bit hash has not been seen before.
78 |       // and it is valid UTF-8.
79 |       if (!starts_with(l, remove_line) && IsNewLine(table, l) && util::IsUTF8(l)) {
80 |         out << l << '\n';
81 |       }
82 |     }
83 |   } 
84 |   catch (const std::exception &e) {
85 |     std::cerr << e.what() << std::endl;
86 |     return 1;
87 |   }
88 | }
89 | 


--------------------------------------------------------------------------------
/preprocess/dedupe_main.cc:
--------------------------------------------------------------------------------
  1 | #include "fields.hh"
  2 | #include "parallel.hh"
  3 | #include "util/murmur_hash.hh"
  4 | #include "util/probing_hash_table.hh"
  5 | #include "util/scoped.hh"
  6 | 
  7 | #include <boost/program_options.hpp>
  8 | #include <boost/program_options/positional_options.hpp>
  9 | 
 10 | #include <iostream>
 11 | 
 12 | #include <stdint.h>
 13 | 
 14 | namespace preprocess {
 15 | namespace {
 16 | 
// Command-line settings for dedupe, populated by ParseArgs.
struct Options {
  // Key field ranges parsed from --fields and then defragmented.
  std::vector<FieldRange> key_fields;
  // Field delimiter from --delim (tab by default).
  char delim;
  // Files from --parallel / positional arguments; empty means filter stdin to stdout.
  std::vector<std::string> files;
};
 22 | 
 23 | void ParseArgs(int argc, char *argv[], Options &out) {
 24 |   namespace po = boost::program_options;
 25 |   po::options_description desc("Deduplication settings");
 26 |   std::string fields;
 27 | 
 28 |   desc.add_options()
 29 |     ("help,h", po::bool_switch(), "Show this help message")
 30 |     ("fields,f", po::value(&fields)->default_value("1-"), "Fields to use for key like cut -f")
 31 |     ("delim,d", po::value(&out.delim)->default_value('\t'), "Field delimiter")
 32 |     ("parallel,p", po::value(&out.files)->multitoken(), "Filter parallel data using four files: in_en in_fr out_en out_fr");
 33 |   po::positional_options_description pd;
 34 |   pd.add("parallel", -1);
 35 | 
 36 |   po::variables_map vm;
 37 |   po::store(po::command_line_parser(argc, argv).options(desc).positional(pd).run(), vm);
 38 |   if (vm["help"].as<bool>() || (!out.files.empty() && out.files.size() != 4)) {
 39 |     std::cerr <<
 40 |       "Deduplicate lines in a file.\n"
 41 |       "Only 64-bit hashes are kept.  In the event of a hash collision, a unique line\n"
 42 |       "will be removed.\n"
 43 |       "By default the entire line is used as the key for equality.  Using -f and -d\n"
 44 |       "similar to cut, the key can be restricted to some columns.  The line containing\n"
 45 |       "the first instance of the key is preserved, while the rest are removed.\n" <<
 46 |       desc <<
 47 |       "Deduplicate lines in a file: " << argv[0] << " <in >out\n"
 48 |       "Deduplicate parallel data, removing if either side is non-unique " << argv[0] << " -p in_en in_fr out_en out_fr\n";
 49 |     exit(1);
 50 |   }
 51 |   po::notify(vm);
 52 | 
 53 |   ParseFields(fields.c_str(), out.key_fields);
 54 |   DefragmentFields(out.key_fields);
 55 | }
 56 | 
 57 | struct Entry {
 58 |   typedef uint64_t Key;
 59 |   uint64_t key;
 60 |   uint64_t GetKey() const { return key; }
 61 |   void SetKey(uint64_t to) { key = to; }
 62 | };
 63 | 
 64 | class Dedupe {
 65 |   public:
 66 |     bool operator()(const util::StringPiece &line) {
 67 |       return (*this)(util::MurmurHashNative(line.data(), line.size(), 1));
 68 |     }
 69 | 
 70 |     bool operator()(uint64_t key) {
 71 |       Entry entry;
 72 |       entry.key = key;
 73 |       Table::MutableIterator it;
 74 |       return !table_.FindOrInsert(entry, it);
 75 |     }
 76 | 
 77 |   private:
 78 |     typedef util::AutoProbing<Entry, util::IdentityHash> Table;
 79 |     Table table_;
 80 | };
 81 | 
 82 | class FieldDedupe : public Dedupe {
 83 |   public:
 84 |     explicit FieldDedupe(const Options &options)
 85 |       : key_fields_(options.key_fields), delim_(options.delim) {}
 86 | 
 87 |     bool operator()(const util::StringPiece &line) {
 88 |       HashCallback hasher(1);
 89 |       RangeFields(line, key_fields_, delim_, hasher);
 90 |       return (*static_cast<Dedupe*>(this))(hasher.Hash());
 91 |     }
 92 | 
 93 |   private:
 94 |     std::vector<FieldRange> key_fields_;
 95 |     char delim_;
 96 | };
 97 | 
 98 | } // namespace
 99 | } // namespace preprocess
100 | 
101 | int main(int argc, char *argv[]) {
102 |   preprocess::Options options;
103 |   ParseArgs(argc, argv, options);
104 | 
105 |   if (options.key_fields.size() == 1 && options.key_fields[0].begin == 0 && options.key_fields[0].end == preprocess::FieldRange::kInfiniteEnd) {
106 |     return preprocess::FilterParallel<preprocess::Dedupe>(options.files);
107 |   } else {
108 |     return preprocess::FilterParallel<preprocess::FieldDedupe>(options.files, options);
109 |   }
110 | }
111 | 


--------------------------------------------------------------------------------
/preprocess/fields.cc:
--------------------------------------------------------------------------------
 1 | #include "preprocess/fields.hh"
 2 | #include "util/exception.hh"
 3 | 
 4 | #include <stdlib.h>
 5 | 
 6 | #include <algorithm>
 7 | 
 8 | namespace preprocess {
 9 | 
10 | namespace {
11 | unsigned int ConsumeInt(const char *&arg) {
12 |   char *end;
13 |   unsigned int ret = strtoul(arg, &end, 10);
14 |   UTIL_THROW_IF(end == arg, util::Exception, "Expected field " << arg << " to begin with a number.");
15 |   arg = end;
16 |   return ret;
17 | }
18 | } // namespace
19 | 
20 | void ParseFields(const char *arg, std::vector<FieldRange> &indices) {
21 |   FieldRange add;
22 |   while (*arg) {
23 |     if (*arg == '-') {
24 |       add.begin = 0;
25 |     } else {
26 |       // -1 because cut is 1-indexed.
27 |       add.begin = ConsumeInt(arg) - 1;
28 |     }
29 |     switch (*arg) {
30 |       case ',': case 0:
31 |         add.end = add.begin + 1;
32 |         break;
33 |       case '-':
34 |         ++arg;
35 |         if (*arg == 0 || *arg == ',') {
36 |           // 5-
37 |           add.end = FieldRange::kInfiniteEnd;
38 |         } else {
39 |           // 5-6
40 |           add.end = ConsumeInt(arg);
41 |           UTIL_THROW_IF(add.end <= add.begin, util::Exception, "Empty range [" << add.begin << ", " << add.end << ")");
42 |         }
43 |         break;
44 |       default:
45 |         UTIL_THROW(util::Exception, "Expected , - or string end after number in " << arg);
46 |     }
47 |     // Swallow ,
48 |     if (*arg == ',') {
49 |       ++arg;
50 |     }
51 |     indices.push_back(add);
52 |   }
53 | }
54 | 
55 | void DefragmentFields(std::vector<FieldRange> &indices) {
56 |   std::sort(indices.begin(), indices.end());
57 |   for (unsigned int i = 1; i < indices.size();) {
58 |     UTIL_THROW_IF(indices[i-1].end > indices[i].begin, util::Exception, "Overlapping index ranges");
59 |     if (indices[i-1].end == indices[i].begin) {
60 |       indices[i-1].end = indices[i].end;
61 |       indices.erase(indices.begin() + i);
62 |     } else {
63 |       ++i;
64 |     }
65 |   }
66 | }
67 | 
68 | } // namespace preprocess
69 | 


--------------------------------------------------------------------------------
/preprocess/fields.hh:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "util/string_piece.hh"
 4 | #include "util/murmur_hash.hh"
 5 | 
 6 | #include <algorithm>
 7 | #include <limits>
 8 | #include <vector>
 9 | 
10 | namespace preprocess {
11 | 
// Half-open range [begin, end) of 0-based field indices.
struct FieldRange {
  // Value of end that stands for "through the last field".
  static const unsigned int kInfiniteEnd = std::numeric_limits<unsigned int>::max();
  unsigned int begin;
  // Note that end can be the maximum integer.
  unsigned int end;
  // Ordering looks at begin only.
  bool operator<(const FieldRange &other) const {
    return other.begin > begin;
  }
};
21 | 
22 | // Parse the cut-style 1-3,9,12- representation of fields.
23 | void ParseFields(const char *arg, std::vector<FieldRange> &indices);
24 | 
25 | // Sort and combine field ranges into smaller ones.
26 | void DefragmentFields(std::vector<FieldRange> &indices);
27 | 
28 | // Do a callback with each individual field that was selected.
29 | template <class Functor> inline bool IndividualFields(util::StringPiece str, const std::vector<FieldRange> &indices, char delim, Functor &callback) {
30 |   const char *begin = str.data();
31 |   const char *const end = str.data() + str.size();
32 |   unsigned int index = 0;
33 |   for (const FieldRange f : indices) {
34 |     for (; index < f.begin; ++index) {
35 |       begin = std::find(begin, end, delim) + 1;
36 |       if (begin >= end) return true;
37 |     }
38 |     for (; index < f.end; ++index) {
39 |       const char *found = std::find(begin, end, delim);
40 |       if (!callback(util::StringPiece(begin, found - begin))) {
41 |         return false;
42 |       }
43 |       begin = found + 1;
44 |       if (begin >= end) return true;
45 |     }
46 |   }
47 |   return true;
48 | }
49 | 
50 | // Do a callback with ranges of fields.
51 | template <class Functor> inline void RangeFields(util::StringPiece str, const std::vector<FieldRange> &indices, char delim, Functor &callback) {
52 |   const char *begin = str.data();
53 |   const char *const end = str.data() + str.size();
54 |   unsigned int index = 0;
55 |   for (const FieldRange f : indices) {
56 |     for (; index < f.begin; ++index) {
57 |       begin = std::find(begin, end, delim) + 1;
58 |       if (begin >= end) return;
59 |     }
60 |     if (f.end == FieldRange::kInfiniteEnd) {
61 |       callback(util::StringPiece(begin, end - begin));
62 |       return;
63 |     }
64 |     const char *old_begin = begin;
65 |     for (; index < f.end; ++index) {
66 |       const char *found = std::find(begin, end, delim);
67 |       begin = found + 1;
68 |       if (begin >= end) {
69 |         callback(util::StringPiece(old_begin, end - old_begin));
70 |         return;
71 |       }
72 |     }
73 |     callback(util::StringPiece(old_begin, begin - old_begin - 1));
74 |   }
75 |   return;
76 | }
77 | 
78 | // This is called with the parts of the input that relate to the key.
79 | class HashCallback {
80 |   public:
81 |     explicit HashCallback(uint64_t seed = 47849374332489ULL) : hash_(seed) /* Be different from deduper */ {}
82 | 
83 |     void operator()(util::StringPiece key) {
84 |       hash_ = util::MurmurHashNative(key.data(), key.size(), hash_);
85 |     }
86 | 
87 |     uint64_t Hash() const { return hash_; }
88 | 
89 |   private:
90 |     uint64_t hash_;
91 | };
92 | 
93 | } // namespace preprocess
94 | 


--------------------------------------------------------------------------------
/preprocess/gigaword_extract.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #Extract sentences from gigaword but don't process them
 3 | set -e -o pipefail
 4 | BINDIR="$(dirname "$0")"
 5 | if [ ${#1} != 2 ]; then
 6 |   echo "Expected language on the command line." 1>&2
 7 |   exit 1
 8 | fi
 9 | $BINDIR/gigaword_unwrap | $BINDIR/../moses/ems/support/split-sentences.perl -l $1 |fgrep -v "<P>"
10 | 


--------------------------------------------------------------------------------
/preprocess/heuristics.perl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | #More preprocessing.  This assumes that process_unicode is run with at minimum --flatten 1 --normalize 1 first.  
 3 | 
 4 | use strict;
 5 | use utf8;
 6 | 
 7 | binmode STDIN, ":utf8";
 8 | binmode STDOUT, ":utf8";
 9 | binmode STDERR, ":utf8";
10 | 
11 | my $language = "en";
12 | 
13 | while (@ARGV) {
14 |   $_ = shift;
15 |   /^-l$/ && ($language = shift, next);
16 | }
17 | 
18 | while(my $eline = <STDIN>)
19 | {
20 |   chomp $eline;
21 |   $eline = " $eline ";
22 | 
23 |   #Normalize long chains of underscores to just two.
24 |   $eline =~ s/_\s*_[\s_]*/ __ /g;
25 | 
26 |   #Silja dropped * entirely.  I keep one.  Bullet points are converted to * by a Chris Dyer rule in process_unicode.   
27 |   $eline =~ s/\*\s*\*[\s\*]*/ * /g;
28 |   #Silja, originally for prepgigaword-silja.pl
29 |   $eline =~ s/#+//g;
30 |   $eline =~ s/[\!]+/!/g;
31 |   $eline =~ s/!([^ ])/! $1/g;
32 |   $eline =~ s/\.([^\s\d.])/. $1/g;
33 |   $eline =~ s/\+(\D)/+ $1/g;
34 |   $eline =~ s/(\D)\+/$1 +/g;
35 |   $eline =~ s/,(\D)/, $1/g;
36 |   $eline =~ s/(\s)-([^\s\d\-])/$1- $2/g;
37 |   $eline =~ s/^ *-- *//g;
38 |   #The next rule was botching ellipses. . . 
39 |   #$eline =~ s/\.\./ . /g;
40 | 
41 |   #Greg
42 |   #Gigaword apw does this.  
43 |   $eline =~ s/ dlrs / \$ /g;
44 |   if ($language == "fr") {
45 |     $eline =~ s/([^ -]+)-t-(je|j'|tu|il|elle|on|nous|vous|ils|elles|me|m'|te|t'|le|l'|la|les|lui|leur|moi|toi|eux|elles|ce|c'|ça|ceci|cela|qui|ci|là) /\1 -t-\2 /gi;
46 |     $eline =~ s/([^ -]+)-(je|j'|tu|il|elle|on|nous|vous|ils|elles|me|m'|te|t'|le|l'|la|les|lui|leur|moi|toi|eux|elles|ce|c'|ça|ceci|cela|qui|ci|là) /\1 -\2 /gi;
47 |     $eline =~ s/\s+(qu|c|d|l|j|s|n|m|lorsqu|puisqu)\s+'\s+/ \1' /gi;
48 |     $eline =~ s/\s+aujourd\s*'\s*hui\s+/ aujourd'hui /gi;
49 |   }
50 | 
51 |   #Chris Dyer, t2.perl
52 |   if ($language == "en") {
53 |     $eline =~ s/ élite / elite /gi;
54 |     $eline =~ s/ (s|at) & (t|p) / $1&$2 /ig;
55 |     $eline =~ s/ (full|half|part) - (time) / $1-$2 /ig;
56 |     $eline =~ s/ (vis|viz) - (.|..) - (vis|viz) / vis-à-vis /ig;
57 |     $eline =~ s/ (short|long|medium|one|half|two|on|off|in|post|ex|multi|de|mid|co|inter|intra|anti|re|pre|e|non|pro|self) - / $1- /ig;
58 | 
59 |     #kheafiel
60 |     $eline =~ s/ (ca|are|do|could|did|does|do|had|has|have|is|must|need|should|was|were|wo|would)n 't / \1n't /gi;
61 |   }
62 |   $eline =~ s/ ([AaEe][Ll]) - / \1-/g;
63 | 
64 |   if ($language != "de") {
65 |     #Take out any "words" that are longer than 50 chars
66 |     $eline =~ s/\S{50,}/-/g;
67 |   }
68 | 
69 |   $eline =~ s/\.\s*\.\s*\.\s*[\.\s]*/ ... /g;
70 |   $eline =~ s/!\s*![!\s]*/ ! /g;
71 |   $eline =~ s/\?\s*\?[\?\s]*/ ? /g;
72 |   $eline =~ s/ ' s / 's /g;
73 |   #cut multiple hyphens down to one and space separate it (single hyphens are not space separated) 
74 |   $eline =~ s/([^-])--+([^-])/$1 - $2/g;
75 | 
76 |   #Delete excess spaces:
77 |   $eline =~ s/\s+/ /g;
78 |   $eline =~ s/^\s+//;
79 |   $eline =~ s/\s+$//;
80 | 
81 |   print "$eline\n";
82 | }
83 | 
84 | 


--------------------------------------------------------------------------------
/preprocess/idf_main.cc:
--------------------------------------------------------------------------------
 1 | /* Computes inverse document frequency for each token seen in the input.  A document is a line. */
 2 | #include "util/file_piece.hh"
 3 | #include "util/murmur_hash.hh"
 4 | #include "util/pool.hh"
 5 | #include "util/probing_hash_table.hh"
 6 | #include "util/tokenize_piece.hh"
 7 | #include "util/file_stream.hh"
 8 | 
 9 | #include <cmath>
10 | #include <unordered_set>
11 | 
12 | struct Entry {
13 |   typedef uint64_t Key;
14 |   uint64_t hash;
15 | 
16 |   uint64_t GetKey() const { return hash; }
17 |   void SetKey(uint64_t to) { hash = to; }
18 | 
19 |   // Should be allocated from pool to ensure survival.
20 |   util::StringPiece str;
21 | 
22 |   uint64_t document_count;
23 | };
24 | 
25 | int main() {
26 |   uint64_t documents = 0;
27 |   util::Pool strings;
28 |   util::AutoProbing<Entry, util::IdentityHash> words;
29 |   Entry ent;
30 |   ent.document_count = 1;
31 |   for (util::StringPiece line : util::FilePiece(0)) {
32 |     ++documents;
33 |     std::unordered_set<uint64_t> seen_in_line;
34 |     for (util::TokenIter<util::BoolCharacter, true> it(line, util::kSpaces); it; ++it) {
35 |       ent.hash = util::MurmurHashNative(it->data(), it->size());
36 |       if (seen_in_line.insert(ent.hash).second) {
37 |         // Newly seen in this line.
38 |         util::AutoProbing<Entry, util::IdentityHash>::MutableIterator words_it;
39 |         if (words.FindOrInsert(ent, words_it)) {
40 |           ++(words_it->document_count);
41 |         } else {
42 |           char *data = static_cast<char*>(strings.Allocate(it->size()));
43 |           memcpy(data, it->data(), it->size());
44 |           words_it->str = util::StringPiece(data, it->size());
45 |         }
46 |       }
47 |     }
48 |   }
49 |   double documents_log = std::log(static_cast<double>(documents));
50 |   util::FileStream out(1);
51 |   for (util::AutoProbing<Entry, util::IdentityHash>::ConstIterator i = words.RawBegin(); i != words.RawEnd(); ++i) {
52 |     if (i->GetKey()) {
53 |       double count = static_cast<double>(i->document_count);
54 |       double idf = documents_log - std::log(count);
55 |       out << i->str << ' ' << idf << '\n';
56 |     }
57 |   }
58 | }
59 | 


--------------------------------------------------------------------------------
/preprocess/mmhsum_main.cc:
--------------------------------------------------------------------------------
 1 | #include "util/murmur_hash.hh"
 2 | 
 3 | #include <iostream>
 4 | #include <cstring>
 5 | #include <memory>
 6 | #include <vector>
 7 | 
 8 | int main(int argc, char *argv[]) {
 9 |   if (argc > 1) {
10 |     std::cerr << "Usage: [stdin] " << argv[0] << std::endl;
11 |     return 1;
12 |   }
13 |   
14 |   constexpr size_t bufferSize = 1024*1024;
15 |   std::vector<char> buffer(bufferSize);
16 |   uint64_t chained_hash = 0;
17 |   
18 |   while (std::cin)
19 |   {
20 |     std::cin.read(&buffer[0], bufferSize);
21 |     if(std::cin.bad()){
22 | 	    std::cerr << "Error trying to read from stdin\n";
23 | 	    return 1;
24 |     }
25 |     size_t count = std::cin.gcount();
26 |     if (!count)
27 |       break;
28 |     chained_hash = util::MurmurHashNative(&buffer[0], count, chained_hash);
29 |   }
30 |   std::cout << std::hex << chained_hash << '\n';
31 | }
32 | 


--------------------------------------------------------------------------------
/preprocess/order_independent_hash_main.cc:
--------------------------------------------------------------------------------
 1 | #include "util/murmur_hash.hh"
 2 | #include "util/file_piece.hh"
 3 | 
 4 | int main() {
 5 |   uint64_t sum = 0;
 6 |   for (util::StringPiece line : util::FilePiece(0)) {
 7 |     sum += util::MurmurHash64A(line.data(), line.size());
 8 |   }
 9 |   std::cout << sum << std::endl;
10 | }
11 | 


--------------------------------------------------------------------------------
/preprocess/parallel.hh:
--------------------------------------------------------------------------------
 1 | #ifndef PREPROCESS_PARALLEL__
 2 | #define PREPROCESS_PARALLEL__
 3 | 
 4 | #include "util/file_stream.hh"
 5 | #include "util/file_piece.hh"
 6 | 
 7 | #include <iostream>
 8 | #include <string>
 9 | #include <vector>
10 | 
11 | #include <stdint.h>
12 | 
13 | namespace preprocess {
14 | 
15 | template <class Pass, class... PassArguments> int FilterParallel(const std::vector<std::string> &files, PassArguments&&... pass_construct) {
16 |   uint64_t input = 0, output = 0;
17 |   if (files.empty()) {
18 |     Pass pass(std::forward<PassArguments>(pass_construct)...);
19 |     util::StringPiece line;
20 |     util::FilePiece in(0, NULL, &std::cerr);
21 |     util::FileStream out(1);
22 |     while (true) {
23 |       try {
24 |         line = in.ReadLine();
25 |       } catch (const util::EndOfFileException &e) { break; }
26 |       ++input;
27 |       if (pass(line)) {
28 |         out << line << '\n';
29 |         ++output;
30 |       }
31 |     }
32 |   } else if (files.size() == 4) {
33 |     Pass pass0(std::forward<PassArguments>(pass_construct)...), pass1(std::forward<PassArguments>(pass_construct)...);
34 |     util::StringPiece line0, line1;
35 |     util::FilePiece in0(files[0].c_str(), &std::cerr), in1(files[1].c_str());
36 |     util::FileStream out0(util::CreateOrThrow(files[2].c_str())), out1(util::CreateOrThrow(files[3].c_str()));
37 |     while (true) {
38 |       try {
39 |         line0 = in0.ReadLine();
40 |       } catch (const util::EndOfFileException &e) { break; }
41 |       line1 = in1.ReadLine();
42 |       ++input;
43 |       if (pass0(line0) && pass1(line1)) {
44 |         out0 << line0 << '\n';
45 |         out1 << line1 << '\n';
46 |         ++output;
47 |       }
48 |     }
49 |     try {
50 |       line1 = in1.ReadLine();
51 |       std::cerr << "Input is not balaced: " << files[1] << " has " << line1 << std::endl;
52 |       return 2;
53 |     } catch (const util::EndOfFileException &e) {}
54 |   } else {
55 |     std::cerr << 
56 |       "To filter from stdin to stdout, run without an argument.\n"
57 |       "To filter parallel files, run in0 in1 out0 out1\n";
58 |     return 1;
59 |   }
60 |   std::cerr << "Kept " << output << " / " << input << " = " << (static_cast<float>(output) / static_cast<float>(input)) << std::endl;
61 |   return 0;
62 | }
63 | 
64 | } // namespace preprocess
65 | #endif
66 | 


--------------------------------------------------------------------------------
/preprocess/process_unicode_main.cc:
--------------------------------------------------------------------------------
 1 | #include "util/utf8.hh"
 2 | #include "util/utf8_icu.hh"
 3 | 
 4 | #include <boost/program_options/options_description.hpp>
 5 | #include <boost/program_options/parsers.hpp>
 6 | #include <boost/program_options/variables_map.hpp>
 7 | 
 8 | #include <unicode/unistr.h>
 9 | #include <unicode/ustream.h>
10 | 
11 | #include <algorithm>
12 | #include <string>
13 | #include <iostream>
14 | 
15 | using U_ICU_NAMESPACE::UnicodeString;
16 | 
17 | namespace {
18 | struct Options {
19 |   std::string language;
20 |   bool lower;
21 |   bool flatten;
22 |   bool normalize;
23 | };
24 | void ParseArgs(int argc, char *argv[], Options &out) {
25 |   namespace po = boost::program_options;
26 |   po::options_description desc("Unicode treatment options");
27 |   desc.add_options()
28 |     ("language,l", po::value(&out.language)->default_value("en"), "Language (only applies to flatten)")
29 |     ("lower", po::bool_switch(&out.lower)->default_value(false), "Convert to lowercase")
30 |     ("flatten", po::bool_switch(&out.flatten)->default_value(false), "Canonicalize some characters for English")
31 |     ("normalize", po::bool_switch(&out.normalize)->default_value(false), "Normalize Unicode format");
32 |   po::variables_map vm;
33 |   po::store(po::parse_command_line(argc, argv, desc), vm);
34 |   po::notify(vm);
35 | }
36 | } // namespace
37 | 
38 | int main(int argc, char *argv[]) {
39 |   Options opt;
40 |   ParseArgs(argc, argv, opt);
41 |   util::Flatten flatten(opt.language);
42 |   std::string line, normalized;
43 |   UnicodeString str[2];
44 |   UnicodeString *cur = &str[0], *tmp = &str[1];
45 |   while (getline(std::cin, line)) {
46 |     *cur = UnicodeString::fromUTF8(line);
47 |     if (opt.lower) {
48 |       cur->toLower();
49 |     }
50 |     if (opt.flatten) {
51 |       flatten.Apply(*cur, *tmp);
52 |       std::swap(cur, tmp);
53 |     }
54 |     if (opt.normalize) {
55 |       util::Normalize(*cur, *tmp);
56 |       std::swap(cur, tmp);
57 |     }
58 |     std::cout << *str << '\n';
59 |   }
60 | }
61 | 


--------------------------------------------------------------------------------
/preprocess/remove_invalid_utf8_base64_main.cc:
--------------------------------------------------------------------------------
 1 | #include "util/file_stream.hh"
 2 | #include "util/file_piece.hh"
 3 | #include "util/utf8.hh"
 4 | 
 5 | #include "base64.hh"
 6 | 
 7 | int main() {
 8 |   util::FilePiece in(0);
 9 |   util::FileStream out(1);
10 |   util::StringPiece line;
11 |   std::string decoded;
12 |   std::string empty_base64;
13 |   preprocess::base64_encode("", empty_base64);
14 |   while (in.ReadLineOrEOF(line)) {
15 |     preprocess::base64_decode(line, decoded);
16 |     if (util::IsUTF8(decoded)) {
17 |       out << line << '\n';
18 |     } else {
19 |       out << empty_base64 << '\n';   
20 |     }
21 |   }
22 | }
23 | 


--------------------------------------------------------------------------------
/preprocess/remove_invalid_utf8_main.cc:
--------------------------------------------------------------------------------
 1 | #include "util/file_stream.hh"
 2 | #include "util/file_piece.hh"
 3 | #include "util/utf8.hh"
 4 | 
 5 | int main() {
 6 |   util::FilePiece in(0);
 7 |   util::FileStream out(1);
 8 |   util::StringPiece line;
 9 |   while (in.ReadLineOrEOF(line)) {
10 |     if (util::IsUTF8(line)) {
11 |       out << line << '\n';
12 |     }
13 |   }
14 | }
15 | 


--------------------------------------------------------------------------------
/preprocess/remove_long_lines_main.cc:
--------------------------------------------------------------------------------
 1 | #include "util/file_stream.hh"
 2 | #include "util/file_piece.hh"
 3 | 
 4 | #include <boost/lexical_cast.hpp>
 5 | #include <iostream>
 6 | 
 7 | #include <err.h>
 8 | 
 9 | int main(int argc, char *argv[]) {
10 |   std::size_t limit;
11 |   if (argc == 1) {
12 |     limit = 2000;
13 |   } else if (argc == 2) {
14 |     limit = boost::lexical_cast<std::size_t>(argv[1]);
15 |   } else {
16 |     std::cerr << "Usage: " << argv[0] << " [length limit in bytes]" << std::endl;
17 |     return 1;
18 |   }
19 |   util::FilePiece f(0, NULL, &std::cerr);
20 |   util::FileStream out(1);
21 |   try {
22 |     while (true) {
23 |       util::StringPiece l = f.ReadLine();
24 |       if (l.size() <= limit) {
25 |         out << l << '\n';
26 |       }
27 |     }
28 |   } catch (const util::EndOfFileException &e) {}
29 | }
30 | 


--------------------------------------------------------------------------------
/preprocess/resplit.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -e -o pipefail
 3 | BINDIR="$(dirname "$0")"
 4 | #Argument 1 is language
 5 | l="$1"
 6 | if [ ${#l} == 0 ]; then
 7 |   echo "Argument is language" 1>&2
 8 |   exit 1
 9 | fi
10 | sed 's/^/<P>\n/' | $BINDIR/../moses/ems/support/split-sentences.perl -l $1 |fgrep -vx "<P>"
11 | 


--------------------------------------------------------------------------------
/preprocess/shard_main.cc:
--------------------------------------------------------------------------------
  1 | #include "preprocess/fields.hh"
  2 | #include "util/buffered_stream.hh"
  3 | #include "util/threaded_buffered_stream.hh"
  4 | #include "util/file_piece.hh"
  5 | #include "util/fixed_array.hh"
  6 | #include "util/murmur_hash.hh"
  7 | 
  8 | #include <sstream>
  9 | #include <iomanip>
 10 | 
 11 | #include <boost/program_options.hpp>
 12 | #include <boost/program_options/positional_options.hpp>
 13 | 
 14 | namespace preprocess {
 15 | 
// Command-line settings for shard, populated by ParseArgs.
struct Options {
  // Key field ranges parsed from --fields and then defragmented.
  std::vector<FieldRange> key_fields;
  // Field delimiter from --delim (tab by default).
  char delim;
  // One output file name per shard, either listed explicitly or generated from --prefix and --number.
  std::vector<std::string> outputs;
  // Compression applied to every output, chosen with --compress.
  util::WriteCompressed::Compression compression;
};
 22 | 
 23 | void ParseArgs(int argc, char *argv[], Options &out) {
 24 |   namespace po = boost::program_options;
 25 |   po::options_description desc("Arguments");
 26 |   std::string fields;
 27 |   std::string prefix;
 28 |   std::string compression_string;
 29 |   unsigned int number;
 30 | 
 31 |   desc.add_options()
 32 |     ("help,h", po::bool_switch(), "Show this help message")
 33 |     ("fields,f", po::value(&fields)->default_value("1-"), "Fields to use for key like cut -f")
 34 |     ("delim,d", po::value(&out.delim)->default_value('\t'), "Field delimiter")
 35 |     ("prefix,p", po::value(&prefix), "Prefix and count of outputs")
 36 |     ("number,n", po::value(&number), "Number of shards")
 37 |     ("output,o", po::value(&out.outputs)->multitoken(), "Output file names (or just list them without -o)")
 38 |     ("compress,c", po::value(&compression_string)->default_value("none"), "Compression.  One of none, gzip, or bzip2");
 39 | 
 40 |   po::positional_options_description pd;
 41 |   pd.add("output", -1);
 42 | 
 43 |   po::variables_map vm;
 44 |   po::store(po::command_line_parser(argc, argv).options(desc).positional(pd).run(), vm);
 45 |   if (argc == 1 || vm["help"].as<bool>()) {
 46 |     std::cerr << 
 47 |       "Shards stdin into multiple files by the hash of the key.\n" <<
 48 |       "Output is specified as --prefix prefix --number n or just listing file names.\n" <<
 49 |        desc <<
 50 |       "Examples:\n" <<
 51 |       argv[0] << " a b             #Shards stdin to files a and b using the whole line as key.\n" <<
 52 |       argv[0] << " a b c           #Shards stdin to files a, b, and c using the whole line as key.\n" <<
 53 |       argv[0] << " -f 1 a b        #Shards stdin to files a and b using tab-delimited field 1.\n" <<
 54 |       argv[0] << " -d ' ' -f 1 a b #Shards stdin to files a and b using space-delimited field 1." << std::endl;
 55 |     exit(1);
 56 |   }
 57 |   po::notify(vm);
 58 | 
 59 |   ParseFields(fields.c_str(), out.key_fields);
 60 |   DefragmentFields(out.key_fields);
 61 | 
 62 |   if (out.outputs.empty()) {
 63 |     UTIL_THROW_IF2(!vm.count("prefix"), "Specify outputs using --outputs or e.g. --prefix pre --number 2");
 64 |     UTIL_THROW_IF2(!vm.count("number"), "--prefix specified but we need to know how many shards with -n");
 65 |     // How many digits will be in the 0-indexed representation?
 66 |     unsigned int digits = 0;
 67 |     for (unsigned int compare = number - 1; compare; ++digits, compare /= 10) {}
 68 |     std::ostringstream stream;
 69 |     stream << std::setfill('0') << std::setw(digits);
 70 |     for (unsigned int i = 0; i < number; ++i) {
 71 |       stream << std::setw(digits) << i;
 72 |       out.outputs.push_back(prefix + stream.str());
 73 |       stream.str(std::string());
 74 |       stream.clear();
 75 |     }
 76 |   } else {
 77 |     UTIL_THROW_IF2(vm.count("prefix"), "Specify --prefix or --output");
 78 |     UTIL_THROW_IF2(vm.count("number") && number != out.outputs.size(), "Number of outputs does not match");
 79 |   }
 80 |   if (compression_string == "none") {
 81 |     out.compression = util::WriteCompressed::NONE;
 82 |   } else if (compression_string == "gzip") {
 83 |     out.compression = util::WriteCompressed::GZIP;
 84 |   } else if (compression_string == "bzip2") {
 85 |     out.compression = util::WriteCompressed::BZIP;
 86 |   } else {
 87 |     UTIL_THROW(util::Exception, "Unknown compression algorithm " << compression_string);
 88 |   }
 89 | }
 90 | 
 91 | } // namespace preprocess
 92 | 
 93 | // Copies each line of stdin to the output file selected by the hash of the line's key fields.
 94 | int main(int argc, char *argv[]) {
 95 |   preprocess::Options options;
 96 |   preprocess::ParseArgs(argc, argv, options);
 97 |   const uint64_t shard_count = options.outputs.size();
 98 |   UTIL_THROW_IF2(!shard_count, "At least one output shard is required"); // e.g. --prefix p --number 0; avoids % 0 below
 99 |   util::FilePiece in(0);
100 |   util::StringPiece line;
101 |   util::FixedArray<util::ThreadedBufferedStream<util::WriteCompressed> > out(options.outputs.size());
102 |   for (const std::string &o : options.outputs) {
103 |     out.push_back(util::CreateOrThrow(o.c_str()), options.compression);
104 |   }
105 |   while (in.ReadLineOrEOF(line)) {
106 |     preprocess::HashCallback cb;
107 |     preprocess::RangeFields(line, options.key_fields, options.delim, cb);
108 |     out[cb.Hash() % shard_count] << line << '\n';
109 |   }
110 |   return 0;
111 | }
112 | 


--------------------------------------------------------------------------------
/preprocess/substitute_main.cc:
--------------------------------------------------------------------------------
 1 | #include "preprocess/fields.hh"
 2 | #include "util/file_stream.hh"
 3 | #include "util/file_piece.hh"
 4 | #include "util/murmur_hash.hh"
 5 | #include "util/pool.hh"
 6 | #include "util/probing_hash_table.hh"
 7 | #include <vector>
 8 | 
 9 | struct Entry {  // Record stored in the util::AutoProbing table: a 64-bit key and the value kept for it.
10 |   typedef uint64_t Key;
11 |   Key key;
12 |   util::StringPiece value;
13 |   Key GetKey() const { return key; }
14 |   void SetKey(Key to) { key = to; }
15 | };
16 | 
17 | // Callback handed to RangeFields: stores each piece it is given into consecutive slots of a caller-owned array.
18 | class RecordCallback {
19 |   public:
20 |     RecordCallback(util::StringPiece *to) : i_(to) {}
21 | 
22 |     // Store the piece in the current slot, then step to the next slot.
23 |     void operator()(util::StringPiece str) { *i_ = str; ++i_; }
24 | 
25 |     // Points one past the last slot written so far.
26 |     const util::StringPiece *Position() const { return i_; }
27 |   private:
28 |     util::StringPiece *i_;
29 | };
30 | 
31 | // Reads tab-separated lines from stdin.  The first line seen for each hash of the `sentences` segment is
32 | // copied through and its `value` segment remembered; later lines with the same hash are written with the
33 | // remembered value in place of their own `value` segment.
34 | int main() {
35 |   // Four back-to-back field ranges; every line must yield exactly one piece per range (checked below).
36 |   util::StringPiece segments[4];
37 |   util::StringPiece &sentences = segments[1], &value = segments[2], &after = segments[3];
38 |   std::vector<preprocess::FieldRange> fields(4);
39 |   fields[0].begin = 0; fields[0].end = 2;
40 |   fields[1].begin = 2; fields[1].end = 4;
41 |   fields[2].begin = 4; fields[2].end = 5;
42 |   fields[3].begin = 5; fields[3].end = preprocess::FieldRange::kInfiniteEnd;
43 | 
44 |   util::Pool string_pool;
45 |   util::FileStream out(1);
46 | 
47 |   typedef util::AutoProbing<Entry, util::IdentityHash> Table;
48 |   Table table;
49 |   for (util::StringPiece line : util::FilePiece(0)) {
50 |     RecordCallback cb(segments);
51 |     preprocess::RangeFields(line, fields, '\t', cb);
52 |     UTIL_THROW_IF2(cb.Position() != segments + 4, "Did not get all fields in line " << line);
53 |     Entry probe;
54 |     probe.key = util::MurmurHashNative(sentences.data(), sentences.size());
55 |     Table::MutableIterator slot;
56 |     if (!table.FindOrInsert(probe, slot)) {
57 |       // New key: keep a pool-owned copy of the value, since `value` is only a view of the current line.
58 |       void *copy = string_pool.Allocate(value.size());
59 |       memcpy(copy, value.data(), value.size());
60 |       slot->value = util::StringPiece(static_cast<char*>(copy), value.size());
61 |       out << line;
62 |     } else {
63 |       // Known key: the line through the end of `sentences`, then the remembered value, then the tail.
64 |       const char *key_end = sentences.data() + sentences.size();
65 |       out << util::StringPiece(line.data(), key_end - line.data());
66 |       out << '\t' << slot->value << '\t' << after;
67 |     }
68 |     out << '\n';
69 |   }
70 | }
71 | 


--------------------------------------------------------------------------------
/preprocess/subtract_lines_main.cc:
--------------------------------------------------------------------------------
 1 | #include "util/file_piece.hh"
 2 | #include "util/file_stream.hh"
 3 | #include "util/murmur_hash.hh"
 4 | #include "util/probing_hash_table.hh"
 5 | 
 6 | #include <iostream>
 7 | 
 8 | struct Entry {  // Key-only record for the util::AutoProbing table; main stores line hashes in it.
 9 |   typedef uint64_t Key;
10 |   Key key;
11 |   Key GetKey() const { return key; }
12 |   void SetKey(Key to) { key = to; }
13 | };
14 | 
15 | // Copies stdin to stdout, dropping every line whose hash equals the hash of some line in the file argv[1].
16 | int main(int argc, char *argv[]) {
17 |   if (argc != 2) {
18 |     std::cerr << "Usage: " << argv[0] << " subtract <from >output\n"
19 |       "Copies from stdin to stdout, skipping lines that appear in `subtract`.\n"
20 |       "The subtraction is approximate, based on the hash of the line.\n"
21 |       "This is set subtraction.  All copies of a line are removed.\n";
22 |     return 1;
23 |   }
24 |   typedef util::AutoProbing<Entry, util::IdentityHash> Table;
25 |   Table table;
26 |   for (util::StringPiece line : util::FilePiece(argv[1])) {  // Pass 1: record the hash of every subtraction line.
27 |     Entry entry;
28 |     entry.SetKey(util::MurmurHashNative(line.data(), line.size(), 1));
29 |     Table::MutableIterator unused;
30 |     table.FindOrInsert(entry, unused);
31 |   }
32 |   util::FileStream out(1);
33 |   for (util::StringPiece line : util::FilePiece(0)) {  // Pass 2: echo stdin lines whose hash was not recorded.
34 |     uint64_t key = util::MurmurHashNative(line.data(), line.size(), 1);
35 |     Table::ConstIterator unused;
36 |     if (table.Find(key, unused)) continue;
37 |     out << line << '\n';
38 |   }
39 | }
40 | 


--------------------------------------------------------------------------------
/preprocess/tests/cache/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Regression test for the cache tool.  ../vars sets BIN, CUR and TMP and enables `set -eo pipefail`.
3 | . "$(dirname "$0")"/../vars
4 | # With plain cat as the wrapped command, the output must equal the input.
5 | diff <("$BIN"/cache cat <"$CUR"/input) "$CUR"/input
6 | # With -t " " -k 1 the output must match space_expected (presumably a space-delimited key in field 1 -- confirm).
7 | diff <("$BIN"/cache -t " " -k 1 cat <"$CUR"/input) "$CUR"/space_expected
8 | 
9 | 


--------------------------------------------------------------------------------
/preprocess/tests/cache/space_ref.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | import sys
 3 | lines = {}
 4 | for l in sys.stdin:
 5 |   key = l[0:-1].split(' ')[0]
 6 |   if key in lines:
 7 |     sys.stdout.write(lines[key])
 8 |   else:
 9 |     lines[key] = l
10 |     sys.stdout.write(l)
11 | 


--------------------------------------------------------------------------------
/preprocess/tests/dedupe/columns:
--------------------------------------------------------------------------------
1 | 1 a
2 | 2 a
3 | 3 b
4 | 4 a
5 | 5 a
6 | 6 b
7 | 7 b
8 | 


--------------------------------------------------------------------------------
/preprocess/tests/dedupe/columns.out:
--------------------------------------------------------------------------------
1 | 1 a
2 | 3 b
3 | 


--------------------------------------------------------------------------------
/preprocess/tests/dedupe/ref.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import sys
3 | lines = set()
4 | for l in sys.stdin:
5 |   if l not in lines:
6 |     lines.add(l)
7 |     sys.stdout.write(l)
8 | 


--------------------------------------------------------------------------------
/preprocess/tests/dedupe/run.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Regression test for dedupe.  ../vars sets BIN, CUR and TMP and enables `set -eo pipefail`.
 3 | . "$(dirname "$0")"/../vars
 4 | # Filter mode: stdin to stdout must match the expected output.
 5 | diff <("$BIN/dedupe" <"$CUR/input") "$CUR/expected"
 6 | # Four file arguments: apparently two inputs (input and its per-line reversal) followed by two outputs.
 7 | "$BIN"/dedupe "$CUR"/input <(rev "$CUR"/input) "$TMP"/output0 "$TMP"/output1
 8 | diff "$CUR"/expected "$TMP"/output0
 9 | diff <(rev "$CUR"/expected) "$TMP"/output1
10 | rm "$TMP"/output0 "$TMP"/output1
11 | # -f 2 -d " " on the columns file must give columns.out, which keeps the first line for each second column.
12 | diff <("$BIN"/dedupe -f 2 -d " " <"$CUR"/columns) "$CUR"/columns.out
13 | 


--------------------------------------------------------------------------------
/preprocess/tests/foldfilter/input:
--------------------------------------------------------------------------------
1 | ../../../COPYING


--------------------------------------------------------------------------------
/preprocess/tests/foldfilter/run.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Regression test for foldfilter.  ../vars sets BIN, CUR and TMP and enables `set -eo pipefail`.
 3 | . "$(dirname "$0")"/../vars
 4 | # The input is the GPL text (COPYING), which has short columns.
 5 | # With cat as the wrapped command the output must equal the input, both by default and with -w 10.
 6 | diff <("$BIN/foldfilter" cat <"$CUR"/input) "$CUR"/input
 7 | diff <("$BIN/foldfilter" -w 10 cat <"$CUR"/input) "$CUR"/input
 8 | # tee saves what the wrapped command receives under -w 10 so it can be compared with fold10.expected.
 9 | "$BIN/foldfilter" -w 10 tee "$TMP/fold10" <"$CUR"/input >/dev/null
10 | # Line breaks are not great with leading space but it does work
11 | diff "$TMP/fold10" "$CUR/fold10.expected"
12 | rm "$TMP/fold10"
13 | 


--------------------------------------------------------------------------------
/preprocess/tests/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | CURRENT="$(dirname "$0")"
3 | set -eo pipefail
4 | for i in "$CURRENT"/*/; do
5 |   "${i}"run.sh || echo "FAILURE: ${i}" 1>&2
6 | done
7 | 


--------------------------------------------------------------------------------
/preprocess/tests/shard/input:
--------------------------------------------------------------------------------
1 | ../../../README.md


--------------------------------------------------------------------------------
/preprocess/tests/shard/run.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Regression test for shard.  ../vars sets BIN, CUR and TMP and enables `set -eo pipefail`.
 3 | # Every check verifies that the shards together hold exactly the input lines (compared after sorting).
 4 | . "$(dirname "$0")"/../vars
 5 | # Outputs given as explicit file names.
 6 | "$BIN/shard" "$TMP"/test_a "$TMP"/test_b <"$CUR"/input
 7 | diff <(sort "$TMP"/test_a "$TMP"/test_b) <(sort "$CUR"/input)
 8 | # Outputs named by --prefix and --number: four shards test0 .. test3.
 9 | "$BIN/shard" --prefix "$TMP"/test --number 4 <"$CUR"/input
10 | diff <(sort "$TMP"/test{0,1,2,3}) <(sort "$CUR"/input)
11 | # The same shards written with gzip compression.
12 | "$BIN/shard" --prefix "$TMP"/test -c gzip --number 4 <"$CUR"/input
13 | diff <(zcat "$TMP"/test{0,1,2,3} |sort) <(sort "$CUR"/input)
14 | # The same shards written with bzip2 compression.
15 | "$BIN/shard" --prefix "$TMP"/test -c bzip2 --number 4 <"$CUR"/input
16 | diff <(bzcat "$TMP"/test{0,1,2,3} |sort) <(sort "$CUR"/input)
17 | rm "$TMP"/test_a "$TMP"/test_b "$TMP"/test{0,1,2,3}
18 | 


--------------------------------------------------------------------------------
/preprocess/tests/vars:
--------------------------------------------------------------------------------
1 | # Shared setup sourced by each tests/*/run.sh; stop at the first failing command or pipeline stage.
2 | set -eo pipefail
3 | # Directory of the running test script ($0 is the script that sourced this file).
4 | CUR="$(dirname "$0")"
5 | # Directory with the binaries under test; an existing BIN from the environment takes precedence.
6 | BIN="${BIN:-"$CUR"/../../../build/bin}"
7 | # Scratch files are written next to the test itself.
8 | TMP="$CUR"
9 | 


--------------------------------------------------------------------------------
/preprocess/text.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Text pipeline from stdin to stdout: process_unicode, Moses tokenizer, heuristics.perl, punctuation normalization, optional lowercasing.
 3 | set -e -o pipefail
 4 | BINDIR="$(dirname "$0")"
 5 | #Argument 1 is language (two characters), argument 2 is lowercase (1) or not (0)
 6 | l="$1"
 7 | if [ ${#l} != 2 ]; then
 8 |   echo "Usage: \"$0 language lowercase\" where lowercase is 0 or 1." 1>&2
 9 |   exit 1
10 | fi
11 | if [ "$2" != 1 ] && [ "$2" != 0 ]; then
12 |   echo "Second argument (lowercase) should be 0 or 1" 1>&2
13 |   exit 1
14 | fi
15 | #The if statement is the last pipeline stage, so the lowercasing process_unicode pass only runs when requested.
16 | #The language is quoted everywhere so it reaches each tool verbatim, without word splitting or globbing.
17 | "$BINDIR"/process_unicode --language "$l" --flatten --normalize |"$BINDIR"/../moses/tokenizer/tokenizer.perl -l "$l" | "$BINDIR"/heuristics.perl -l "$l" | if [ "$2" == 1 ]; then
18 |   "$BINDIR"/../moses/tokenizer/normalize-punctuation.perl "$l" | "$BINDIR"/process_unicode --language "$l" --lower
19 | else
20 |   "$BINDIR"/../moses/tokenizer/normalize-punctuation.perl "$l"
21 | fi
22 | 


--------------------------------------------------------------------------------
/preprocess/train_case_main.cc:
--------------------------------------------------------------------------------
  1 | #include "util/file_stream.hh"
  2 | #include "util/file_piece.hh"
  3 | #include "util/murmur_hash.hh"
  4 | #include "util/mutable_vocab.hh"
  5 | #include "util/tokenize_piece.hh"
  6 | #include "util/utf8.hh"
  7 | #include "util/utf8_icu.hh"
  8 | 
  9 | #include <unordered_map>
 10 | 
 11 | #include <boost/lexical_cast.hpp>
 12 | 
 13 | namespace {
 14 | // Clear `to`, then fill it with the pieces of the next line of `from`, split on single spaces.
 15 | void SplitLine(util::FilePiece &from, std::vector<util::StringPiece> &to) {
 16 |   to.clear();
 17 |   util::TokenIter<util::SingleCharacter, true> piece(from.ReadLine(), ' ');
 18 |   for (; piece; ++piece) to.push_back(*piece);
 19 | }
 20 | 
 21 | // Counts how often each cased target word is recorded under a key built from the source word and the lowercased target.
 22 | class Recorder {
 23 |   public:
 24 |     void Add(util::StringPiece source, util::StringPiece target) {
 25 |       util::ToLower(target, lowered_);
 26 |       const uint64_t seed = util::MurmurHash64A(source.data(), source.size());
 27 |       const uint64_t key = util::MurmurHash64A(lowered_.data(), lowered_.size(), seed);
 28 |       ++map_[key][vocab_.FindOrInsert(target)];
 29 |     }
 30 | 
 31 |     void Dump() {
 32 |       util::FileStream out(1);
 33 |       for (const Map::value_type &row : map_) {
 34 |         out << boost::lexical_cast<std::string>(row.first);
 35 |         for (const Counts::value_type &form : row.second) {
 36 |           out << '\t' << vocab_.String(form.first) << ' ' << form.second;
 37 |         }
 38 |         out << '\n';
 39 |       }
 40 |     }
 41 |   private:
 42 |     typedef std::unordered_map<uint32_t, unsigned int> Counts;
 43 |     // map_[hash(lowered_target, hash(cased_source))][cased_target] = count(cased_source, cased_target)
 44 |     typedef std::unordered_map<uint64_t, Counts> Map;
 45 |     util::MutableVocab vocab_;
 46 |     std::string lowered_;
 47 |     Map map_;
 48 | };
 49 | 
 50 | } // namespace
 51 | 
 52 | // Usage: argv[0] alignment source target.
 53 | // Steps through the alignment file together with the source and target text, one sentence per iteration,
 54 | // passes aligned word pairs to a Recorder and finally dumps the recorded counts to stdout.
 55 | // NOTE(review): the parsing looks like it targets GIZA++-style A3 alignment files -- confirm the format.
 56 | int main(int argc, char *argv[]) {
 57 |   if (argc != 4) {
 58 |     std::cerr << "Usage: " << argv[0] << " alignment source target\n";
 59 |     return 1;
 60 |   }
 61 |   util::FilePiece align(argv[1], &std::cerr), source_file(argv[2]), target_file(argv[3]);
 62 |   std::vector<util::StringPiece> source_words, target_words;
 63 |   Recorder recorder;
 64 |   std::size_t sentence = 0, discarded = 0;
 65 |   for (; ; ++sentence) {
 66 |     // Reaching the end of the source file ends the loop.
 67 |     try {
 68 |       SplitLine(source_file, source_words);
 69 |     } catch (const util::EndOfFileException &e) { break; }
 70 |     SplitLine(target_file, target_words);
 71 |     // parse comment line
 72 |     // "# sentence pair (0) source length"
 73 |     for (unsigned int i = 0; i < 6; ++i) {
 74 |       align.ReadDelimited();
 75 |     }
 76 |     unsigned long from_length = align.ReadULong();
 77 |     align.ReadDelimited(); align.ReadDelimited(); // target length
 78 |     unsigned long to_length = align.ReadULong();
 79 |     align.ReadLine(); // comment line ending
 80 | 
 81 |     align.ReadLine(); // uncased sentence
 82 |     util::StringPiece word(align.ReadDelimited());
 83 |     UTIL_THROW_IF2("NULL" != word, "Expected NULL at the beginning, not " << word);
 84 | 
 85 |     // Discard sentence pairs whose lengths in the alignment header disagree with the text files.
 86 |     if (from_length != source_words.size() || to_length != target_words.size()) {
 87 |       align.ReadLine(); // Complete line.
 88 |       ++discarded;
 89 |       continue;
 90 |     }
 91 | 
 92 |     // Skip the rest of the NULL entry, up to and including its closing "})".
 93 |     while ("})" != align.ReadDelimited()) {}
 94 |     // Every remaining entry on the line has the form: word ({ 1-based target indices }).
 95 |     for (unsigned long from = 0; align.ReadWordSameLine(word); ++from) {
 96 |       align.ReadWordSameLine(word);
 97 |       UTIL_THROW_IF2(word != "({", "Expected ({ not " << word);
 98 |       UTIL_THROW_IF2(from >= source_words.size(), "Index " << from << " too high for source text at sentence " << sentence);
 99 |       for (align.SkipSpaces(); align.peek() != '}'; align.SkipSpaces()) {
100 |         unsigned long to = align.ReadULong() - 1 /* NULL word */;
101 |         UTIL_THROW_IF2(to >= target_words.size(), "Index " << to << " too high for target text");
102 |         // Throw out beginning of sentence.
103 |         if (from != 0 && to != 0) {
104 |           recorder.Add(source_words[from], target_words[to]);
105 |         }
106 |       }
107 |       UTIL_THROW_IF2(align.ReadDelimited() != "})", "Expected })");
108 |     }
109 |     align.ReadLine(); // Complete line.
110 |   }
111 |   std::cerr << "Discarded " << discarded << "/" << sentence << std::endl;
112 |   recorder.Dump();
113 | }
114 | 


--------------------------------------------------------------------------------
/preprocess/unescape_html.perl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl -w
 2 | # Filter: decodes HTML entities in each line of stdin and prints the result; both streams use the :utf8 layer.
 3 | binmode(STDIN, ":utf8");
 4 | binmode(STDOUT, ":utf8");
 5 | 
 6 | use HTML::Entities;
 7 | use utf8;
 8 | 
 9 | while(<STDIN>) {
10 |   $str = decode_entities($_);
11 |   # The pattern holds a literal invisible character (presumably U+FEFF -- confirm); each occurrence becomes a space.
12 |   $str =~ s/﻿/ /g;
13 |   print $str;
14 | }
15 | 


--------------------------------------------------------------------------------
/preprocess/vocab_main.cc:
--------------------------------------------------------------------------------
 1 | #include "util/file_piece.hh"
 2 | #include "util/file_stream.hh"
 3 | #include "util/murmur_hash.hh"
 4 | #include "util/probing_hash_table.hh"
 5 | 
 6 | #include <boost/unordered_set.hpp>
 7 | 
 8 | #include <iostream>
 9 | 
10 | #include <string.h>
11 | 
12 | struct Entry {  // Key-only record for the util::AutoProbing table; main stores token hashes in it.
13 |   typedef uint64_t Key;
14 |   Key key;
15 |   Key GetKey() const { return key; }
16 |   void SetKey(Key to) { key = to; }
17 | };
18 | 
19 | 
20 | // Writes each token of stdin to stdout, NUL-terminated, the first time a token with that hash is seen.
21 | int main() {
22 |   // Bytes that separate tokens: NUL, tab, CR, LF and space.
23 |   bool delimiters[256] = {};
24 |   const char kSeparators[] = {'\0', '\t', '\r', '\n', ' '};
25 |   for (char c : kSeparators) delimiters[static_cast<unsigned char>(c)] = true;
26 | 
27 |   util::AutoProbing<Entry, util::IdentityHash> seen;
28 | 
29 |   util::FilePiece in(0, "stdin", &std::cerr);
30 |   util::FileStream out(1);
31 | 
32 |   util::AutoProbing<Entry, util::IdentityHash>::MutableIterator it;
33 |   Entry entry;
34 | 
35 |   try {
36 |     for (;;) {  // Left only through the EndOfFileException caught below.
37 |       util::StringPiece word = in.ReadDelimited(delimiters);
38 |       entry.SetKey(util::MurmurHashNative(word.data(), word.size()));
39 |       // A true result is treated as "hash already present", so the token is not printed again.
40 |       if (seen.FindOrInsert(entry, it)) continue;
41 |       out << word << '\0';
42 |     }
43 |   } catch (const util::EndOfFileException &e) {}
44 | }
45 | 


--------------------------------------------------------------------------------
/preprocess/warc.cc:
--------------------------------------------------------------------------------
  1 | #include "preprocess/warc.hh"
  2 | 
  3 | #include "util/exception.hh"
  4 | #include "util/file.hh"
  5 | #include "util/compress.hh"
  6 | 
  7 | #include <cstdlib>
  8 | #include <limits>
  9 | #include <string>
 10 | #include <strings.h>
 11 | 
 12 | namespace preprocess {
 13 | 
 14 | // Appends up to kRead more bytes from reader to out.  False if input ended with out empty; throws if it ended with data pending.
 15 | bool ReadMore(util::ReadCompressed &reader, std::string &out) {
 16 |   const std::size_t kRead = 4096;
 17 |   const std::size_t had = out.size();
 18 |   out.resize(had + kRead);
 19 |   if (std::size_t got = reader.Read(&out[had], kRead)) {
 20 |     out.resize(had + got);
 21 |     return true;
 22 |   }
 23 |   // End of file
 24 |   UTIL_THROW_IF(had, util::EndOfFileException, "Unexpected end of file inside header");
 25 |   return false;
 26 | }
 27 | 
 28 | // Splits the bytes accumulating in `out` into header lines, pulling more input through ReadMore when needed.
 29 | class HeaderReader {
 30 |   public:
 31 |     HeaderReader(util::ReadCompressed &reader, std::string &out)
 32 |       : reader_(reader), out_(out), consumed_(0) {}
 33 | 
 34 |     // Sets line to the next line without its LF or CRLF; returns false if ReadMore reports end of input first.
 35 |     bool Line(util::StringPiece &line) {
 36 |       std::size_t search_from = consumed_;
 37 |       std::size_t newline = out_.find('\n', search_from);
 38 |       while (newline == std::string::npos) {
 39 |         search_from = out_.size();  // Everything before this offset has already been searched.
 40 |         if (!ReadMore(reader_, out_)) return false;
 41 |         newline = out_.find('\n', search_from);
 42 |       }
 43 |       // The line is [consumed_, newline), minus a trailing carriage return.  A blank line indicates header end.
 44 |       std::size_t length = newline - consumed_;
 45 |       if (length && out_[newline - 1] == '\r') --length;
 46 |       line = util::StringPiece(out_.data() + consumed_, length);
 47 |       consumed_ = newline + 1;
 48 |       return true;
 49 |     }
 50 | 
 51 |     // Offset in out_ just past the last line returned, terminator included.
 52 |     std::size_t Consumed() const { return consumed_; }
 53 |   private:
 54 |     util::ReadCompressed &reader_;
 55 |     std::string &out_;
 56 |     std::size_t consumed_;
 57 | };
 58 | 
 59 | // Reads the next WARC record (header, payload, trailing CRLF CRLF) into out.  False at a clean end of input; throws on malformed or truncated records.
 60 | bool WARCReader::Read(std::string &out) {
 61 |   std::swap(overhang_, out);  // Start from the bytes already read past the previous record.
 62 |   overhang_.clear();
 63 |   out.reserve(32768);
 64 |   HeaderReader header(reader_, out);
 65 |   util::StringPiece line;
 66 |   if (!header.Line(line)) return false;
 67 |   UTIL_THROW_IF(line != "WARC/1.0", util::Exception, "Expected WARC/1.0 header but got `" << line << '\'');
 68 |   std::size_t length = 0;
 69 |   bool seen_content_length = false;
 70 |   const char kContentLength[] = "Content-Length:";
 71 |   const std::size_t kContentLengthLength = sizeof(kContentLength) - 1;
 72 |   while (!line.empty()) {
 73 |     UTIL_THROW_IF(!header.Line(line), util::EndOfFileException, "WARC ended in header.");
 74 |     if (line.size() >= kContentLengthLength && !strncasecmp(line.data(), kContentLength, kContentLengthLength)) {
 75 |       UTIL_THROW_IF2(seen_content_length, "Two Content-Length headers?");
 76 |       seen_content_length = true;
 77 |       char *end;
 78 |       const long long parsed = std::strtoll(line.data() + kContentLengthLength, &end, 10);
 79 |       // TODO: tolerate whitespace?  Negative lengths are refused: as size_t they would request an enormous resize.
 80 |       UTIL_THROW_IF2(end != line.data() + line.size() || parsed < 0, "Content-Length parse error in `" << line << '\'');
 81 |       length = static_cast<std::size_t>(parsed);
 82 |     }
 83 |   }
 84 |   UTIL_THROW_IF2(!seen_content_length, "No Content-Length: header in " << out);
 85 |   std::size_t total_length = header.Consumed() + length + 4 /* CRLF CRLF after data as specified in the standard. */;
 86 |   if (total_length < out.size()) {
 87 |     overhang_.assign(out.data() + total_length, out.size() - total_length);
 88 |     out.resize(total_length);
 89 |   } else {
 90 |     std::size_t start = out.size();
 91 |     out.resize(total_length);
 92 |     while (start != out.size()) {
 93 |       std::size_t got = reader_.Read(&out[start], out.size() - start);
 94 |       UTIL_THROW_IF(!got, util::EndOfFileException, "Unexpected end of file while reading content of length " << length);
 95 |       start += got;
 96 |     }
 97 |   }
 98 |   UTIL_THROW_IF2(util::StringPiece(out.data() + out.size() - 4, 4) != util::StringPiece("\r\n\r\n", 4), "End of WARC record missing CRLF CRLF");
 99 |   return true;
100 | }
101 | 
102 | } // namespace preprocess
103 | 


--------------------------------------------------------------------------------
/preprocess/warc.hh:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "util/compress.hh"
 4 | 
 5 | #include <string>
 6 | 
 7 | namespace preprocess {
 8 | 
 9 | class WARCReader {  // Reads WARC records one at a time from fd through util::ReadCompressed.
10 |   public:
11 |     explicit WARCReader(int fd) : reader_(fd) {}
12 |     // Defined in warc.cc: replaces out with the next record; returns false when the input ends cleanly.
13 |     bool Read(std::string &out);
14 | 
15 |   private:
16 |     util::ReadCompressed reader_;
17 |     // Bytes read beyond the end of the record last returned; they begin the next record.
18 |     std::string overhang_;
19 | };
20 | 
21 | } // namespace preprocess
22 | 


--------------------------------------------------------------------------------
/util/buffered_stream.hh:
--------------------------------------------------------------------------------
 1 | /* A buffered output stream.
 2 |  * The Writer class has this interface.
 3 |  * class Writer {
 4 |  *  public:
 5 |  *   void write(const void *data, size_t amount);
 6 |  *   void flush();
 7 |  * };
 8 |  */
 9 | #ifndef UTIL_BUFFERED_STREAM_H
10 | #define UTIL_BUFFERED_STREAM_H
11 | 
12 | #include "util/fake_ostream.hh"
13 | #include "util/file.hh"
14 | #include "util/scoped.hh"
15 | 
16 | #include <cassert>
17 | #include <cstring>
18 | 
19 | #include <stdint.h>
20 | 
21 | namespace util {
22 | // Output stream that gathers small writes in a kBufferSize-byte buffer and hands them to a Writer in bulk.
23 | template <class Writer> class BufferedStream : public FakeOStream<BufferedStream<Writer> > {
24 |   public:
25 |     const std::size_t kBufferSize = std::max<size_t>(8192, kToStringMaxBytes);
26 |     // All constructor arguments are forwarded to the Writer.
27 |     template <typename... Args> explicit BufferedStream(Args&&... args)
28 |       : buf_(kBufferSize),
29 |         current_(static_cast<char*>(buf_.get())),
30 |         end_(current_ + kBufferSize),
31 |         writer_(std::forward<Args>(args)...) {}
32 |     /* The source of the move is left in an unusable state that can only be destroyed.
33 |      * The writer is moved along with the buffer so pending bytes keep their destination. */
34 | #if __cplusplus >= 201103L
35 |     BufferedStream(BufferedStream &&from) noexcept
36 |       : buf_(std::move(from.buf_)), current_(from.current_), end_(from.end_), writer_(std::move(from.writer_)) {
37 |       from.end_ = reinterpret_cast<char*>(from.buf_.get());
38 |       from.current_ = from.end_;
39 |     }
40 | #endif
41 | 
42 |     ~BufferedStream() { flush(); }
43 |     // Hand any buffered bytes to the Writer, then flush the Writer itself.
44 |     BufferedStream<Writer> &flush() {
45 |       SpillBuffer();
46 |       writer_.flush();
47 |       return *this;
48 |     }
49 | 
50 |     // For writes of arbitrary size.
51 |     BufferedStream<Writer> &write(const void *data, std::size_t length) {
52 |       // Compare sizes, not pointers: current_ + length need not be a valid pointer for large writes.
53 |       if (UTIL_LIKELY(length <= static_cast<std::size_t>(end_ - current_))) {
54 |         std::memcpy(current_, data, length);
55 |         current_ += length;
56 |         return *this;
57 |       }
58 |       SpillBuffer();
59 |       if (length <= static_cast<std::size_t>(end_ - current_)) {
60 |         std::memcpy(current_, data, length);
61 |         current_ += length;
62 |       } else {
63 |         writer_.write(data, length);  // Does not fit even in the emptied buffer: pass it straight through.
64 |       }
65 |       return *this;
66 |     }
67 |   private:
68 |     friend class FakeOStream<BufferedStream<Writer> >;
69 |     // For writes directly to buffer guaranteed to have amount < buffer size.
70 |     char *Ensure(std::size_t amount) {
71 |       if (UTIL_UNLIKELY(current_ + amount > end_)) {
72 |         SpillBuffer();
73 |         assert(current_ + amount <= end_);
74 |       }
75 |       return current_;
76 |     }
77 |     // Moves the write position to `to` (presumably after a direct write into the space from Ensure -- confirm).
78 |     void AdvanceTo(char *to) {
79 |       current_ = to;
80 |       assert(current_ <= end_);
81 |     }
82 |     // Pass the buffered bytes, if any, to the Writer and rewind to the start of the buffer.
83 |     void SpillBuffer() {
84 |       if (current_ != buf_.get()) {
85 |         writer_.write(buf_.get(), current_ - (char*)buf_.get());
86 |         current_ = static_cast<char*>(buf_.get());
87 |       }
88 |     }
89 | 
90 |     util::scoped_malloc buf_;
91 |     char *current_, *end_;
92 |     Writer writer_;
93 | };
94 | 
95 | } // namespace util
96 | 
97 | #endif
98 | 


--------------------------------------------------------------------------------
/util/cat_compressed_main.cc:
--------------------------------------------------------------------------------
 1 | // Like cat but interprets compressed files.
 2 | #include "util/file.hh"
 3 | #include "util/read_compressed.hh"
 4 | 
 5 | #include <cstring>
 6 | #include <iostream>
 7 | 
 8 | namespace {
 9 | const std::size_t kBufSize = 16384;
10 | // Streams everything `from` yields to file descriptor `to`, at most kBufSize bytes per read.
11 | void Copy(util::ReadCompressed &from, int to) {
12 |   util::scoped_malloc buffer(util::MallocOrThrow(kBufSize));
13 |   for (std::size_t amount; (amount = from.Read(buffer.get(), kBufSize)) != 0; )
14 |     util::WriteOrThrow(to, buffer.get(), amount);
15 | }
16 | } // namespace
17 | 
18 | // cat for possibly compressed input: copies each named file, or stdin when no file is named, to stdout.
19 | // -h or --help before any "--" prints usage and returns 1.  The first "--" ends option scanning and is
20 | // not opened as a file, so files named -h or --help can follow it.  Errors go to stderr with status 2.
21 | int main(int argc, char *argv[]) {
22 |   // Lane Schwartz likes -h and --help
23 |   int separator = argc; // Index of the first "--", or argc when there is none.
24 |   for (int i = 1; i < argc; ++i) {
25 |     char *arg = argv[i];
26 |     if (!strcmp(arg, "--")) {
27 |       separator = i;
28 |       break;
29 |     }
30 |     if (!strcmp(arg, "-h") || !strcmp(arg, "--help")) {
31 |       std::cerr <<
32 |         "A cat implementation that interprets compressed files.\n"
33 |         "Usage: " << argv[0] << " [file1] [file2] ...\n"
34 |         "If no file is provided, then stdin is read.\n";
35 |       return 1;
36 |     }
37 |   }
38 | 
39 |   try {
40 |     bool copied_file = false;
41 |     for (int i = 1; i < argc; ++i) {
42 |       if (i == separator) continue;
43 |       util::ReadCompressed in(util::OpenReadOrThrow(argv[i]));
44 |       Copy(in, 1);
45 |       copied_file = true;
46 |     }
47 |     if (!copied_file) {
48 |       util::ReadCompressed in(0);
49 |       Copy(in, 1);
50 |     }
51 |   } catch (const std::exception &e) {
52 |     std::cerr << e.what() << std::endl;
53 |     return 2;
54 |   }
55 |   return 0;
56 | }
57 | 


--------------------------------------------------------------------------------
/util/compress.hh:
--------------------------------------------------------------------------------
  1 | #ifndef UTIL_COMPRESS_H
  2 | #define UTIL_COMPRESS_H
  3 | 
  4 | #include "util/exception.hh"
  5 | #include "util/file.hh"
  6 | #include "util/scoped.hh"
  7 | 
  8 | #include <cstddef>
  9 | #include <stdint.h>
 10 | #include <string>
 11 | 
 12 | namespace util {
 13 | 
 14 | class CompressedException : public Exception {  // Common base class of GZException, BZException and XZException.
 15 |   public:
 16 |     CompressedException() throw();
 17 |     virtual ~CompressedException() throw();
 18 | };
 19 | 
 20 | class GZException : public CompressedException {  // NOTE(review): presumably raised for gzip errors -- confirm in the .cc.
 21 |   public:
 22 |     GZException() throw();
 23 |     ~GZException() throw();
 24 | };
 25 | 
 26 | class BZException : public CompressedException {  // NOTE(review): presumably raised for bzip2 errors -- confirm in the .cc.
 27 |   public:
 28 |     BZException() throw();
 29 |     ~BZException() throw();
 30 | };
 31 | 
 32 | class XZException : public CompressedException {  // NOTE(review): presumably raised for xz errors -- confirm in the .cc.
 33 |   public:
 34 |     XZException() throw();
 35 |     ~XZException() throw();
 36 | };
 37 | 
 38 | class ReadCompressed;
 39 | 
 40 | class ReadBase {  // Abstract reading backend; ReadCompressed holds one in internal_.
 41 |   public:
 42 |     virtual ~ReadBase() {}
 43 |     // thunk is the ReadCompressed on whose behalf the read happens.
 44 |     virtual std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) = 0;
 45 | 
 46 |   protected:
 47 |     static void ReplaceThis(ReadBase *with, ReadCompressed &thunk);
 48 | 
 49 |     ReadBase *Current(ReadCompressed &thunk);
 50 |     // NOTE(review): presumably a reference to thunk's raw_amount_ (ReadBase is its friend) -- confirm in the .cc.
 51 |     static uint64_t &ReadCount(ReadCompressed &thunk);
 52 | };
 53 | 
 54 | class ReadCompressed {  // Reader over a file descriptor or istream; holds a ReadBase backend in internal_.
 55 |   public:
 56 |     static const std::size_t kMagicSize = 6;
 57 |     // Must have at least kMagicSize bytes.
 58 |     static bool DetectCompressedMagic(const void *from);
 59 | 
 60 |     // Takes ownership of fd.
 61 |     explicit ReadCompressed(int fd);
 62 | 
 63 |     // Try to avoid using this.  Use the fd instead.
 64 |     // There is no decompression support for istreams.
 65 |     explicit ReadCompressed(std::istream &in);
 66 | 
 67 |     // Must call Reset later.
 68 |     ReadCompressed();
 69 | 
 70 |     // Takes ownership of fd.
 71 |     void Reset(int fd);
 72 | 
 73 |     // Same advice as the constructor.
 74 |     void Reset(std::istream &in);
 75 |     // Single read into to; compare ReadOrEOF below, which repeats reads until the buffer is full or EOF.
 76 |     std::size_t Read(void *to, std::size_t amount);
 77 | 
 78 |     // Repeatedly call read to fill a buffer unless EOF is hit.
 79 |     // Return number of bytes read.
 80 |     std::size_t ReadOrEOF(void *const to, std::size_t amount);
 81 |     // NOTE(review): presumably the number of raw (still compressed) input bytes consumed -- confirm in the .cc.
 82 |     uint64_t RawAmount() const { return raw_amount_; }
 83 | 
 84 |   private:
 85 |     friend class ReadBase;
 86 | 
 87 |     scoped_ptr<ReadBase> internal_;
 88 | 
 89 |     uint64_t raw_amount_;
 90 | };
 91 | 
 92 | class WriteBase {  // Abstract writing backend; WriteCompressed holds one in backend_.
 93 |   public:
 94 |     virtual ~WriteBase();
 95 | 
 96 |     virtual void write(const void *data, std::size_t amount) = 0;
 97 | 
 98 |     virtual void flush() = 0;
 99 |  
100 |   protected:
101 |     WriteBase();  // Protected: only derived backends construct the base.
102 | };
103 | 
104 | /* Writer to a file descriptor with selectable compression.  XZIP is in the enum but currently missing. */
105 | class WriteCompressed {
106 |   public:
107 |     enum Compression { NONE, GZIP, BZIP, XZIP };
108 |     // Takes ownership of fd.
109 |     explicit WriteCompressed(int fd, Compression compression);
110 | 
111 |     ~WriteCompressed();
112 | 
113 |     void write(const void *data, std::size_t amount);
114 | 
115 |     void flush();
116 | 
117 |   private:
118 |     scoped_ptr<WriteBase> backend_;
119 | };
120 | 
121 | // Very basic gzip compression support: compresses `from` into `to`.  Normally this would involve streams
122 | // but I needed the compression in the thread with fused output.  (level is presumably the zlib level -- confirm.)
123 | void GZCompress(StringPiece from, std::string &to, int level = 9);
124 | 
125 | } // namespace util
126 | 
127 | #endif // UTIL_COMPRESS_H
128 | 


--------------------------------------------------------------------------------
/util/double-conversion/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # This CMake file was created by Lane Schwartz <dowobeha@gmail.com>
 2 | 
 3 | # Explicitly list the source files for this subdirectory
 4 | #
 5 | # If you add any source files to this subdirectory
 6 | #    that should be included in the kenlm library,
 7 | #        (this excludes any unit test files)
 8 | #    you should add them to the following list:
 9 | #
10 | # In order to allow CMake files in the parent directory
11 | #    to see this variable definition, we set PARENT_SCOPE.
12 | #
13 | # In order to set correct paths to these files
14 | #    when this variable is referenced by CMake files in the parent directory,
15 | #    we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}.
16 | # The list below is stored in PREPROCESS_UTIL_DOUBLECONVERSION_SOURCE.
17 | set(PREPROCESS_UTIL_DOUBLECONVERSION_SOURCE
18 | 		${CMAKE_CURRENT_SOURCE_DIR}/bignum-dtoa.cc
19 | 		${CMAKE_CURRENT_SOURCE_DIR}/bignum.cc
20 | 		${CMAKE_CURRENT_SOURCE_DIR}/cached-powers.cc
21 | 		${CMAKE_CURRENT_SOURCE_DIR}/diy-fp.cc
22 | 		${CMAKE_CURRENT_SOURCE_DIR}/double-conversion.cc
23 | 		${CMAKE_CURRENT_SOURCE_DIR}/fast-dtoa.cc
24 | 		${CMAKE_CURRENT_SOURCE_DIR}/fixed-dtoa.cc
25 | 		${CMAKE_CURRENT_SOURCE_DIR}/strtod.cc
26 | 	PARENT_SCOPE)
27 | 
28 | 


--------------------------------------------------------------------------------
/util/double-conversion/Jamfile:
--------------------------------------------------------------------------------
1 | fakelib double-conversion : [ glob *.cc ] : : : <include>. ;  # Sources are every *.cc matched by glob; <include>. is passed in the last field (NOTE(review): presumably usage requirements -- confirm against the fakelib rule).
2 | 


--------------------------------------------------------------------------------
/util/double-conversion/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright 2006-2011, the V8 project authors. All rights reserved.
 2 | Redistribution and use in source and binary forms, with or without
 3 | modification, are permitted provided that the following conditions are
 4 | met:
 5 | 
 6 |     * Redistributions of source code must retain the above copyright
 7 |       notice, this list of conditions and the following disclaimer.
 8 |     * Redistributions in binary form must reproduce the above
 9 |       copyright notice, this list of conditions and the following
10 |       disclaimer in the documentation and/or other materials provided
11 |       with the distribution.
12 |     * Neither the name of Google Inc. nor the names of its
13 |       contributors may be used to endorse or promote products derived
14 |       from this software without specific prior written permission.
15 | 
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 


--------------------------------------------------------------------------------
/util/double-conversion/bignum-dtoa.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2010 the V8 project authors. All rights reserved.
 2 | // Redistribution and use in source and binary forms, with or without
 3 | // modification, are permitted provided that the following conditions are
 4 | // met:
 5 | //
 6 | //     * Redistributions of source code must retain the above copyright
 7 | //       notice, this list of conditions and the following disclaimer.
 8 | //     * Redistributions in binary form must reproduce the above
 9 | //       copyright notice, this list of conditions and the following
10 | //       disclaimer in the documentation and/or other materials provided
11 | //       with the distribution.
12 | //     * Neither the name of Google Inc. nor the names of its
13 | //       contributors may be used to endorse or promote products derived
14 | //       from this software without specific prior written permission.
15 | //
16 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 
28 | #ifndef DOUBLE_CONVERSION_BIGNUM_DTOA_H_
29 | #define DOUBLE_CONVERSION_BIGNUM_DTOA_H_
30 | 
31 | #include "utils.h"
32 | 
33 | namespace double_conversion {
34 | 
 35 | enum BignumDtoaMode {  // Mode argument of BignumDtoa.
 36 |   // Return the shortest correct representation.
 37 |   // For example the output of 0.299999999999999988897 is (the less accurate but
 38 |   // correct) 0.3.
 39 |   BIGNUM_DTOA_SHORTEST,
 40 |   // Same as BIGNUM_DTOA_SHORTEST but for single-precision floats.
 41 |   BIGNUM_DTOA_SHORTEST_SINGLE,
 42 |   // Return a fixed number of digits after the decimal point.
 43 |   // For instance fixed(0.1, 4) becomes 0.1000.
 44 |   // If the input number is big, the output will be big.
 45 |   BIGNUM_DTOA_FIXED,
 46 |   // Return a fixed number of digits, no matter what the exponent is.
 47 |   BIGNUM_DTOA_PRECISION
 48 | };
49 | 
 50 | // Converts the given double 'v' to ascii.
 51 | // The result should be interpreted as buffer * 10^(point-length).
 52 | // The buffer will be null-terminated.
 53 | //
 54 | // The input v must be > 0 and must be neither NaN nor Infinity.
 55 | //
 56 | // The output depends on the given mode:
 57 | //  - SHORTEST: produce the least amount of digits for which the internal
 58 | //   identity requirement is still satisfied. If the digits are printed
 59 | //   (together with the correct exponent) then reading this number will give
 60 | //   'v' again. The buffer will choose the representation that is closest to
 61 | //   'v'. If there are two at the same distance, then the number is rounded up.
 62 | //   In this mode the 'requested_digits' parameter is ignored.
 63 | //  - FIXED: produces digits necessary to print a given number with
 64 | //   'requested_digits' digits after the decimal point. The produced digits
 65 | //   might be too short in which case the caller has to fill the gaps with '0's.
 66 | //   Example: toFixed(0.001, 5) is allowed to return buffer="1", point=-2.
 67 | //   Halfway cases are rounded up. The call toFixed(0.15, 2) thus returns
 68 | //     buffer="2", point=0.
 69 | //   Note: the length of the returned buffer has no meaning wrt the significance
 70 | //   of its digits. That is, just because it contains '0's does not mean that
 71 | //   any other digit would not satisfy the internal identity requirement.
 72 | //  - PRECISION: produces 'requested_digits' where the first digit is not '0'.
 73 | //   Even though the length of produced digits usually equals
 74 | //   'requested_digits', the function is allowed to return fewer digits, in
 75 | //   which case the caller has to fill the missing digits with '0's.
 76 | //   Halfway cases are again rounded up.
 77 | // 'BignumDtoa' expects the given buffer to be big enough to hold all digits
 78 | // and a terminating null-character.
 79 | void BignumDtoa(double v, BignumDtoaMode mode, int requested_digits,
 80 |                 Vector<char> buffer, int* length, int* point);
81 | 
82 | }  // namespace double_conversion
83 | 
84 | #endif  // DOUBLE_CONVERSION_BIGNUM_DTOA_H_
85 | 


--------------------------------------------------------------------------------
/util/double-conversion/cached-powers.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2010 the V8 project authors. All rights reserved.
 2 | // Redistribution and use in source and binary forms, with or without
 3 | // modification, are permitted provided that the following conditions are
 4 | // met:
 5 | //
 6 | //     * Redistributions of source code must retain the above copyright
 7 | //       notice, this list of conditions and the following disclaimer.
 8 | //     * Redistributions in binary form must reproduce the above
 9 | //       copyright notice, this list of conditions and the following
10 | //       disclaimer in the documentation and/or other materials provided
11 | //       with the distribution.
12 | //     * Neither the name of Google Inc. nor the names of its
13 | //       contributors may be used to endorse or promote products derived
14 | //       from this software without specific prior written permission.
15 | //
16 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 
28 | #ifndef DOUBLE_CONVERSION_CACHED_POWERS_H_
29 | #define DOUBLE_CONVERSION_CACHED_POWERS_H_
30 | 
31 | #include "diy-fp.h"
32 | 
33 | namespace double_conversion {
34 | 
35 | class PowersOfTenCache {  // Static-only lookup interface for cached powers of ten, returned as DiyFp.
36 |  public:
37 | 
38 |   // Not all powers of ten are cached. The decimal exponent of two neighboring
39 |   // cached numbers will differ by kDecimalExponentDistance.
40 |   static const int kDecimalExponentDistance;
41 | 
42 |   static const int kMinDecimalExponent;
43 |   static const int kMaxDecimalExponent;
44 | 
45 |   // Returns a cached power-of-ten with a binary exponent in the range
46 |   // [min_exponent; max_exponent] (boundaries included).
47 |   static void GetCachedPowerForBinaryExponentRange(int min_exponent,
48 |                                                    int max_exponent,
49 |                                                    DiyFp* power,
50 |                                                    int* decimal_exponent);
51 | 
52 |   // Returns a cached power of ten x ~= 10^k such that
53 |   //   k <= requested_exponent < k + kDecimalExponentDistance.
54 |   // The given requested_exponent must satisfy
55 |   //   kMinDecimalExponent <= requested_exponent, and
56 |   //   requested_exponent < kMaxDecimalExponent + kDecimalExponentDistance.
57 |   static void GetCachedPowerForDecimalExponent(int requested_exponent,
58 |                                                DiyFp* power,
59 |                                                int* found_exponent);
60 | };
61 | 
62 | }  // namespace double_conversion
63 | 
64 | #endif  // DOUBLE_CONVERSION_CACHED_POWERS_H_
65 | 


--------------------------------------------------------------------------------
/util/double-conversion/diy-fp.cc:
--------------------------------------------------------------------------------
 1 | // Copyright 2010 the V8 project authors. All rights reserved.
 2 | // Redistribution and use in source and binary forms, with or without
 3 | // modification, are permitted provided that the following conditions are
 4 | // met:
 5 | //
 6 | //     * Redistributions of source code must retain the above copyright
 7 | //       notice, this list of conditions and the following disclaimer.
 8 | //     * Redistributions in binary form must reproduce the above
 9 | //       copyright notice, this list of conditions and the following
10 | //       disclaimer in the documentation and/or other materials provided
11 | //       with the distribution.
12 | //     * Neither the name of Google Inc. nor the names of its
13 | //       contributors may be used to endorse or promote products derived
14 | //       from this software without specific prior written permission.
15 | //
16 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 
28 | 
29 | #include "diy-fp.h"
30 | #include "utils.h"
31 | 
32 | namespace double_conversion {
33 | 
 34 | void DiyFp::Multiply(const DiyFp& other) {
 35 |   // Simply "emulates" a 128 bit multiplication.
 36 |   // However: the resulting number only contains 64 bits. The least
 37 |   // significant 64 bits are only used for rounding the most significant 64
 38 |   // bits.
 39 |   const uint64_t kM32 = 0xFFFFFFFFU;  // Mask selecting the low 32 bits.
 40 |   uint64_t a = f_ >> 32;  // High half of this significand.
 41 |   uint64_t b = f_ & kM32;  // Low half of this significand.
 42 |   uint64_t c = other.f_ >> 32;  // High half of the other significand.
 43 |   uint64_t d = other.f_ & kM32;  // Low half of the other significand.
 44 |   uint64_t ac = a * c;
 45 |   uint64_t bc = b * c;
 46 |   uint64_t ad = a * d;
 47 |   uint64_t bd = b * d;
 48 |   uint64_t tmp = (bd >> 32) + (ad & kM32) + (bc & kM32);  // Sum of the terms landing in bits 32..63 of the product.
 49 |   // By adding 1U << 31 to tmp we round the final result.
 50 |   // Halfway cases will be rounded up.
 51 |   tmp += 1U << 31;
 52 |   uint64_t result_f = ac + (ad >> 32) + (bc >> 32) + (tmp >> 32);  // Upper 64 bits of the product, including the carry from tmp.
 53 |   e_ += other.e_ + 64;  // + 64 because only the upper 64 bits of the product are kept.
 54 |   f_ = result_f;
 55 | }
56 | 
57 | }  // namespace double_conversion
58 | 


--------------------------------------------------------------------------------
/util/double-conversion/diy-fp.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2010 the V8 project authors. All rights reserved.
  2 | // Redistribution and use in source and binary forms, with or without
  3 | // modification, are permitted provided that the following conditions are
  4 | // met:
  5 | //
  6 | //     * Redistributions of source code must retain the above copyright
  7 | //       notice, this list of conditions and the following disclaimer.
  8 | //     * Redistributions in binary form must reproduce the above
  9 | //       copyright notice, this list of conditions and the following
 10 | //       disclaimer in the documentation and/or other materials provided
 11 | //       with the distribution.
 12 | //     * Neither the name of Google Inc. nor the names of its
 13 | //       contributors may be used to endorse or promote products derived
 14 | //       from this software without specific prior written permission.
 15 | //
 16 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 17 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 18 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 19 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 20 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 21 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 22 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 23 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 24 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 25 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 26 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 27 | 
 28 | #ifndef DOUBLE_CONVERSION_DIY_FP_H_
 29 | #define DOUBLE_CONVERSION_DIY_FP_H_
 30 | 
 31 | #include "utils.h"
 32 | 
 33 | namespace double_conversion {
 34 | 
 35 | // This "Do It Yourself Floating Point" class implements a floating-point number
 36 | // with a uint64 significand and an int exponent. Normalized DiyFp numbers will
 37 | // have the most significant bit of the significand set.
 38 | // Multiplication and Subtraction do not normalize their results.
 39 | // DiyFp are not designed to contain special doubles (NaN and Infinity).
 40 | class DiyFp {
 41 |  public:
 42 |   static const int kSignificandSize = 64;
 43 | 
 44 |   DiyFp() : f_(0), e_(0) {}
 45 |   DiyFp(uint64_t f, int e) : f_(f), e_(e) {}
 46 | 
 47 |   // this = this - other.
 48 |   // The exponents of both numbers must be the same and the significand of this
 49 |   // must not be smaller than the significand of other (both are ASSERTed).
 50 |   // The result will not be normalized.
 51 |   void Subtract(const DiyFp& other) {
 52 |     ASSERT(e_ == other.e_);
 53 |     ASSERT(f_ >= other.f_);
 54 |     f_ -= other.f_;
 55 |   }
 56 | 
 57 |   // Returns a - b.
 58 |   // The exponents of both numbers must be the same and a must not be smaller
 59 |   // than b (see Subtract). The result will not be normalized.
 60 |   static DiyFp Minus(const DiyFp& a, const DiyFp& b) {
 61 |     DiyFp result = a;
 62 |     result.Subtract(b);
 63 |     return result;
 64 |   }
 65 | 
 66 | 
 67 |   // this = this * other.
 68 |   void Multiply(const DiyFp& other);
 69 | 
 70 |   // Returns a * b (computed with Multiply on a copy of a).
 71 |   static DiyFp Times(const DiyFp& a, const DiyFp& b) {
 72 |     DiyFp result = a;
 73 |     result.Multiply(b);
 74 |     return result;
 75 |   }
 76 |   // Shifts f_ left until its top bit (kUint64MSB) is set, lowering e_ by the shift count. f_ must be non-zero.
 77 |   void Normalize() {
 78 |     ASSERT(f_ != 0);
 79 |     uint64_t f = f_;
 80 |     int e = e_;
 81 | 
 82 |     // This method is mainly called for normalizing boundaries. In general
 83 |     // boundaries need to be shifted by 10 bits. We thus optimize for this case.
 84 |     const uint64_t k10MSBits = UINT64_2PART_C(0xFFC00000, 00000000);
 85 |     while ((f & k10MSBits) == 0) {
 86 |       f <<= 10;
 87 |       e -= 10;
 88 |     }
 89 |     while ((f & kUint64MSB) == 0) {  // Finish one bit at a time.
 90 |       f <<= 1;
 91 |       e--;
 92 |     }
 93 |     f_ = f;
 94 |     e_ = e;
 95 |   }
 96 |   // Returns a normalized copy of a; a itself is left unchanged.
 97 |   static DiyFp Normalize(const DiyFp& a) {
 98 |     DiyFp result = a;
 99 |     result.Normalize();
100 |     return result;
101 |   }
102 | 
103 |   uint64_t f() const { return f_; }  // Significand.
104 |   int e() const { return e_; }  // Exponent.
105 | 
106 |   void set_f(uint64_t new_value) { f_ = new_value; }
107 |   void set_e(int new_value) { e_ = new_value; }
108 | 
109 |  private:
110 |   static const uint64_t kUint64MSB = UINT64_2PART_C(0x80000000, 00000000);  // Mask Normalize() tests to detect a set top bit.
111 | 
112 |   uint64_t f_;
113 |   int e_;
114 | };
115 | 
116 | }  // namespace double_conversion
117 | 
118 | #endif  // DOUBLE_CONVERSION_DIY_FP_H_
119 | 


--------------------------------------------------------------------------------
/util/double-conversion/fast-dtoa.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2010 the V8 project authors. All rights reserved.
 2 | // Redistribution and use in source and binary forms, with or without
 3 | // modification, are permitted provided that the following conditions are
 4 | // met:
 5 | //
 6 | //     * Redistributions of source code must retain the above copyright
 7 | //       notice, this list of conditions and the following disclaimer.
 8 | //     * Redistributions in binary form must reproduce the above
 9 | //       copyright notice, this list of conditions and the following
10 | //       disclaimer in the documentation and/or other materials provided
11 | //       with the distribution.
12 | //     * Neither the name of Google Inc. nor the names of its
13 | //       contributors may be used to endorse or promote products derived
14 | //       from this software without specific prior written permission.
15 | //
16 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 
28 | #ifndef DOUBLE_CONVERSION_FAST_DTOA_H_
29 | #define DOUBLE_CONVERSION_FAST_DTOA_H_
30 | 
31 | #include "utils.h"
32 | 
33 | namespace double_conversion {
34 | 
35 | enum FastDtoaMode {  // Mode argument of FastDtoa.
36 |   // Computes the shortest representation of the given input. The returned
37 |   // result will be the most accurate number of this length. Longer
38 |   // representations might be more accurate.
39 |   FAST_DTOA_SHORTEST,
40 |   // Same as FAST_DTOA_SHORTEST but for single-precision floats.
41 |   FAST_DTOA_SHORTEST_SINGLE,
42 |   // Computes a representation where the precision (number of digits) is
43 |   // given as input. The precision is independent of the decimal point.
44 |   FAST_DTOA_PRECISION
45 | };
46 | 
47 | // FastDtoa will produce at most kFastDtoaMaximalLength digits. This does not
48 | // include the terminating '\0' character.
49 | static const int kFastDtoaMaximalLength = 17;
50 | // Same limit, for single-precision numbers.
51 | static const int kFastDtoaMaximalSingleLength = 9;
52 | 
53 | // Provides a decimal representation of v (the parameter 'd' below).
54 | // The result should be interpreted as buffer * 10^(point - length), where
55 | // 'point' is the value stored in *decimal_point.
56 | // Precondition:
57 | //   * v must be a strictly positive finite double.
58 | //
59 | // Returns true if it succeeds, otherwise the result cannot be trusted.
60 | // There will be *length digits inside the buffer followed by a null terminator.
61 | // If the function returns true and mode equals
62 | //   - FAST_DTOA_SHORTEST, then
63 | //     the parameter requested_digits is ignored.
64 | //     The result satisfies
65 | //         v == (double) (buffer * 10^(point - length)).
66 | //     The digits in the buffer are the shortest representation possible. E.g.
67 | //     if 0.099999999999 and 0.1 represent the same double then "1" is returned
68 | //     with point = 0.
69 | //     The last digit will be closest to the actual v. That is, even if several
70 | //     digits might correctly yield 'v' when read again, the buffer will contain
71 | //     the one closest to v.
72 | //   - FAST_DTOA_PRECISION, then
73 | //     the buffer contains requested_digits digits.
74 | //     the difference v - (buffer * 10^(point-length)) is closest to zero for
75 | //     all possible representations of requested_digits digits.
76 | //     If there are two values that are equally close, then FastDtoa returns
77 | //     false.
78 | // For both modes the buffer must be large enough to hold the result.
79 | bool FastDtoa(double d,
80 |               FastDtoaMode mode,
81 |               int requested_digits,
82 |               Vector<char> buffer,
83 |               int* length,
84 |               int* decimal_point);
85 | 
86 | }  // namespace double_conversion
87 | 
88 | #endif  // DOUBLE_CONVERSION_FAST_DTOA_H_
89 | 


--------------------------------------------------------------------------------
/util/double-conversion/fixed-dtoa.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2010 the V8 project authors. All rights reserved.
 2 | // Redistribution and use in source and binary forms, with or without
 3 | // modification, are permitted provided that the following conditions are
 4 | // met:
 5 | //
 6 | //     * Redistributions of source code must retain the above copyright
 7 | //       notice, this list of conditions and the following disclaimer.
 8 | //     * Redistributions in binary form must reproduce the above
 9 | //       copyright notice, this list of conditions and the following
10 | //       disclaimer in the documentation and/or other materials provided
11 | //       with the distribution.
12 | //     * Neither the name of Google Inc. nor the names of its
13 | //       contributors may be used to endorse or promote products derived
14 | //       from this software without specific prior written permission.
15 | //
16 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 
28 | #ifndef DOUBLE_CONVERSION_FIXED_DTOA_H_
29 | #define DOUBLE_CONVERSION_FIXED_DTOA_H_
30 | 
31 | #include "utils.h"
32 | 
33 | namespace double_conversion {
34 | 
35 | // Produces digits necessary to print a given number with
36 | // 'fractional_count' digits after the decimal point.
37 | // The buffer must be big enough to hold the result plus one terminating null
38 | // character.
39 | //
40 | // The produced digits might be too short in which case the caller has to fill
41 | // the gaps with '0's.
42 | // Example: FastFixedDtoa(0.001, 5, ...) is allowed to return buffer = "1", and
43 | // decimal_point = -2.
44 | // Halfway cases are rounded towards +/-Infinity (away from 0). The call
45 | // FastFixedDtoa(0.15, 2, ...) thus returns buffer = "2", decimal_point = 0.
46 | // The returned buffer may contain digits that would be truncated from the
47 | // shortest representation of the input.
48 | //
49 | // This method only works for some parameters: it returns false when it cannot
50 | // handle the input. The output is null-terminated when the function succeeds.
51 | bool FastFixedDtoa(double v, int fractional_count,
52 |                    Vector<char> buffer, int* length, int* decimal_point);
53 | 
54 | }  // namespace double_conversion
55 | 
56 | #endif  // DOUBLE_CONVERSION_FIXED_DTOA_H_
57 | 


--------------------------------------------------------------------------------
/util/double-conversion/strtod.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2010 the V8 project authors. All rights reserved.
 2 | // Redistribution and use in source and binary forms, with or without
 3 | // modification, are permitted provided that the following conditions are
 4 | // met:
 5 | //
 6 | //     * Redistributions of source code must retain the above copyright
 7 | //       notice, this list of conditions and the following disclaimer.
 8 | //     * Redistributions in binary form must reproduce the above
 9 | //       copyright notice, this list of conditions and the following
10 | //       disclaimer in the documentation and/or other materials provided
11 | //       with the distribution.
12 | //     * Neither the name of Google Inc. nor the names of its
13 | //       contributors may be used to endorse or promote products derived
14 | //       from this software without specific prior written permission.
15 | //
16 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 
28 | #ifndef DOUBLE_CONVERSION_STRTOD_H_
29 | #define DOUBLE_CONVERSION_STRTOD_H_
30 | 
31 | #include "utils.h"
32 | 
33 | namespace double_conversion {
34 | 
35 | // The buffer must only contain digits in the range [0-9]. It must not
36 | // contain a dot or a sign. It must not start with '0', and must not be empty.
37 | double Strtod(Vector<const char> buffer, int exponent);  // NOTE(review): presumably yields buffer * 10^exponent -- confirm in strtod.cc.
38 | 
39 | // The buffer must only contain digits in the range [0-9]. It must not
40 | // contain a dot or a sign. It must not start with '0', and must not be empty.
41 | float Strtof(Vector<const char> buffer, int exponent);  // Same parameters as Strtod, but returns a float.
42 | 
43 | }  // namespace double_conversion
44 | 
45 | #endif  // DOUBLE_CONVERSION_STRTOD_H_
46 | 


--------------------------------------------------------------------------------
/util/ersatz_progress.cc:
--------------------------------------------------------------------------------
 1 | #include "util/ersatz_progress.hh"
 2 | 
 3 | #include <algorithm>
 4 | #include <ostream>
 5 | #include <limits>
 6 | #include <string>
 7 | 
 8 | namespace util {
 9 | 
10 | namespace { const unsigned char kWidth = 100; }  // Number of '*' marks in a complete bar (see Milestone).
11 | // Scale line printed above the marks: twenty 5-character labels, i.e. kWidth columns, then a newline.
12 | const char kProgressBanner[] = "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n";
13 | // No-output progress: out_ is NULL and next_/complete_ are the uint64_t maximum.  stones_written_ is zeroed so that no member is left uninitialized.
14 | ErsatzProgress::ErsatzProgress() : current_(0), next_(std::numeric_limits<uint64_t>::max()), complete_(next_), stones_written_(0), out_(NULL) {}
15 | // If an output stream is still attached, calls Finished() (declared in the header) before the object goes away.
16 | ErsatzProgress::~ErsatzProgress() {
17 |   if (out_) Finished();
18 | }
19 | // Starts a bar for `complete` units on stream `to`: prints `message` (if non-empty) on its own line, then kProgressBanner.
20 | ErsatzProgress::ErsatzProgress(uint64_t complete, std::ostream *to, const std::string &message)
21 |   : current_(0), next_(complete / kWidth), complete_(complete), stones_written_(0), out_(to) {
22 |   if (!out_) {  // Null stream means no output: nothing is printed.
23 |     next_ = std::numeric_limits<uint64_t>::max();  // Push the next milestone threshold to the maximum.
24 |     return;
25 |   }
26 |   if (!message.empty()) *out_ << message << '\n';
27 |   *out_ << kProgressBanner;
28 | }
29 | // Writes the '*' marks that current_ has earned since the last call and moves next_ to the following threshold.
30 | void ErsatzProgress::Milestone() {
31 |   if (!out_) { current_ = 0; return; }  // No stream attached: just restart the counter.
32 |   if (!complete_) return;  // Guards the divisions by complete_ below.
33 |   unsigned char stone = std::min(static_cast<uint64_t>(kWidth), (current_ * kWidth) / complete_);  // Marks due, capped at kWidth.  NOTE(review): current_ * kWidth can wrap in uint64_t for very large current_.
34 |   // Emit only the marks that have not been written yet.
35 |   for (; stones_written_ < stone; ++stones_written_) {
36 |     (*out_) << '*';
37 |   }
38 |   if (stone == kWidth) {  // Bar is full: end the line and detach the stream.
39 |     (*out_) << std::endl;
40 |     next_ = std::numeric_limits<uint64_t>::max();
41 |     out_ = NULL;
42 |   } else {
43 |     next_ = std::max(next_, ((stone + 1) * complete_ + kWidth - 1) / kWidth);  // Ceiling of (stone + 1) * complete_ / kWidth; never lowers next_.
44 |   }
45 | }
46 | 
47 | } // namespace util
48 | 


--------------------------------------------------------------------------------
/util/ersatz_progress.hh:
--------------------------------------------------------------------------------
 1 | #ifndef UTIL_ERSATZ_PROGRESS_H
 2 | #define UTIL_ERSATZ_PROGRESS_H
 3 | 
 4 | #include <iostream>
 5 | #include <string>
 6 | #include <stdint.h>
 7 | 
 8 | // Ersatz version of boost::progress so core language model doesn't depend on
 9 | // boost.  Also adds option to print nothing.
10 | 
11 | namespace util {
12 | 
13 | extern const char kProgressBanner[];
14 | 
15 | class ErsatzProgress {
16 |   public:
17 |     // Silent variant: prints nothing.
18 |     ErsatzProgress();
19 | 
20 |     // Tracks progress toward `complete`, drawing on *to.  Passing null for `to` disables output, which is convenient when forwarding an ostream pointer supplied by another caller.
21 |     explicit ErsatzProgress(uint64_t complete, std::ostream *to = &std::cerr, const std::string &message = "");
22 | 
23 | #if __cplusplus >= 201103L
24 |     ErsatzProgress(ErsatzProgress &&from) noexcept : current_(from.current_), next_(from.next_), complete_(from.complete_), stones_written_(from.stones_written_), out_(from.out_) {
25 |       from.next_ = static_cast<uint64_t>(-1);  // Largest uint64_t: the threshold is parked out of reach...
26 |       from.out_ = nullptr;                     // ...and the source no longer refers to the stream.
27 |     }
28 | #endif
29 | 
30 |     ~ErsatzProgress();
31 | 
32 |     // Advance by one unit of work.
33 |     ErsatzProgress &operator++() {
34 |       ++current_;
35 |       if (current_ >= next_) Milestone();
36 |       return *this;
37 |     }
38 |     // Advance by `amount` units of work.
39 |     ErsatzProgress &operator+=(uint64_t amount) {
40 |       current_ += amount;
41 |       if (current_ >= next_) Milestone();
42 |       return *this;
43 |     }
44 |     // Jump to an absolute position.
45 |     void Set(uint64_t to) {
46 |       current_ = to;
47 |       if (current_ >= next_) Milestone();
48 |     }
49 |     void Finished() { Set(complete_); }  // Jump straight to the configured total.
50 |   private:
51 |     void Milestone();  // Called whenever current_ reaches next_.
52 | 
53 |     uint64_t current_, next_, complete_;
54 |     unsigned char stones_written_;
55 |     std::ostream *out_;
56 | 
57 |     // Copying is disabled: both members are private and only declared here.
58 |     ErsatzProgress(const ErsatzProgress &other);
59 |     ErsatzProgress &operator=(const ErsatzProgress &other);
60 | };
61 | 
62 | } // namespace util
63 | 
64 | #endif // UTIL_ERSATZ_PROGRESS_H
65 | 


--------------------------------------------------------------------------------
/util/exception.cc:
--------------------------------------------------------------------------------
  1 | #include "util/exception.hh"
  2 | 
  3 | #ifdef __GXX_RTTI
  4 | #include <typeinfo>
  5 | #endif
  6 | 
  7 | #include <cerrno>
  8 | #include <cstring>
  9 | 
 10 | #if defined(_WIN32) || defined(_WIN64)
 11 | #include <windows.h>
 12 | #include <io.h>
 13 | #endif
 14 | 
 15 | namespace util {
 16 | // Out-of-line constructor and destructor of the base exception; both have empty bodies and are declared non-throwing.
 17 | Exception::Exception() throw() {}
 18 | Exception::~Exception() throw() {}
 19 | 
 20 | void Exception::SetLocation(const char *file, unsigned int line, const char *func, const char *child_name, const char *condition) {
 21 |   /* Prefixes the message with "file:line in func threw Type because `cond'.".
 22 |    * The child class might have set some text already, but the location must
 23 |    * come first, so the old text is moved aside and re-appended at the end.
 24 |    * func and condition are skipped when NULL; a NULL child_name falls back to
 25 |    * the RTTI name of the object (or "an exception" when RTTI is unavailable). */
 26 |   std::string old_text;
 27 |   what_.swap(old_text);
 28 |   what_ << file << ':' << line;
 29 |   if (func) what_ << " in " << func << " threw ";
 30 |   if (child_name) {
 31 |     what_ << child_name;
 32 |   } else {
 33 | #ifdef __GXX_RTTI
 34 |     what_ << typeid(*this).name();  // Dereferenced on purpose: typeid(this) would name the pointer type instead of the exception.
 35 | #else
 36 |     what_ << "an exception";
 37 | #endif
 38 |   }
 39 |   if (condition) {
 40 |     what_ << " because `" << condition << '\'';
 41 |   }
 42 |   what_ << ".\n";
 43 |   what_ << old_text;
 44 | }
 45 | 
 46 | namespace {
 47 | // Two overloads, chosen by the return type of strerror_r at the call site in ErrnoException.
 48 | #ifdef __GNUC__
 49 | const char *HandleStrerror(int ret, const char *buf) __attribute__ ((unused));
 50 | const char *HandleStrerror(const char *ret, const char * /*buf*/) __attribute__ ((unused));
 51 | #endif
 52 | // At least one of these functions will not be called, hence the unused-function suppressions around them.
 53 | #ifdef __clang__
 54 | #pragma clang diagnostic push
 55 | #pragma clang diagnostic ignored "-Wunused-function"
 56 | #endif
 57 | // The XOPEN version: an int status is returned; zero means buf holds the message, anything else yields NULL.
 58 | const char *HandleStrerror(int ret, const char *buf) {
 59 |   if (!ret) return buf;
 60 |   return NULL;
 61 | }
 62 | 
 63 | // The GNU version: the returned pointer is the message and is passed through; buf is ignored.
 64 | const char *HandleStrerror(const char *ret, const char * /*buf*/) {
 65 |   return ret;
 66 | }
 67 | #ifdef __clang__
 68 | #pragma clang diagnostic pop
 69 | #endif
 70 | } // namespace
 71 | // Records errno at construction and, when a description can be obtained, starts the message with it.
 72 | ErrnoException::ErrnoException() throw() : errno_(errno) {
 73 |   char buf[200];
 74 |   buf[0] = '\0';
 75 | #if defined(sun) || defined(_WIN32) || defined(_WIN64)
 76 |   const char *description = strerror(errno);
 77 | #else
 78 |   const char *description = HandleStrerror(strerror_r(errno, buf, sizeof(buf)), buf);
 79 | #endif
 80 |   // HandleStrerror yields NULL when the int-returning strerror_r reports failure; nothing is added then.
 81 |   if (description != NULL) {
 82 |     (*this) << description << ' ';
 83 |   }
 84 | }
 85 | 
 86 | ErrnoException::~ErrnoException() throw() {}
 87 | // OverflowException: empty, non-throwing constructor and destructor.
 88 | OverflowException::OverflowException() throw() {}
 89 | OverflowException::~OverflowException() throw() {}
 90 | // Windows only: builds the message from GetLastError(), using FormatMessageA for the readable text.
 91 | #if defined(_WIN32) || defined(_WIN64)
 92 | WindowsException::WindowsException() throw() {
 93 |   unsigned int last_error = GetLastError();  // Saved first; the failure branch below calls GetLastError() again to report the formatting error.
 94 |   char error_msg[256] = "";
 95 |   if (!FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, last_error, LANG_NEUTRAL, error_msg, sizeof(error_msg), NULL)) {
 96 |     *this << "Windows error " << GetLastError() << " while formatting Windows error " << last_error << ". ";
 97 |   } else {
 98 |     *this << "Windows error " << last_error << ": " << error_msg;
 99 |   }
100 | }
101 | WindowsException::~WindowsException() throw() {}
102 | #endif
103 | 
104 | } // namespace util
105 | 


--------------------------------------------------------------------------------
/util/fake_ostream.hh:
--------------------------------------------------------------------------------
  1 | #ifndef UTIL_FAKE_OSTREAM_H
  2 | #define UTIL_FAKE_OSTREAM_H
  3 | 
  4 | #include "util/float_to_string.hh"
  5 | #include "util/integer_to_string.hh"
  6 | #include "util/string_piece.hh"
  7 | 
  8 | #include <cassert>
  9 | #include <limits>
 10 | 
 11 | #include <stdint.h>
 12 | 
 13 | namespace util {
 14 | 
 15 | /* Like std::ostream but without being incredibly slow.
 16 |  * Supports most of the built-in types except for long double.
 17 |  *
 18 |  * The FakeOStream class is intended to be inherited from.  The inheriting class
 19 |  * should provide:
 20 |  * public:
 21 |  *   Derived &flush();
 22 |  *   Derived &write(const void *data, std::size_t length);
 23 |  *
 24 |  * private: or protected:
 25 |  *   friend class FakeOStream;
 26 |  *   char *Ensure(std::size_t amount);
 27 |  *   void AdvanceTo(char *to);
 28 |  *
 29 |  * The Ensure function makes enough space for an in-place write and returns
 30 |  * where to write.  The AdvanceTo function happens after the write, saying how
 31 |  * much was actually written (it receives the pointer just past the new text).
 32 |  *
 33 |  * Precondition:
 34 |  * amount <= kToStringMaxBytes for in-place writes.
 35 |  */
 36 | template <class Derived> class FakeOStream {
 37 |   public:
 38 |     FakeOStream() {}
 39 | 
 40 |     // This also covers std::string and char*
 41 |     Derived &operator<<(StringPiece str) {
 42 |       return C().write(str.data(), str.size());
 43 |     }
 44 | 
 45 |     // Handle integers by size and signedness: Coerce maps a 2-, 4- or 8-byte integer type to the fixed-width type of the same size and signedness.
 46 |   private:
 47 |     template <class Arg> struct EnableIfKludge {
 48 |       typedef Derived type;
 49 |     };
 50 |     template <class From, unsigned Length = sizeof(From), bool Signed = std::numeric_limits<From>::is_signed, bool IsInteger = std::numeric_limits<From>::is_integer> struct Coerce {};
 51 |     // The primary template above has no To member, so the templated operator<< below only exists for types matched by a specialization (EnableIfKludge plays the role of enable_if).
 52 |     template <class From> struct Coerce<From, 2, false, true> { typedef uint16_t To; };
 53 |     template <class From> struct Coerce<From, 4, false, true> { typedef uint32_t To; };
 54 |     template <class From> struct Coerce<From, 8, false, true> { typedef uint64_t To; };
 55 | 
 56 |     template <class From> struct Coerce<From, 2, true, true> { typedef int16_t To; };
 57 |     template <class From> struct Coerce<From, 4, true, true> { typedef int32_t To; };
 58 |     template <class From> struct Coerce<From, 8, true, true> { typedef int64_t To; };
 59 |   public:
 60 |     template <class From> typename EnableIfKludge<typename Coerce<From>::To>::type &operator<<(const From value) {
 61 |       return CallToString(static_cast<typename Coerce<From>::To>(value));
 62 |     }
 63 | 
 64 |     // Character types that get copied as bytes instead of displayed as integers.
 65 |     Derived &operator<<(char val) { return put(val); }
 66 |     Derived &operator<<(signed char val) { return put(static_cast<char>(val)); }
 67 |     Derived &operator<<(unsigned char val) { return put(static_cast<char>(val)); }
 68 |     // bool is written as the single character '0' or '1'.
 69 |     Derived &operator<<(bool val) { return put(val + '0'); }
 70 |     // enums will fall back to int but are not caught by the template.
 71 |     Derived &operator<<(int val) { return CallToString(static_cast<typename Coerce<int>::To>(val)); }
 72 | 
 73 |     Derived &operator<<(float val) { return CallToString(val); }
 74 |     Derived &operator<<(double val) { return CallToString(val); }
 75 | 
 76 |     // This is here to catch all the other pointer types.
 77 |     Derived &operator<<(const void *value) { return CallToString(value); }
 78 |     // This is here because the above line also catches const char*.
 79 |     Derived &operator<<(const char *value) { return *this << StringPiece(value); }
 80 |     Derived &operator<<(char *value) { return *this << StringPiece(value); }
 81 |     // Appends one character through the Ensure/AdvanceTo protocol.
 82 |     Derived &put(char val) {
 83 |       char *c = C().Ensure(1);
 84 |       *c = val;
 85 |       C().AdvanceTo(++c);
 86 |       return C();
 87 |     }
 88 |     // Identity.  NOTE(review): presumably here so code written against std::ostream::widen compiles - confirm.
 89 |     char widen(char val) const { return val; }
 90 | 
 91 |   private:
 92 |     // References to derived class for convenience.
 93 |     Derived &C() {
 94 |       return *static_cast<Derived*>(this);
 95 |     }
 96 | 
 97 |     const Derived &C() const {
 98 |       return *static_cast<const Derived*>(this);
 99 |     }
100 | 
101 |     // This is separate to prevent an infinite loop if the compiler considers
102 |     // types the same (i.e. gcc std::size_t and uint64_t or uint32_t).
103 |     template <class T> Derived &CallToString(const T value) {
104 |       C().AdvanceTo(ToString(value, C().Ensure(ToStringBuf<T>::kBytes)));
105 |       return C();
106 |     }
107 | };
108 | 
109 | } // namespace
110 | 
111 | #endif // UTIL_FAKE_OSTREAM_H
112 | 


--------------------------------------------------------------------------------
/util/file_stream.hh:
--------------------------------------------------------------------------------
 1 | /* Like std::ofstream but without being incredibly slow.  Backed by a raw fd that it owns.
 2 |  * Supports most of the built-in types except for long double.
 3 |  */
 4 | #ifndef UTIL_FILE_STREAM_H
 5 | #define UTIL_FILE_STREAM_H
 6 | 
 7 | #include "util/buffered_stream.hh"
 8 | #include "util/file.hh"
 9 | 
10 | #include <stdint.h>
11 | 
12 | namespace util {
13 | // FileStream is BufferedStream instantiated with FileWriter; both come from the headers included above.
14 | typedef BufferedStream<FileWriter> FileStream;
15 | 
16 | } // namespace
17 | 
18 | #endif
19 | 


--------------------------------------------------------------------------------
/util/float_to_string.cc:
--------------------------------------------------------------------------------
 1 | #include "util/float_to_string.hh"
 2 | 
 3 | #include "util/double-conversion/double-conversion.h"
 4 | #include "util/double-conversion/utils.h"
 5 | 
 6 | namespace util {
 7 | namespace {  // File-local converter shared by both ToString overloads below.
 8 | const double_conversion::DoubleToStringConverter kConverter(double_conversion::DoubleToStringConverter::NO_FLAGS, "inf", "NaN", 'e', -6, 21, 6, 0);  // NOTE(review): "inf"/"NaN"/'e' look like the infinity, NaN and exponent symbols; confirm the meaning of -6, 21, 6, 0 in the double-conversion docs.
 9 | } // namespace
10 | // Formats a double at `to` via kConverter.ToShortest and returns the position just past the text.
11 | char *ToString(double value, char *to) {
12 |   double_conversion::StringBuilder out(to, ToStringBuf<double>::kBytes);
13 |   kConverter.ToShortest(value, &out);
14 |   return to + out.position();
15 | }
16 | // Formats a float at `to` via kConverter.ToShortestSingle and returns the position just past the text.
17 | char *ToString(float value, char *to) {
18 |   double_conversion::StringBuilder out(to, ToStringBuf<float>::kBytes);
19 |   kConverter.ToShortestSingle(value, &out);
20 |   return to + out.position();
21 | }
22 | 
23 | } // namespace util
24 | 


--------------------------------------------------------------------------------
/util/float_to_string.hh:
--------------------------------------------------------------------------------
 1 | #ifndef UTIL_FLOAT_TO_STRING_H
 2 | #define UTIL_FLOAT_TO_STRING_H
 3 | 
 4 | // Just for ToStringBuf
 5 | #include "util/integer_to_string.hh"
 6 | 
 7 | namespace util {
 8 | // ToStringBuf<T>::kBytes is the buffer size handed to the StringBuilder inside ToString(T, char*).
 9 | template <> struct ToStringBuf<double> {
10 |   // DoubleToStringConverter::kBase10MaximalLength + 1 for null paranoia.
11 |   static const unsigned kBytes = 19;
12 | };
13 | // NOTE(review): shortest-form output can also carry a sign, a '.', and an exponent; confirm 19 bytes covers the longest case.
14 | // Single wasn't documented in double conversion, so be conservative and
15 | // say the same as double.
16 | template <> struct ToStringBuf<float> {
17 |   static const unsigned kBytes = 19;
18 | };
19 | // Each overload formats value at `to` and returns a pointer just past the text it wrote.
20 | char *ToString(double value, char *to);
21 | char *ToString(float value, char *to);
22 | 
23 | } // namespace util
24 | 
25 | #endif // UTIL_FLOAT_TO_STRING_H
26 | 


--------------------------------------------------------------------------------
/util/have.hh:
--------------------------------------------------------------------------------
 1 | /* Optional packages.  You might want to integrate this with your build system e.g. config.h from ./configure. */
 2 | #ifndef UTIL_HAVE
 3 | #define UTIL_HAVE
 4 | 
 5 | #ifndef HAVE_BOOST
 6 | //#define HAVE_BOOST
 7 | #endif
 8 | 
 9 | #endif // UTIL_HAVE
10 | 


--------------------------------------------------------------------------------
/util/integer_to_string.hh:
--------------------------------------------------------------------------------
 1 | #ifndef UTIL_INTEGER_TO_STRING_H
 2 | #define UTIL_INTEGER_TO_STRING_H
 3 | #include <cstddef>
 4 | #include <stdint.h>
 5 | 
 6 | namespace util {
 7 | 
 8 | /* These functions convert integers to strings, writing at `to`, and return the end pointer
 9 |  * (one past the last character written).  ToStringBuf<T>::kBytes, declared below, says how much room to reserve. */
10 | char *ToString(uint32_t value, char *to);
11 | char *ToString(uint64_t value, char *to);
12 | 
13 | // Implemented as wrappers to above
14 | char *ToString(int32_t value, char *to);
15 | char *ToString(int64_t value, char *to);
16 | 
17 | // Calls the 32-bit versions for now.
18 | char *ToString(uint16_t value, char *to);
19 | char *ToString(int16_t value, char *to);
20 | // Pointer overload; its buffer size below allows two characters per byte plus two more.
21 | char *ToString(const void *value, char *to);
22 | 
23 | inline char *ToString(bool value, char *to) {
24 |   to[0] = '0' + value;  // '0' for false, '1' for true.
25 |   return to + 1;
26 | }
27 | 
28 | // How many bytes to reserve in the buffer for these strings:
29 | // g++ 4.9.1 doesn't work with this:
30 | // static const std::size_t kBytes = 5;
31 | // So use enum.
32 | template <class T> struct ToStringBuf;
33 | template <> struct ToStringBuf<bool> {
34 |   enum { kBytes = 1 };  // "0" or "1"
35 | };
36 | template <> struct ToStringBuf<uint16_t> {
37 |   enum { kBytes = 5 };  // "65535"
38 | };
39 | template <> struct ToStringBuf<int16_t> {
40 |   enum { kBytes = 6 };  // "-32768"
41 | };
42 | template <> struct ToStringBuf<uint32_t> {
43 |   enum { kBytes = 10 };  // "4294967295"
44 | };
45 | template <> struct ToStringBuf<int32_t> {
46 |   enum { kBytes = 11 };  // "-2147483648"
47 | };
48 | template <> struct ToStringBuf<uint64_t> {
49 |   enum { kBytes = 20 };  // "18446744073709551615"
50 | };
51 | template <> struct ToStringBuf<int64_t> {
52 |   // Not a typo.  2^63 has 19 digits, and the sign takes the twentieth byte.
53 |   enum { kBytes = 20 };
54 | };
55 | 
56 | template <> struct ToStringBuf<const void*> {
57 |   // Either 18 on 64-bit or 10 on 32-bit.
58 |   enum { kBytes = sizeof(const void*) * 2 + 2 };
59 | };
60 | 
61 | // Maximum over this and float.
62 | enum { kToStringMaxBytes = 20 };
63 | 
64 | } // namespace util
65 | 
66 | #endif // UTIL_INTEGER_TO_STRING_H
67 | 


--------------------------------------------------------------------------------
/util/integer_to_string_test.cc:
--------------------------------------------------------------------------------
 1 | #define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE
 2 | #include "util/integer_to_string.hh"
 3 | #include "util/string_piece.hh"
 4 | 
 5 | #define BOOST_TEST_MODULE IntegerToStringTest
 6 | #include <boost/test/unit_test.hpp>
 7 | #include <boost/lexical_cast.hpp>
 8 | 
 9 | #include <limits>
10 | 
11 | namespace util {
12 | namespace {
13 | // Formats value with ToString and compares against boost::lexical_cast; zero gets the looser "0x0"/"0" check.
14 | template <class T> void TestValue(const T value) {
15 |   char buf[ToStringBuf<T>::kBytes];
16 |   StringPiece result(buf, ToString(value, buf) - buf);
17 |   BOOST_REQUIRE_GE(static_cast<std::size_t>(ToStringBuf<T>::kBytes), result.size());
18 |   if (!value) {
19 |     // Platforms can do void * as 0x0 or 0.
20 |     BOOST_CHECK(result == "0x0" || result == "0");
21 |     return;
22 |   }
23 |   BOOST_CHECK_EQUAL(boost::lexical_cast<std::string>(value), result);
24 | }
25 | // Extremes of T plus the three values around zero.
26 | template <class T> void TestCorners() {
27 |   TestValue(std::numeric_limits<T>::min());
28 |   TestValue(std::numeric_limits<T>::max());
29 |   TestValue(T(0));
30 |   TestValue(T(-1));
31 |   TestValue(T(1));
32 | }
33 | // Runs the corner-value check for every integer width and for pointers.
34 | BOOST_AUTO_TEST_CASE(Corners) {
35 |   TestCorners<uint16_t>();
36 |   TestCorners<uint32_t>();
37 |   TestCorners<uint64_t>();
38 |   TestCorners<int16_t>();
39 |   TestCorners<int32_t>();
40 |   TestCorners<int64_t>();
41 |   TestCorners<const void*>();
42 | }
43 | // Runs TestValue over every representable value of T.
44 | template <class T> void TestAll() {
45 |   const T last = std::numeric_limits<T>::max();
46 |   // The loop stops before `last`, which is then tested on its own.
47 |   for (T i = std::numeric_limits<T>::min(); i < last; ++i) TestValue(i);
48 |   TestValue(last);
49 | }
50 | // Exhaustive sweep over both 16-bit types.
51 | BOOST_AUTO_TEST_CASE(Short) {
52 |   TestAll<uint16_t>();
53 |   TestAll<int16_t>();
54 | }
55 | // Checks each power of ten below max()/10 together with its two neighbours.
56 | template <class T> void Test10s() {
57 |   for (T power = 1; power < std::numeric_limits<T>::max() / 10; power *= 10) {
58 |     TestValue(power);
59 |     TestValue(power - 1);
60 |     TestValue(power + 1);
61 |   }
62 | }
63 | // Powers of ten and their neighbours for the 32- and 64-bit types.
64 | BOOST_AUTO_TEST_CASE(Tens) {
65 |   Test10s<uint64_t>();
66 |   Test10s<int64_t>();
67 |   Test10s<uint32_t>();
68 |   Test10s<int32_t>();
69 | }
70 | // Pointer formatting: powers of ten first, then two dense 256-value ranges starting at 0 and at 0xf00.
71 | BOOST_AUTO_TEST_CASE(Pointers) {
72 |   for (uintptr_t power = 1; power < std::numeric_limits<uintptr_t>::max() / 10; power *= 10) {
73 |     TestValue(reinterpret_cast<const void*>(power));
74 |   }
75 |   for (uintptr_t low = 0; low < 256; ++low) {
76 |     TestValue(reinterpret_cast<const void*>(low));
77 |     TestValue(reinterpret_cast<const void*>(low + 0xf00));
78 |   }
79 | }
80 | 
81 | }} // namespaces
82 | 


--------------------------------------------------------------------------------
/util/murmur_hash.cc:
--------------------------------------------------------------------------------
  1 | /* Downloaded from http://sites.google.com/site/murmurhash/ which says "All
  2 |  * code is released to the public domain. For business purposes, Murmurhash is
  3 |  * under the MIT license."
  4 |  * This is modified from the original:
  5 |  * ULL tag on 0xc6a4a7935bd1e995 so this will compile on 32-bit.
  6 |  * length changed to unsigned int.
  7 |  * placed in namespace util
  8 |  * add MurmurHashNative
  9 |  * default option = 0 for seed
 10 |  * ARM port from NICT
 11 |  */
 12 | 
 13 | #include "util/murmur_hash.hh"
 14 | #include <cstring>
 15 | 
 16 | namespace util {
 17 | 
 18 | //-----------------------------------------------------------------------------
 19 | // MurmurHash2, 64-bit versions, by Austin Appleby
 20 | 
 21 | // The same caveats as 32-bit MurmurHash2 apply here - beware of alignment
 22 | // and endian-ness issues if used across multiple platforms.
 23 | 
 24 | // 64-bit hash for 64-bit platforms.  Hashes the len bytes at key, starting from seed.
 25 | // Whole words are loaded in host byte order, so values differ between endiannesses.
 26 | uint64_t MurmurHash64A ( const void * key, std::size_t len, uint64_t seed )
 27 | {
 28 |   const uint64_t m = 0xc6a4a7935bd1e995ULL;
 29 |   const int r = 47;
 30 | 
 31 |   uint64_t h = seed ^ (len * m);
 32 | 
 33 |   // Load every 8-byte word with memcpy on all platforms, not only on ARM.
 34 |   // Dereferencing key through a uint64_t* is undefined behaviour when key is
 35 |   // not suitably aligned, and it breaks the aliasing rules.  A fixed-size
 36 |   // memcpy reads exactly the same bytes, so the hash values are unchanged
 37 |   // wherever the pointer-cast version used to work.
 38 |   const std::size_t ksize = sizeof(uint64_t);
 39 |   const unsigned char * data = (const unsigned char *)key;
 40 |   const unsigned char * end = data + (len / ksize) * ksize;
 41 | 
 42 |   while(data != end)
 43 |   {
 44 |     uint64_t k;
 45 |     std::memcpy(&k, data, ksize);
 46 |     data += ksize;
 47 | 
 48 |     // Scramble the word...
 49 |     k *= m;
 50 |     k ^= k >> r;
 51 |     k *= m;
 52 | 
 53 |     // ...then fold it into the running state.
 54 |     h ^= k;
 55 |     h *= m;
 56 |   }
 57 | 
 58 |   // Mix in the 1-7 bytes left after the last whole word.  Every case
 59 |   // deliberately falls through to the ones below it.
 60 |   switch(len & 7)
 61 |   {
 62 |   case 7: h ^= uint64_t(data[6]) << 48;  // fall through
 63 |   case 6: h ^= uint64_t(data[5]) << 40;  // fall through
 64 |   case 5: h ^= uint64_t(data[4]) << 32;  // fall through
 65 |   case 4: h ^= uint64_t(data[3]) << 24;  // fall through
 66 |   case 3: h ^= uint64_t(data[2]) << 16;  // fall through
 67 |   case 2: h ^= uint64_t(data[1]) << 8;   // fall through
 68 |   case 1: h ^= uint64_t(data[0]);
 69 |           h *= m;
 70 |   };
 71 | 
 72 |   // Final mixing of the state.  With len == 0 and seed == 0 nothing above
 73 |   // changes h, so the result is 0.
 74 |   h ^= h >> r;
 75 |   h *= m;
 76 |   h ^= h >> r;
 77 | 
 78 |   return h;
 79 | }
 80 | 
 81 | 
 82 | // 64-bit hash for 32-bit platforms: two 32-bit lanes (h1, h2) that are joined at the end.
 83 | // Not the same function as MurmurHash64A.  Words are read in host byte order.
 84 | uint64_t MurmurHash64B ( const void * key, std::size_t len, uint64_t seed )
 85 | {
 86 |   const unsigned int m = 0x5bd1e995;
 87 |   const int r = 24;
 88 | 
 89 |   // seed ^ len is narrowed to unsigned int here.
 90 |   unsigned int h1 = seed ^ len;
 91 |   unsigned int h2 = 0;
 92 | 
 93 |   // Load every word with memcpy on all platforms, not only on ARM.  Reading
 94 |   // key through an unsigned int* is undefined behaviour when key is not
 95 |   // suitably aligned, and it breaks the aliasing rules.  memcpy reads the
 96 |   // same bytes, so the hash values are unchanged.
 97 |   const std::size_t ksize = sizeof(unsigned int);
 98 |   const unsigned char * data = (const unsigned char *)key;
 99 | 
100 |   unsigned int k1, k2;
101 |   // Main loop: one word for each lane per iteration.
102 |   while(len >= 8)
103 |   {
104 |     std::memcpy(&k1, data, ksize);
105 |     data += ksize;
106 |     std::memcpy(&k2, data, ksize);
107 |     data += ksize;
108 | 
109 |     k1 *= m; k1 ^= k1 >> r; k1 *= m;
110 |     h1 *= m; h1 ^= k1;
111 |     len -= 4;
112 | 
113 |     k2 *= m; k2 ^= k2 >> r; k2 *= m;
114 |     h2 *= m; h2 ^= k2;
115 |     len -= 4;
116 |   }
117 | 
118 |   // A remaining whole word goes to the first lane.
119 |   if(len >= 4)
120 |   {
121 |     std::memcpy(&k1, data, ksize);
122 |     data += ksize;
123 | 
124 |     k1 *= m; k1 ^= k1 >> r; k1 *= m;
125 |     h1 *= m; h1 ^= k1;
126 |     len -= 4;
127 |   }
128 | 
129 |   // Up to three trailing bytes go to the second lane.  Every case
130 |   // deliberately falls through to the ones below it.
131 |   switch(len)
132 |   {
133 |   case 3: h2 ^= data[2] << 16;  // fall through
134 |   case 2: h2 ^= data[1] << 8;   // fall through
135 |   case 1: h2 ^= data[0];
136 |       h2 *= m;
137 |   };
138 | 
139 |   // Cross-mix the two lanes.
140 |   h1 ^= h2 >> 18; h1 *= m;
141 |   h2 ^= h1 >> 22; h2 *= m;
142 |   h1 ^= h2 >> 17; h1 *= m;
143 |   h2 ^= h1 >> 19; h2 *= m;
144 | 
145 |   // Join the lanes: h1 becomes the high 32 bits of the result and h2 the
146 |   // low 32 bits.
147 |   uint64_t h = h1;
148 | 
149 |   h = (h << 32) | h2;
150 | 
151 |   return h;
152 | }
153 | 
154 | // Trick to test for 64-bit architecture at compile time: the template argument is sizeof(void*), and only the value 4 is specialized.
155 | namespace {
156 | #ifdef __clang__
157 | #pragma clang diagnostic push
158 | #pragma clang diagnostic ignored "-Wunused-function"
159 | #endif
160 | template <unsigned L> inline uint64_t MurmurHashNativeBackend(const void * key, std::size_t len, uint64_t seed) {  // Every pointer size other than 4.
161 |   return MurmurHash64A(key, len, seed);
162 | }
163 | template <> inline uint64_t MurmurHashNativeBackend<4>(const void * key, std::size_t len, uint64_t seed) {  // 4-byte pointers.
164 |   return MurmurHash64B(key, len, seed);
165 | }
166 | #ifdef __clang__
167 | #pragma clang diagnostic pop
168 | #endif
169 | } // namespace
170 | // Dispatches on sizeof(void*): MurmurHash64B when pointers are 4 bytes, MurmurHash64A otherwise.
171 | uint64_t MurmurHashNative(const void * key, std::size_t len, uint64_t seed) {
172 |   return MurmurHashNativeBackend<sizeof(void*)>(key, len, seed);
173 | }
174 | 
175 | } // namespace util
176 | 


--------------------------------------------------------------------------------
/util/murmur_hash.hh:
--------------------------------------------------------------------------------
 1 | #ifndef UTIL_MURMUR_HASH_H
 2 | #define UTIL_MURMUR_HASH_H
 3 | #include <cstddef>
 4 | #include <stdint.h>
 5 | 
 6 | namespace util {
 7 | // All three hash the len bytes starting at key; seed defaults to 0.
 8 | // 64-bit machine version
 9 | uint64_t MurmurHash64A(const void * key, std::size_t len, uint64_t seed = 0);
10 | // 32-bit machine version (not the same function as above)
11 | uint64_t MurmurHash64B(const void * key, std::size_t len, uint64_t seed = 0);
12 | // Use the version for this arch.  Because the values differ across
13 | // architectures, really only use it for in-memory structures.
14 | uint64_t MurmurHashNative(const void * key, std::size_t len, uint64_t seed = 0);
15 | 
16 | } // namespace util
17 | 
18 | #endif // UTIL_MURMUR_HASH_H
19 | 


--------------------------------------------------------------------------------
/util/mutable_vocab.cc:
--------------------------------------------------------------------------------
 1 | #include "util/mutable_vocab.hh"
 2 | 
 3 | #include "util/murmur_hash.hh"
 4 | 
 5 | namespace util {
 6 | // Stores "<unk>" as the first string, so it sits at index 0, the value of kUNK.
 7 | MutableVocab::MutableVocab() {
 8 |   strings_.push_back(StringPiece("<unk>"));
 9 | }
10 | // Looks str up by its MurmurHashNative value; strings that were never inserted map to kUNK.
11 | MutableVocab::ID MutableVocab::Find(const StringPiece &str) const {
12 |   const uint64_t hashed = util::MurmurHashNative(str.data(), str.size());
13 |   Map::ConstIterator found;
14 |   if (!map_.Find(hashed, found)) {
15 |     return kUNK;
16 |   }
17 |   return found->id;
18 | }
19 | // Returns the id of str, assigning the next free id and keeping a pool-backed copy of the bytes the first time str is seen.
20 | uint32_t MutableVocab::FindOrInsert(const StringPiece &str) {
21 |   MutableVocabInternal entry;
22 |   entry.key = util::MurmurHashNative(str.data(), str.size());
23 |   Map::MutableIterator it;
24 |   if (map_.FindOrInsert(entry, it)) {
25 |     return it->id;
26 |   }
27 |   it->id = strings_.size();
28 |   // Copy the bytes into the pool.  For an empty string a fresh pool returns NULL, and memcpy must not be handed NULL even for 0 bytes.
29 |   char *copied = static_cast<char*>(piece_backing_.Allocate(str.size()));
30 |   if (str.size()) memcpy(copied, str.data(), str.size());
31 |   strings_.push_back(StringPiece(copied, str.size()));
32 |   return it->id;
33 | }
34 | 
35 | } // namespace util
36 | 


--------------------------------------------------------------------------------
/util/mutable_vocab.hh:
--------------------------------------------------------------------------------
 1 | #ifndef UTIL_MUTABLE_VOCAB__
 2 | #define UTIL_MUTABLE_VOCAB__
 3 | 
 4 | /* A vocabulary mapping class that's mutable at runtime.  The kenlm code has
 5 |  * a specialized immutable vocabulary.
 6 |  */
 7 | 
 8 | #include "util/pool.hh"
 9 | #include "util/probing_hash_table.hh"
10 | #include "util/string_piece.hh"
11 | 
12 | #include <stdint.h>
13 | 
14 | namespace util {
15 | 
16 | #pragma pack(push)
17 | #pragma pack(4)
18 | struct MutableVocabInternal {  // Hash-table entry: a 64-bit key plus the 32-bit id assigned to it.
19 |   typedef uint64_t Key;
20 |   uint64_t GetKey() const { return key; }
21 |   void SetKey(uint64_t to) { key = to; }
22 | 
23 |   uint64_t key;  // Filled with a MurmurHashNative value by MutableVocab.
24 |   uint32_t id;   // NOTE(review): pack(4) presumably keeps the entry at 12 bytes with no tail padding - confirm.
25 | };
26 | #pragma pack(pop)
27 |  
28 | class MutableVocab {
29 |   public:
30 |     typedef uint32_t ID;
31 | 
32 |     static const ID kUNK = 0;  // Id of "<unk>"; also what Find returns for unknown strings.
33 | 
34 |     MutableVocab();
35 |     // Returns the id of str, or kUNK if it was never inserted.  Declared with ID (the same type as uint32_t) to match FindOrInsert and String.
36 |     ID Find(const StringPiece &str) const;
37 |     // Returns the existing id of str, or assigns the next one.
38 |     ID FindOrInsert(const StringPiece &str);
39 |     // No bounds check: id must be below Size().
40 |     StringPiece String(ID id) const {
41 |       return strings_[id];
42 |     }
43 | 
44 |     // Includes kUNK.
45 |     std::size_t Size() const { return strings_.size(); }
46 |     
47 |   private:
48 |     util::Pool piece_backing_;  // Backing memory for the copied string bytes.
49 | 
50 |     typedef util::AutoProbing<MutableVocabInternal, util::IdentityHash> Map;
51 |     Map map_;  // Hash of the string -> id.
52 | 
53 |     std::vector<StringPiece> strings_;  // id -> string.
54 | };
55 | 
56 | } // namespace util
57 | #endif // UTIL_MUTABLE_VOCAB__
58 | 


--------------------------------------------------------------------------------
/util/mutable_vocab_test.cc:
--------------------------------------------------------------------------------
 1 | #include "util/mutable_vocab.hh"
 2 | 
 3 | #define BOOST_TEST_MODULE MutableVocabTest
 4 | #include <boost/test/unit_test.hpp>
 5 | 
 6 | namespace util {
 7 | namespace {
 8 | // The first inserted word is expected to get id 1, since index 0 already holds "<unk>".
 9 | BOOST_AUTO_TEST_CASE(small) {
10 |   MutableVocab vocab;
11 |   BOOST_CHECK_EQUAL(1, vocab.FindOrInsert("Foo"));
12 |   BOOST_CHECK_EQUAL(2, vocab.Size());
13 |   BOOST_CHECK_EQUAL(1, vocab.Find("Foo"));
14 |   BOOST_CHECK_EQUAL("Foo", vocab.String(1));
15 | }
16 | 
17 | } // namespace
18 | } // namespace util
19 | 


--------------------------------------------------------------------------------
/util/object_pool.hh:
--------------------------------------------------------------------------------
 1 | #ifndef UTIL_OBJECT_POOL_H
 2 | #define UTIL_OBJECT_POOL_H
 3 | 
 4 | #include "util/fixed_array.hh"
 5 | 
 6 | #include <vector>
 7 | 
 8 | #include <stdint.h>
 9 | 
10 | namespace util {
11 | 
12 | template <class T> class ObjectPool {
13 |   public:
14 |     ObjectPool() {}
15 |     // Builds a T from the given arguments in the newest slab, first adding a slab twice the size of the last one when that slab is full.
16 |     template <typename... Construct> T *Allocate(Construct... construct) {
17 |       const std::size_t slabs = free_list_.size();
18 |       // The newest slab is full once its end() sits Capacity(slabs) elements past its begin().
19 |       if (slabs == 0 || free_list_.back().begin() + Capacity(slabs) == free_list_.back().end()) {
20 |         free_list_.emplace_back(Capacity(slabs + 1));
21 |       }
22 |       util::FixedArray<T> &newest = free_list_.back();
23 |       newest.push_back(construct...);
24 |       return &newest.back();
25 |     }
26 |     // Drops every slab at once.
27 |     void FreeAll() { free_list_.clear(); }
28 | 
29 |   private:
30 |     // The slab created when `index - 1` slabs exist has room for 2^index objects.
31 |     static std::size_t Capacity(std::size_t index) { return 1ULL << index; }
32 | 
33 |     // Slabs in creation order; only the last one receives new objects.
34 |     std::vector<util::FixedArray<T> > free_list_;
35 | };
36 | 
37 | } // namespace util
38 | 
39 | #endif // UTIL_OBJECT_POOL_H
40 | 


--------------------------------------------------------------------------------
/util/pcqueue_test.cc:
--------------------------------------------------------------------------------
 1 | #include "util/pcqueue.hh"
 2 | 
 3 | #define BOOST_TEST_MODULE PCQueueTest
 4 | #include <boost/test/unit_test.hpp>
 5 | 
 6 | #include <thread>
 7 | 
 8 | namespace util {
 9 | namespace {
10 | // One thread produces 0..9 into a queue constructed with size 10, then consumes them and expects the same order.
11 | BOOST_AUTO_TEST_CASE(SingleThread) {
12 |   PCQueue<int> queue(10);
13 |   for (int i = 0; i < 10; ++i) {
14 |     queue.Produce(i);
15 |   }
16 |   for (int i = 0; i < 10; ++i) {
17 |     BOOST_CHECK_EQUAL(i, queue.Consume());
18 |   }
19 | }
20 | // One writer thread produces 0..99 while this thread consumes; 100 items pass through a queue constructed with size 15.
21 | BOOST_AUTO_TEST_CASE(SingleInSingleOut) {
22 |   PCQueue<int> queue(15);
23 |   std::thread writer([&queue]() {
24 |       for (int i = 0; i < 100; ++i) {
25 |         queue.Produce(i);
26 |       }
27 |   });
28 |   for (int i = 0; i < 100; ++i) {
29 |     BOOST_CHECK_EQUAL(i, queue.Consume());  // Values must arrive in the order the single writer produced them.
30 |   }
31 |   writer.join();
32 | }
33 | // NOTE(review): this is a plain function, not a BOOST_AUTO_TEST_CASE, and nothing visible here calls it - confirm whether it was meant to be registered.
34 | void MultipleWriters() {
35 |   const unsigned kCount = 2000;
36 |   const unsigned kNumThreads = 4;
37 |   PCQueue<unsigned> queue(13);
38 |   auto writer = [&queue, kCount]() {  // Each writer produces 0..kCount-1 in increasing order.
39 |     for (unsigned i = 0; i < kCount; ++i) {
40 |       queue.Produce(i);
41 |     }
42 |   };
43 |   std::vector<std::thread> threads;
44 |   for (unsigned i = 0; i < kNumThreads; ++i) {
45 |     threads.emplace_back(writer);
46 |   }
47 |   unsigned seen[kCount] = {0};  // seen[v] counts how many times value v has been consumed so far.
48 |   for (unsigned i = 0; i < kCount * kNumThreads; ++i) {
49 |     unsigned got = queue.Consume();
50 |     BOOST_CHECK_LT(got, kCount);
51 |     seen[got]++;
52 |     // Since each thread generates in order, counts should be monotonically non-increasing.
53 |     BOOST_CHECK(!got || seen[got] <= seen[got - 1]);
54 |   }
55 |   for (unsigned i = 0; i < kCount; ++i) {
56 |     BOOST_CHECK_EQUAL(seen[i], kNumThreads);  // Every value must have arrived once per writer.
57 |   }
58 |   for (std::thread &t : threads) {
59 |     t.join();
60 |   }
61 | }
62 | 
63 | }
64 | } // namespace util
65 | 


--------------------------------------------------------------------------------
/util/pool.cc:
--------------------------------------------------------------------------------
 1 | #include "util/pool.hh"
 2 | 
 3 | #include "util/scoped.hh"
 4 | 
 5 | #include <cstdlib>
 6 | 
 7 | #include <algorithm>
 8 | 
 9 | namespace util {
10 | 
11 | Pool::Pool()
12 |   : current_(NULL),  // No page yet: the first non-empty Allocate() call goes through More().
13 |     current_end_(NULL) {
14 | }
15 | 
16 | Pool::~Pool() {
17 |   FreeAll();  // Frees every page recorded in free_list_.
18 | }
19 | 
20 | void Pool::FreeAll() {
21 |   for (std::size_t page = 0; page < free_list_.size(); ++page) {
22 |     free(free_list_[page]);  // Every entry was obtained from MallocOrThrow in More().
23 |   }
24 |   free_list_.clear();
25 |   current_end_ = NULL;  // With both cursors NULL, the next non-empty Allocate() requests a fresh page.
26 |   current_ = NULL;
27 | }
28 | 
29 | void *Pool::More(std::size_t size) {
30 |   const std::size_t amount = std::max(static_cast<std::size_t>(32) << free_list_.size(), size);  // Minimum doubles per page.
31 |   free_list_.reserve(free_list_.size() + 1);  // Grow the list first: a throwing push_back would leak the new page.
32 |   free_list_.push_back(MallocOrThrow(amount));
33 |   current_ = static_cast<uint8_t*>(free_list_.back()) + size;  // The request itself sits at the front of the page.
34 |   current_end_ = static_cast<uint8_t*>(free_list_.back()) + amount;
35 |   return free_list_.back();
36 | }
37 | 
38 | } // namespace util
39 | 


--------------------------------------------------------------------------------
/util/pool.hh:
--------------------------------------------------------------------------------
  1 | #ifndef UTIL_POOL_H
  2 | #define UTIL_POOL_H
  3 | 
  4 | #include <cassert>
  5 | #include <cstring>
  6 | #include <vector>
  7 | 
  8 | #include <stdint.h>
  9 | 
 10 | namespace util {
 11 | 
 12 | /* Very simple pool.  It can only allocate memory, and all of the memory it
 13 |  * allocates is freed at the same time.  Blocks are packed back to back.
 14 |  */
 15 | class Pool {
 16 |   public:
 17 |     Pool();
 18 | 
 19 |     ~Pool();
 20 |     // Return size bytes, taken from the current page or from a fresh page when it does not fit.
 21 |     void *Allocate(std::size_t size) {
 22 |       void *ret = current_;
 23 |       // Compare byte counts: forming current_ + size first could overflow or point past the page.
 24 |       const bool fits = size <= static_cast<std::size_t>(current_end_ - current_);
 25 |       if (fits) current_ += size;
 26 |       else ret = More(size);
 27 | #ifdef DEBUG
 28 |       base_check_ = ret;
 29 | #endif
 30 |       return ret;
 31 |     }
 32 | 
 33 |     /** Extend (or contract) the most recent allocation.
 34 |      * @param base The base pointer of the allocation. This must have been
 35 |      *   returned by the MOST RECENT call to Allocate or Continue.
 36 |      * @param additional Change in the size.
 37 |      *
 38 |      * In most cases, more memory from the same page is used, in which case
 39 |      * base is unchanged and the function returns false.
 40 |      * If the page runs out, a new page is created and the memory (from base)
 41 |      * is copied.  The function returns true.
 42 |      *
 43 |      * @return Whether the base had to be changed due to allocating a page.
 44 |      */
 45 |     bool Continue(void *&base, std::ptrdiff_t additional) {
 46 | #ifdef DEBUG
 47 |       assert(base == base_check_);
 48 | #endif
 49 |       if (additional > current_end_ - current_) {  // Checked before moving current_, so it never leaves the page.
 50 |         std::size_t new_total = (current_ - static_cast<uint8_t*>(base)) + additional;
 51 |         void *new_base = More(new_total);
 52 |         std::memcpy(new_base, base, new_total - additional);
 53 |         base = new_base;
 54 | #ifdef DEBUG
 55 |         base_check_ = base;
 56 | #endif
 57 |         return true;
 58 |       }
 59 |       current_ += additional;
 60 |       return false;
 61 |     }
 62 |     // Free every page; pointers handed out earlier become invalid.
 63 |     void FreeAll();
 64 | 
 65 |   private:
 66 |     void *More(std::size_t size);
 67 |     // Every page allocated so far, kept so FreeAll can release them.
 68 |     std::vector<void *> free_list_;
 69 |     // Next free byte and one-past-the-end of the current page (both NULL before the first page).
 70 |     uint8_t *current_, *current_end_;
 71 | 
 72 | #ifdef DEBUG
 73 |     // For debugging, check that Continue came from the most recent call.
 74 |     void *base_check_;
 75 | #endif // DEBUG
 76 | 
 77 |     // no copying
 78 |     Pool(const Pool &);
 79 |     Pool &operator=(const Pool &);
 80 | };
 81 | 
 82 | /**
 83 |  * Pool designed to allow limited freeing.
 84 |  * Keeps a linked list of free elements in the free spaces.
 85 |  * Will not reduce in size: freed elements are only recycled by Allocate.
 86 |  */
 87 | class FreePool {
 88 |   public:
 89 |     explicit FreePool(std::size_t element_size)
 90 |       : free_list_(NULL), element_size_(element_size) {}
 91 |     // Return one element: a previously freed slot when available, otherwise new memory from the backing pool.
 92 |     void *Allocate() {
 93 |       if (free_list_) {
 94 |         void *ret = free_list_;
 95 |         free_list_ = *reinterpret_cast<void**>(free_list_);
 96 |         return ret;
 97 |       }
 98 |       // Free() stores a pointer inside the slot, so carve out at least sizeof(void*) bytes per element.
 99 |       return backing_.Allocate(element_size_ < sizeof(void*) ? sizeof(void*) : element_size_);
100 |     }
101 |     // Push ptr onto the free list; its first bytes are overwritten with the link to the next free slot.
102 |     void Free(void *ptr) {
103 |       *reinterpret_cast<void**>(ptr) = free_list_;
104 |       free_list_ = ptr;
105 |     }
106 |     // The element size given at construction (slots may be larger internally).
107 |     std::size_t ElementSize() const { return element_size_; }
108 | 
109 |   private:
110 |     void *free_list_;
111 | 
112 |     Pool backing_;
113 | 
114 |     const std::size_t element_size_;
115 | };
116 | 
117 | } // namespace util
118 | 
119 | #endif // UTIL_POOL_H
120 | 


--------------------------------------------------------------------------------
/util/probing_hash_table_test.cc:
--------------------------------------------------------------------------------
  1 | #include "util/probing_hash_table.hh"
  2 | 
  3 | #include "util/murmur_hash.hh"
  4 | #include "util/scoped.hh"
  5 | 
  6 | #define BOOST_TEST_MODULE ProbingHashTableTest
  7 | #include <boost/test/unit_test.hpp>
  8 | #include <boost/scoped_array.hpp>
  9 | #include <boost/functional/hash.hpp>
 10 | #include <cstdio>
 11 | #include <cstdlib>
 12 | #include <cstring>
 13 | #include <stdint.h>
 14 | 
 15 | namespace util {
 16 | namespace {
 17 | 
 18 | struct Entry {  // Table entry: one-byte key plus a 64-bit value.
 19 |   typedef unsigned char Key;
 20 |   Key key;
 21 | 
 22 |   Key GetKey() const {
 23 |     return key;
 24 |   }
 25 | 
 26 |   void SetKey(Key to) {
 27 |     key = to;
 28 |   }
 29 | 
 30 |   uint64_t GetValue() const {
 31 |     return value;
 32 |   }
 33 | 
 34 |   uint64_t value;
 35 | };
 36 | 
 37 | typedef ProbingHashTable<Entry, boost::hash<unsigned char> > Table;
 38 | 
 39 | BOOST_AUTO_TEST_CASE(simple) {
 40 |   // The table works on caller-provided memory, which this test hands over zero-filled.
 41 |   const size_t bytes = Table::Size(10, 1.2);
 42 |   boost::scoped_array<char> storage(new char[bytes]());  // "()" value-initializes, i.e. zeroes, the buffer.
 43 | 
 44 |   Table table(storage.get(), bytes);
 45 |   const Entry *found = NULL;
 46 |   BOOST_CHECK(!table.Find(2, found));
 47 |   Entry fresh;
 48 |   fresh.key = 3;
 49 |   fresh.value = 328920;
 50 |   table.Insert(fresh);
 51 |   BOOST_REQUIRE(table.Find(3, found));
 52 |   BOOST_CHECK_EQUAL(3, found->GetKey());
 53 |   BOOST_CHECK_EQUAL(static_cast<uint64_t>(328920), found->GetValue());
 54 |   BOOST_CHECK(!table.Find(2, found));
 55 | }
 56 | 
 57 | struct Entry64 {
 58 |   typedef uint64_t Key;
 59 |   Key key;
 60 | 
 61 |   Entry64() {}  // key is left uninitialized.
 62 | 
 63 |   // Build an entry holding the given key.
 64 |   explicit Entry64(uint64_t key_in) : key(key_in) {}
 65 | 
 66 |   Key GetKey() const { return key; }
 67 | 
 68 |   void SetKey(Key to) { key = to; }
 69 | };
 70 | 
 71 | struct MurmurHashEntry64 {  // Hash functor: MurmurHash64A over the 8 bytes of the key.
 72 |   std::size_t operator()(uint64_t value) const {
 73 |     return util::MurmurHash64A(&value, sizeof(value));
 74 |   }
 75 | };
 76 | 
 77 | typedef ProbingHashTable<Entry64, MurmurHashEntry64> Table64;
 78 | 
 79 | BOOST_AUTO_TEST_CASE(Double) {
 80 |   // For several starting sizes: fill, double twice, and check consistency along the way.
 81 |   for (std::size_t starting = 19; starting != 30; ++starting) {
 82 |     const size_t bytes = Table64::Size(starting, 1.2);
 83 |     scoped_malloc backing(MallocOrThrow(bytes));
 84 |     Table64 table(backing.get(), bytes, std::numeric_limits<uint64_t>::max());
 85 |     table.Clear();
 86 |     for (uint64_t key = 0; key != 19; ++key) table.Insert(Entry64(key));
 87 |     table.CheckConsistency();
 88 |     backing.call_realloc(table.DoubleTo());  // realloc may move the block; Double() is given the current address.
 89 |     table.Double(backing.get());
 90 |     table.CheckConsistency();
 91 |     // Add a second batch of keys to the enlarged table before doubling again.
 92 |     for (uint64_t key = 20; key != 40; ++key) {
 93 |       table.Insert(Entry64(key));
 94 |     }
 95 |     backing.call_realloc(table.DoubleTo());
 96 |     table.Double(backing.get());
 97 |     table.CheckConsistency();
 98 |   }
 99 | }
100 | 
101 | } // namespace
102 | } // namespace util
103 | 


--------------------------------------------------------------------------------
/util/scoped.cc:
--------------------------------------------------------------------------------
 1 | #include "util/scoped.hh"
 2 | 
 3 | #include <cstdlib>
 4 | #if !defined(_WIN32) && !defined(_WIN64)
 5 | #include <sys/mman.h>
 6 | #endif
 7 | 
 8 | namespace util {
 9 | 
10 | // TODO: if we're really under memory pressure, don't allocate memory to
11 | // display the error.
12 | MallocException::MallocException(std::size_t requested) throw() {
13 |   *this << "for " << requested << " bytes ";  // Streams the request size into the exception (presumably its message).
14 | }
15 | // Nothing of its own to release.
16 | MallocException::~MallocException() throw() {}
17 | 
18 | namespace {
19 | void *InspectAddr(void *addr, std::size_t requested, const char *func_name) {  // Passes addr through after a failure check.
20 |   UTIL_THROW_IF_ARG(!addr && requested, MallocException, (requested), "in " << func_name);  // NULL is let through when requested == 0.
21 |   return addr;
22 | }
23 | } // namespace
24 | 
25 | void *MallocOrThrow(std::size_t requested) {
26 |   return InspectAddr(std::malloc(requested), requested, "malloc");  // InspectAddr reports a NULL result for a non-zero request.
27 | }
28 | 
29 | void *CallocOrThrow(std::size_t requested) {
30 |   return InspectAddr(std::calloc(requested, 1), requested, "calloc");  // requested elements of one byte each, zero-filled.
31 | }
32 | 
33 | void scoped_malloc::call_realloc(std::size_t requested) {
34 |   p_ = InspectAddr(std::realloc(p_, requested), requested, "realloc");  // p_ is only overwritten if InspectAddr returns.
35 | }
36 | 
37 | void AdviseHugePages(const void *addr, std::size_t size) {
38 | #ifdef MADV_HUGEPAGE
39 |   madvise(const_cast<void*>(addr), size, MADV_HUGEPAGE);  // Best-effort hint: the return value is ignored.
40 | #endif
41 | }
42 | 
43 | } // namespace util
44 | 


--------------------------------------------------------------------------------
/util/scoped.hh:
--------------------------------------------------------------------------------
  1 | #ifndef UTIL_SCOPED_H
  2 | #define UTIL_SCOPED_H
  3 | /* Other scoped objects in the style of scoped_ptr. */
  4 | 
  5 | #include "util/exception.hh"
  6 | #include <cstddef>
  7 | #include <cstdlib>
  8 | 
  9 | namespace util {
 10 | 
 11 | class MallocException : public ErrnoException {  // Exception type used by MallocOrThrow, CallocOrThrow and call_realloc.
 12 |   public:
 13 |     explicit MallocException(std::size_t requested) throw();  // requested: the number of bytes that was asked for.
 14 |     ~MallocException() throw();
 15 | };
 16 | 
 17 | void *MallocOrThrow(std::size_t requested);
 18 | void *CallocOrThrow(std::size_t requested);
 19 | 
 20 | /* Unfortunately, defining the operator* for void * makes the compiler complain.
 21 |  * So scoped is specialized to void.  This base holds what both versions share:
 22 |  * a pointer that is passed to Closer::Close on destruction or replacement.
 23 |  */
 24 | template <class T, class Closer> class scoped_base {
 25 |   public:
 26 |     explicit scoped_base(T *p = NULL) : p_(p) {}
 27 | 
 28 |     ~scoped_base() { Closer::Close(p_); }  // Called even when p_ is NULL.
 29 | 
 30 | #if __cplusplus >= 201103L
 31 |     // Moving transfers the pointer and leaves the source holding nullptr.
 32 |     scoped_base(scoped_base &&other) noexcept : p_(other.p_) {
 33 |       other.p_ = nullptr;
 34 |     }
 35 |     
 36 |     scoped_base &operator=(scoped_base &&other) noexcept {
 37 |       if (this == &other) return *this;  // Self-move: keep the pointer untouched.
 38 |       Closer::Close(p_);
 39 |       p_ = other.p_;
 40 |       other.p_ = nullptr;
 41 |       return *this;
 42 |     }
 43 | #endif
 44 |     // Take over p; the pointer held before is closed.
 45 |     void reset(T *p = NULL) {
 46 |       scoped_base previous(p_);  // Its destructor closes the old pointer at scope exit, after p is stored.
 47 |       p_ = p;
 48 |     }
 49 | 
 50 |     T *get() { return p_; }
 51 |     const T *get() const { return p_; }
 52 | 
 53 |     T *operator->() { return p_; }
 54 |     const T *operator->() const { return p_; }
 55 |     // Give up the pointer without closing it.
 56 |     T *release() {
 57 |       T *const held = p_;
 58 |       p_ = NULL;
 59 |       return held;
 60 |     }
 61 | 
 62 |   protected:
 63 |     T *p_;
 64 | 
 65 | #if __cplusplus >= 201103L
 66 |   public:
 67 |     scoped_base(const scoped_base &) = delete;
 68 |     scoped_base &operator=(const scoped_base &) = delete;
 69 | #else
 70 |   private:
 71 |     scoped_base(const scoped_base &);
 72 |     scoped_base &operator=(const scoped_base &);
 73 | #endif
 74 | };
 75 | 
 76 | template <class T, class Closer> class scoped : public scoped_base<T, Closer> {
 77 |   public:
 78 |     explicit scoped(T *p = NULL) : scoped_base<T, Closer>(p) {}
 79 |     // Dereference the held pointer; it is not checked for NULL.
 80 |     T &operator*() { return *this->p_; }
 81 |     const T&operator*() const { return *this->p_; }
 82 | };
 83 | // Specialization for void, which gets no operator* since void cannot be dereferenced.
 84 | template <class Closer> class scoped<void, Closer> : public scoped_base<void, Closer> {
 85 |   public:
 86 |     explicit scoped(void *p = NULL) : scoped_base<void, Closer>(p) {}
 87 | };
 88 | 
 89 | /* Closer that forwards to a C cleanup function (like std::free or a cmph cleanup function) given as a template argument. */
 90 | template <class T, void (*clean)(T*)> struct scoped_c_forward {
 91 |   static void Close(T *p) { clean(p); }  // clean receives whatever pointer is passed, NULL included.
 92 | };
 93 | // Scoped pointer whose cleanup is the C function clean.
 94 | template <class T, void (*clean)(T*)> class scoped_c : public scoped<T, scoped_c_forward<T, clean> > {
 95 |   public:
 96 |     explicit scoped_c(T *p = NULL) : scoped<T, scoped_c_forward<T, clean> >(p) {}
 97 | };
 98 | 
 99 | class scoped_malloc : public scoped_c<void, std::free> {  // Holds a void * that is released with std::free.
100 |   public:
101 |     explicit scoped_malloc(void *p = NULL) : scoped_c<void, std::free>(p) {}
102 |     // Allocate size bytes with MallocOrThrow and hold the result.
103 |     explicit scoped_malloc(std::size_t size) : scoped_c<void, std::free>(MallocOrThrow(size)) {}
104 |     // Resize the held block to `to` bytes with std::realloc (defined in scoped.cc).
105 |     void call_realloc(std::size_t to);
106 | };
107 | 
108 | /* Closer for arrays allocated with new[]: releases them with delete[]. */
109 | struct scoped_delete_array_forward {
110 |   template <class T> static void Close(T *p) { delete [] p; }
111 | };
112 | // Array owner with unchecked indexing.  Hat tip to boost.
113 | template <class T> class scoped_array : public scoped<T, scoped_delete_array_forward> {
114 |   public:
115 |     explicit scoped_array(T *p = NULL) : scoped<T, scoped_delete_array_forward>(p) {}
116 | 
117 |     T &operator[](std::size_t idx) { return this->p_[idx]; }
118 |     const T &operator[](std::size_t idx) const { return this->p_[idx]; }
119 | };
120 | 
121 | /* scoped_ptr using delete.  If only there were a template typedef. */
122 | struct scoped_delete_forward {
123 |   template <class T> static void Close(T *p) { delete p; }  // Closer for single objects allocated with new.
124 | };
125 | template <class T> class scoped_ptr : public scoped<T, scoped_delete_forward> {  // Adds only the constructor to scoped.
126 |   public:
127 |     explicit scoped_ptr(T *p = NULL) : scoped<T, scoped_delete_forward>(p) {}
128 | };
129 | 
130 | void AdviseHugePages(const void *addr, std::size_t size);
131 | 
132 | } // namespace util
133 | 
134 | #endif // UTIL_SCOPED_H
135 | 


--------------------------------------------------------------------------------
/util/spaces.cc:
--------------------------------------------------------------------------------
1 | #include "util/spaces.hh"
2 | 
3 | namespace util {
4 | 
5 | // Sigh, this is the only way I could come up with to define a _const_ bool array.  Entries are true for ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace in the C locale).
6 | const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
7 | 
8 | } // namespace util
9 | 


--------------------------------------------------------------------------------
/util/spaces.hh:
--------------------------------------------------------------------------------
 1 | #ifndef UTIL_SPACES_H
 2 | #define UTIL_SPACES_H
 3 | 
 4 | // bool array of spaces.
 5 | 
 6 | namespace util {
 7 | 
 8 | extern const bool kSpaces[256];
 9 | 
10 | } // namespace util
11 | 
12 | #endif // UTIL_SPACES_H
13 | 


--------------------------------------------------------------------------------
/util/string_stream.hh:
--------------------------------------------------------------------------------
 1 | #ifndef UTIL_STRING_STREAM_H
 2 | #define UTIL_STRING_STREAM_H
 3 | 
 4 | #include "util/fake_ostream.hh"
 5 | 
 6 | #include <cassert>
 7 | #include <string>
 8 | 
 9 | namespace util {
10 | 
11 | class StringStream : public FakeOStream<StringStream> {  // FakeOStream backend that accumulates into a std::string.
12 |   public:
13 |     StringStream() {}
14 |     // The only state is out_, so flushing has no work to do.
15 |     StringStream &flush() { return *this; }
16 |     // Append length raw bytes starting at data.
17 |     StringStream &write(const void *data, std::size_t length) {
18 |       out_.append(static_cast<const char*>(data), length);
19 |       return *this;
20 |     }
21 |     // Everything written so far.
22 |     const std::string &str() const { return out_; }
23 |     // Replace the accumulated text with val.
24 |     void str(const std::string &val) { out_ = val; }
25 |     // Exchange the accumulated text with str.
26 |     void swap(std::string &str) { out_.swap(str); }
27 | 
28 |   protected:
29 |     friend class FakeOStream<StringStream>;
30 |     char *Ensure(std::size_t amount) {  // Grow by amount bytes and return the start of the new region.
31 |       std::size_t current = out_.size();
32 |       out_.resize(out_.size() + amount);
33 |       return &out_[current];
34 |     }
35 |     // Shrink so the string ends at to, which must point into (or one past the end of) the current contents.
36 |     void AdvanceTo(char *to) {
37 |       const char *const begin = out_.data();  // data() is always valid; dereferencing out_.end() is not.
38 |       assert(to >= begin && to <= begin + out_.size());
39 |       out_.resize(to - begin);
40 |     }
41 | 
42 |   private:
43 |     std::string out_;
44 | };
45 | 
46 | } // namespace
47 | 
48 | #endif // UTIL_STRING_STREAM_H
49 | 


--------------------------------------------------------------------------------
/util/string_stream_test.cc:
--------------------------------------------------------------------------------
 1 | #define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE
 2 | #define BOOST_TEST_MODULE FakeOStreamTest
 3 | 
 4 | #include "util/string_stream.hh"
 5 | #include <boost/test/unit_test.hpp>
 6 | #include <boost/lexical_cast.hpp>
 7 | 
 8 | #include <cstddef>
 9 | #include <limits>
10 | 
11 | namespace util { namespace {
12 | 
13 | template <class T> void TestEqual(const T value) {  // StringStream output must match boost::lexical_cast.
14 |   StringStream actual;
15 |   actual << value;
16 |   BOOST_CHECK_EQUAL(boost::lexical_cast<std::string>(value), actual.str());
17 | }
18 | 
19 | template <class T> void TestCorners() {  // Extremes of T first, then 0, -1 (cast to T) and 1.
20 |   typedef std::numeric_limits<T> Limits;
21 |   TestEqual(Limits::max());
22 |   TestEqual(Limits::min());
23 |   const T small[] = {static_cast<T>(0), static_cast<T>(-1), static_cast<T>(1)};
24 |   for (std::size_t i = 0; i < 3; ++i) TestEqual(small[i]);
25 | }
26 | 
27 | BOOST_AUTO_TEST_CASE(Integer) {  // Runs the corner-value checks for every built-in integer type.
28 |   TestCorners<char>();
29 |   TestCorners<signed char>();
30 |   TestCorners<unsigned char>();
31 |   // short
32 |   TestCorners<short>();
33 |   TestCorners<signed short>();
34 |   TestCorners<unsigned short>();
35 |   // int
36 |   TestCorners<int>();
37 |   TestCorners<unsigned int>();
38 |   TestCorners<signed int>();
39 |   // long
40 |   TestCorners<long>();
41 |   TestCorners<unsigned long>();
42 |   TestCorners<signed long>();
43 |   // long long
44 |   TestCorners<long long>();
45 |   TestCorners<unsigned long long>();
46 |   TestCorners<signed long long>();
47 |   // size_t
48 |   TestCorners<std::size_t>();
49 | }
50 | 
51 | enum TinyEnum { EnumValue };  // Single enumerator, so EnumValue == 0.
52 | 
53 | BOOST_AUTO_TEST_CASE(EnumCase) {
54 |   TestEqual(EnumValue);  // An enum value goes through the same comparison as the integers.
55 | }
56 | 
57 | BOOST_AUTO_TEST_CASE(Strings) {
58 |   // String-like inputs: literal, const char *, StringPiece (including a default-constructed one).
59 |   TestEqual("foo");
60 |   const char *c_string = "bar";
61 |   TestEqual(c_string);
62 |   StringPiece piece("abcdef");
63 |   TestEqual(piece);
64 |   TestEqual(StringPiece());
65 | 
66 |   // A writable char array must be streamed as a C string as well.
67 |   char writable[3] = {'b', 'c', 0};
68 | 
69 |   // Chained insertion of mixed argument types.
70 |   StringStream first;
71 |   first << "a" << writable << 'c';
72 |   BOOST_CHECK_EQUAL("abcc", first.str());
73 | 
74 |   // Now test as a separate object.
75 |   StringStream second;
76 |   second << "a" << writable << 'c' << piece;
77 |   BOOST_CHECK_EQUAL("abccabcdef", second.str());
78 | }
79 | 
80 | }} // namespaces
81 | 


--------------------------------------------------------------------------------
/util/tokenize_piece_test.cc:
--------------------------------------------------------------------------------
 1 | #include "util/tokenize_piece.hh"
 2 | #include "util/string_piece.hh"
 3 | 
 4 | #define BOOST_TEST_MODULE TokenIteratorTest
 5 | #include <boost/test/unit_test.hpp>
 6 | 
 7 | #include <iostream>
 8 | 
 9 | namespace util {
10 | namespace {
11 | 
12 | BOOST_AUTO_TEST_CASE(pipe_pipe_none) {  // Input without the delimiter: the whole string is the only token.
13 |   const char str[] = "nodelimit at all";
14 |   TokenIter<MultiCharacter> it(str, MultiCharacter("|||"));
15 |   BOOST_REQUIRE(it);
16 |   BOOST_CHECK_EQUAL(StringPiece(str), *it);
17 |   ++it;
18 |   BOOST_CHECK(!it);  // Exhausted after one token.
19 | }
20 | BOOST_AUTO_TEST_CASE(pipe_pipe_two) {  // Input that is exactly one delimiter: two empty tokens are expected.
21 |   const char str[] = "|||";
22 |   TokenIter<MultiCharacter> it(str, MultiCharacter("|||"));
23 |   BOOST_REQUIRE(it);
24 |   BOOST_CHECK_EQUAL(StringPiece(), *it);  // Empty token before the delimiter.
25 |   ++it;
26 |   BOOST_REQUIRE(it);
27 |   BOOST_CHECK_EQUAL(StringPiece(), *it);  // Empty token after the delimiter.
28 |   ++it;
29 |   BOOST_CHECK(!it);
30 | }
31 | 
32 | BOOST_AUTO_TEST_CASE(remove_empty) {  // Same input, but with the second template argument set to true.
33 |   const char str[] = "|||";
34 |   TokenIter<MultiCharacter, true> it(str, MultiCharacter("|||"));
35 |   BOOST_CHECK(!it);  // Both empty tokens are dropped, so nothing is left.
36 | }
37 | 
38 | BOOST_AUTO_TEST_CASE(remove_empty_keep) {  // A single space is not an empty token and must survive.
39 |   const char str[] = " |||";
40 |   TokenIter<MultiCharacter, true> it(str, MultiCharacter("|||"));
41 |   BOOST_REQUIRE(it);
42 |   BOOST_CHECK_EQUAL(StringPiece(" "), *it);
43 |   ++it;
44 |   BOOST_CHECK(!it);  // The empty token after the delimiter is dropped.
45 | }
46 | 
47 | } // namespace
48 | } // namespace util
49 | 


--------------------------------------------------------------------------------
/util/utf8.cc:
--------------------------------------------------------------------------------
 1 | #include "util/utf8.hh"
 2 | 
 3 | #include "util/string_piece.hh"
 4 | 
 5 | namespace util {
 6 | 
 7 | NotUTF8Exception::NotUTF8Exception(const StringPiece &) throw() {}  // The argument is accepted but not used here.
 8 | 
 9 | NotUTF8Exception::~NotUTF8Exception() throw() {}
10 | 
11 | bool IsUTF8(const StringPiece &str) {  // True when the whole input decodes without a NotUTF8Exception.
12 |   try {
13 |     DecodeUTF8Range decoded(str);
14 |     auto finish = decoded.end();
15 |     for (auto cursor = decoded.begin(); cursor != finish; ++cursor) (void)*cursor;  // Decode and discard.
16 |     return true;
17 |   } catch (const NotUTF8Exception &) {
18 |     return false;
19 |   }
20 | }
21 | 
22 | } // namespace util
23 | 


--------------------------------------------------------------------------------
/util/utf8_icu.hh:
--------------------------------------------------------------------------------
 1 | /* Utilities for UTF-8 that require ICU.  */
 2 | 
 3 | #ifndef UTIL_UTF8_ICU
 4 | #define UTIL_UTF8_ICU
 5 | 
 6 | #include "util/string_piece.hh"
 7 | 
 8 | #include <exception>
 9 | #include <string>
10 | 
11 | #include <unicode/utypes.h>
12 | 
13 | U_NAMESPACE_BEGIN
14 | class UnicodeString;
15 | U_NAMESPACE_END
16 | 
17 | namespace util {
18 | 
19 | class NormalizeException : public std::exception {  // Built from the input text and an ICU error code.
20 |   public:
21 |     NormalizeException(const StringPiece &original, UErrorCode code) throw();
22 |     ~NormalizeException() throw() {}
23 |     // The returned pointer refers to what_, so it is valid while this object is alive and unmodified.
24 |     const char *what() const throw() { return what_.c_str(); }
25 | 
26 |   private:
27 |     std::string original_;
28 |     // Message returned by what().
29 |     std::string what_;
30 | };
31 | 
32 | 
33 | class ICUStupidlyUses32BitIntegersException : public std::exception {  // utf8_test.cc expects ToLower to throw this for a length of 1ULL << 32.
34 |   public:
35 |     ~ICUStupidlyUses32BitIntegersException();
36 |     const char *what() const throw();  // Declared only; the definition is not in this header.
37 | };
38 | 
39 | // TODO: Implement these in a way that doesn't botch Turkish.
40 | void ToLower(const StringPiece &in, std::string &out);
41 | 
42 | void Normalize(const U_ICU_NAMESPACE::UnicodeString &in, U_ICU_NAMESPACE::UnicodeString &out);
43 | void Normalize(const StringPiece &in, std::string &out);
44 | 
45 | class UnsupportedLanguageException : public std::exception {  // Built from a language name; presumably raised by Flatten — confirm in the .cc.
46 |   public:
47 |     explicit UnsupportedLanguageException(const StringPiece &language) throw();
48 |     ~UnsupportedLanguageException() throw() {}
49 |     // The returned pointer refers to what_, so it is valid while this object is alive and unmodified.
50 |     const char *what() const throw() { return what_.c_str(); }
51 |     
52 |     const std::string &Language() const { return language_; }  // Accessor for the stored language_ string.
53 | 
54 |   private:
55 |     std::string language_;
56 |     std::string what_;
57 | };
58 | 
59 | /* Technically Flatten could be done without ICU but then it's only used in process_unicode that wants UnicodeString */
60 | class FlattenData;
61 | // Language-specific rewriting; e.g. utf8_test.cc expects "«foo bar» '" to become "\"foo bar\" '" for "en".
62 | class Flatten {
63 |   public:
64 |     explicit Flatten(const StringPiece &language);
65 |     // Both overloads write the result to out; one works on UTF-8 bytes, the other on ICU strings.
66 |     void Apply(const StringPiece &in, std::string &out) const;
67 |     void Apply(const U_ICU_NAMESPACE::UnicodeString &in, U_ICU_NAMESPACE::UnicodeString &out) const;
68 | 
69 |   private:
70 |     const FlattenData &data_;  // Held by reference; this class does not declare who owns the data.
71 | };
72 | 
73 | } // namespace util
74 | 
75 | #endif // UTIL_UTF8_ICU
76 | 


--------------------------------------------------------------------------------
/util/utf8_test.cc:
--------------------------------------------------------------------------------
  1 | #include "util/utf8.hh"
  2 | #include "util/utf8_icu.hh"
  3 | 
  4 | #define BOOST_TEST_MODULE UTF8Test
  5 | #include <boost/test/unit_test.hpp>
  6 | 
  7 | #define CHECK_LOWER(ref, from) { /* ToLower(from) must produce ref. */ \
  8 |   std::string out; \
  9 |   ToLower(from, out); \
 10 |   BOOST_CHECK_EQUAL(ref, out); \
 11 | }
 12 | // Normalize(from) must produce ref.
 13 | #define CHECK_NORMALIZE(ref, from) { \
 14 |   std::string out; \
 15 |   Normalize(from, out); \
 16 |   BOOST_CHECK_EQUAL(ref, out); \
 17 | }
 18 | // Flatten for the given language, applied to from, must produce ref.
 19 | #define CHECK_FLATTEN(ref, from, language) { \
 20 |   Flatten flat(language); \
 21 |   std::string out; \
 22 |   flat.Apply(from, out); \
 23 |   BOOST_CHECK_EQUAL(ref, out); \
 24 | }
 25 | 
 26 | namespace util {
 27 | namespace {
 28 | 
 29 | BOOST_AUTO_TEST_CASE(ASCII) {  // Lowercasing plain ASCII, including input that is already partly lowercase.
 30 |   CHECK_LOWER("foo", "FOO");
 31 |   CHECK_LOWER("foobaz", "fooBAz");
 32 | }
 33 | 
 34 | BOOST_AUTO_TEST_CASE(Accents) {
 35 |   CHECK_LOWER("ôæðø", "ôÆÐØ");  // Non-ASCII capitals are lowercased; the already-lowercase ô is unchanged.
 36 | }
 37 | 
 38 | BOOST_AUTO_TEST_CASE(Thorn) {
 39 |   CHECK_LOWER("þ", "Þ");
 40 | }
 41 | 
 42 | BOOST_AUTO_TEST_CASE(NormalizeASCII) {
 43 |   CHECK_NORMALIZE("foo", "foo");  // ASCII passes through unchanged.
 44 | }
 45 | 
 46 | // This is a valid letter in some languages
 47 | BOOST_AUTO_TEST_CASE(NormalizeAE) {
 48 |   CHECK_NORMALIZE("æ", "æ");  // Expected to be left as is rather than split into two letters.
 49 | }
 50 | 
 51 | BOOST_AUTO_TEST_CASE(NormalizeFI) {
 52 |   CHECK_NORMALIZE("fi", "ﬁ");  // The single-character ligature becomes the two letters f and i.
 53 | }
 54 | 
 55 | BOOST_AUTO_TEST_CASE(NormalizeFive) {
 56 |   CHECK_NORMALIZE("5", "⁵");  // Superscript five becomes the plain digit.
 57 | }
 58 | 
 59 | BOOST_AUTO_TEST_CASE(FlattenEnglish) {
 60 |   CHECK_FLATTEN("\"foo bar\" '", "«foo bar» '", "en");  // Guillemets become ASCII double quotes for "en".
 61 | }
 62 | 
 63 | BOOST_AUTO_TEST_CASE(FlattenFrench) {
 64 |   CHECK_FLATTEN("«foo bar»", "``foo bar''", "fr");  // For "fr" the direction is reversed: ASCII quoting becomes guillemets.
 65 | }
 66 | 
 67 | BOOST_AUTO_TEST_CASE(FlattenBunch) {
 68 |   CHECK_FLATTEN("...oeAe\"'s ", "…œÆ''' s ", "en");
 69 | }
 70 | 
 71 | BOOST_AUTO_TEST_CASE(FlattenPossessive) {  // "' s" is joined into "'s" only when the s stands alone.
 72 |   CHECK_FLATTEN("'s", "' s", "en");
 73 |   CHECK_FLATTEN("'s ", "' s ", "en");
 74 |   CHECK_FLATTEN("a's", "a' s", "en");
 75 |   CHECK_FLATTEN("a's ", "a' s ", "en");
 76 |   CHECK_FLATTEN("' sfoo", "' sfoo", "en");
 77 |   CHECK_FLATTEN("' sfoo ", "' sfoo ", "en");
 78 | }
 79 | 
 80 | BOOST_AUTO_TEST_CASE(FailLarge) {
 81 |   StringPiece large(0, 1ULL << 32);  // Null data pointer with a length of 2^32.
 82 |   std::string out;
 83 |   BOOST_CHECK_THROW(ToLower(large, out), ICUStupidlyUses32BitIntegersException);
 84 | }
 85 | 
 86 | BOOST_AUTO_TEST_CASE(IsUTF8Test) {
 87 |   BOOST_CHECK(IsUTF8("…œÆ5ôÆÐØôæðø"));
 88 |   BOOST_CHECK(!IsUTF8("…œ\xaaÆ5œÆ5ôÆÐØôæðø"));  // \xaa is a stray continuation byte after a complete character.
 89 | }
 90 | 
 91 | BOOST_AUTO_TEST_CASE(Iterator) {  // Walks five code points of increasing variety and checks each decoded value.
 92 |   DecodeUTF8Range range("\ufeffﬁ«🤦a");
 93 |   DecodeUTF8Iterator i = range.begin();
 94 |   BOOST_CHECK(i != range.end());
 95 |   BOOST_CHECK(!range.end());  // The end iterator converts to false.
 96 |   BOOST_CHECK_EQUAL(0xfeff, *i++);
 97 |   BOOST_CHECK_EQUAL(0xFB01, *i++);
 98 |   BOOST_CHECK_EQUAL(0xAB, *i++);
 99 |   BOOST_CHECK_EQUAL(0x1F926, *i++);  // A code point above U+FFFF.
100 |   BOOST_CHECK_EQUAL('a', *i++);
101 |   BOOST_CHECK(!i);  // Exhausted after the last code point.
102 |   BOOST_CHECK(i == range.end());
103 | }
104 | 
105 | /* This has been tested but it uses > 2 GB virtual memory so isn't enabled by default. */
106 | /* BOOST_AUTO_TEST_CASE(LargeIsUTF8) {
107 |   const size_t kBufferSize = (1ULL << 32) + 30ULL;
108 |   std::vector<char> buffer(kBufferSize);
109 |   StringPiece big(&*buffer.begin(), kBufferSize);
110 |   BOOST_CHECK(IsUTF8(big));
111 |   buffer[0] = 129;
112 |   BOOST_CHECK(!IsUTF8(big));
113 |   buffer[0] = 0;
114 |   buffer[1ULL << 32] = 129;
115 |   BOOST_CHECK(!IsUTF8(big));
116 | }*/
117 | 
118 | 
119 | } // namespace
120 | } // namespace util
121 | 


--------------------------------------------------------------------------------