├── .github └── workflows │ └── ubuntu.yml ├── .gitignore ├── CMakeLists.txt ├── COPYING ├── COPYING.LESSER ├── FindICU.cmake ├── LICENSE ├── README.md ├── moses ├── CMakeLists.txt ├── ems │ └── support │ │ └── split-sentences.perl ├── share │ └── nonbreaking_prefixes │ │ ├── README.txt │ │ ├── nonbreaking_prefix.as │ │ ├── nonbreaking_prefix.bn │ │ ├── nonbreaking_prefix.ca │ │ ├── nonbreaking_prefix.cs │ │ ├── nonbreaking_prefix.de │ │ ├── nonbreaking_prefix.el │ │ ├── nonbreaking_prefix.en │ │ ├── nonbreaking_prefix.es │ │ ├── nonbreaking_prefix.et │ │ ├── nonbreaking_prefix.fi │ │ ├── nonbreaking_prefix.fr │ │ ├── nonbreaking_prefix.ga │ │ ├── nonbreaking_prefix.gu │ │ ├── nonbreaking_prefix.hi │ │ ├── nonbreaking_prefix.hu │ │ ├── nonbreaking_prefix.is │ │ ├── nonbreaking_prefix.it │ │ ├── nonbreaking_prefix.kn │ │ ├── nonbreaking_prefix.lt │ │ ├── nonbreaking_prefix.lv │ │ ├── nonbreaking_prefix.ml │ │ ├── nonbreaking_prefix.mni │ │ ├── nonbreaking_prefix.mr │ │ ├── nonbreaking_prefix.nl │ │ ├── nonbreaking_prefix.or │ │ ├── nonbreaking_prefix.pa │ │ ├── nonbreaking_prefix.pl │ │ ├── nonbreaking_prefix.pt │ │ ├── nonbreaking_prefix.ro │ │ ├── nonbreaking_prefix.ru │ │ ├── nonbreaking_prefix.sk │ │ ├── nonbreaking_prefix.sl │ │ ├── nonbreaking_prefix.sv │ │ ├── nonbreaking_prefix.ta │ │ ├── nonbreaking_prefix.te │ │ ├── nonbreaking_prefix.yue │ │ └── nonbreaking_prefix.zh └── tokenizer │ ├── deescape-special-chars.perl │ ├── detokenizer.perl │ ├── escape-special-chars.perl │ ├── lowercase.perl │ ├── normalize-punctuation.perl │ └── tokenizer.perl ├── preprocess ├── CMakeLists.txt ├── apply_case_main.cc ├── b64filter_main.cc ├── base64.cc ├── base64.hh ├── base64_number_main.cc ├── cache_main.cc ├── captive_child.cc ├── captive_child.hh ├── commoncrawl_dedupe_main.cc ├── dedupe_main.cc ├── docenc_main.cc ├── fields.cc ├── fields.hh ├── foldfilter_main.cc ├── gigaword_extract.sh ├── gigaword_unwrap_main.cc ├── heuristics.perl ├── idf_main.cc ├── mmhsum_main.cc 
├── order_independent_hash_main.cc ├── parallel.hh ├── process_unicode_main.cc ├── remove_invalid_utf8_base64_main.cc ├── remove_invalid_utf8_main.cc ├── remove_long_lines_main.cc ├── resplit.sh ├── shard_main.cc ├── simple_cleaning_main.cc ├── substitute_main.cc ├── subtract_lines_main.cc ├── tests │ ├── cache │ │ ├── input │ │ ├── run.sh │ │ ├── space_expected │ │ └── space_ref.py │ ├── dedupe │ │ ├── columns │ │ ├── columns.out │ │ ├── expected │ │ ├── input │ │ ├── ref.py │ │ └── run.sh │ ├── foldfilter │ │ ├── fold10.expected │ │ ├── input │ │ └── run.sh │ ├── run.sh │ ├── shard │ │ ├── input │ │ └── run.sh │ └── vars ├── text.sh ├── train_case_main.cc ├── truecase_main.cc ├── unescape_html.perl ├── vocab_main.cc ├── warc.cc ├── warc.hh └── warc_parallel_main.cc └── util ├── CMakeLists.txt ├── buffered_stream.hh ├── cat_compressed_main.cc ├── compress.cc ├── compress.hh ├── compress_test.cc ├── double-conversion ├── CMakeLists.txt ├── Jamfile ├── LICENSE ├── bignum-dtoa.cc ├── bignum-dtoa.h ├── bignum.cc ├── bignum.h ├── cached-powers.cc ├── cached-powers.h ├── diy-fp.cc ├── diy-fp.h ├── double-conversion.cc ├── double-conversion.h ├── fast-dtoa.cc ├── fast-dtoa.h ├── fixed-dtoa.cc ├── fixed-dtoa.h ├── ieee.h ├── strtod.cc ├── strtod.h └── utils.h ├── ersatz_progress.cc ├── ersatz_progress.hh ├── exception.cc ├── exception.hh ├── fake_ostream.hh ├── file.cc ├── file.hh ├── file_piece.cc ├── file_piece.hh ├── file_piece_test.cc ├── file_stream.hh ├── fixed_array.hh ├── float_to_string.cc ├── float_to_string.hh ├── have.hh ├── integer_to_string.cc ├── integer_to_string.hh ├── integer_to_string_test.cc ├── mmap.cc ├── mmap.hh ├── murmur_hash.cc ├── murmur_hash.hh ├── mutable_vocab.cc ├── mutable_vocab.hh ├── mutable_vocab_test.cc ├── object_pool.hh ├── pcqueue.hh ├── pcqueue_test.cc ├── pool.cc ├── pool.hh ├── probing_hash_table.hh ├── probing_hash_table_test.cc ├── scoped.cc ├── scoped.hh ├── spaces.cc ├── spaces.hh ├── string_piece.cc ├── string_piece.hh ├── 
string_stream.hh ├── string_stream_test.cc ├── threaded_buffered_stream.hh ├── tokenize_piece.hh ├── tokenize_piece_test.cc ├── utf8.cc ├── utf8.hh ├── utf8_icu.cc ├── utf8_icu.hh └── utf8_test.cc /.github/workflows/ubuntu.yml: -------------------------------------------------------------------------------- 1 | name: Ubuntu 2 | 3 | on: 4 | push: 5 | branches: [master] 6 | pull_request: 7 | branches: [master] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: dependencies 16 | run: sudo apt-get install -y build-essential libboost-test-dev libboost-program-options-dev cmake zlib1g-dev libbz2-dev liblzma-dev libicu-dev 17 | - name: cmake 18 | run: | 19 | cmake -E make_directory build 20 | cd build 21 | cmake .. -DCOMPILE_TESTS=ON # COMPILE_TESTS defaults to OFF; without it the Unit Tests step has nothing to run 22 | - name: Compile 23 | working-directory: build 24 | run: cmake --build . -j2 25 | - name: Unit Tests 26 | working-directory: build 27 | run: ctest -j2 28 | - name: Regression Tests 29 | run: preprocess/tests/run.sh 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | util/file_piece.cc.gz 3 | *.swp 4 | *.o 5 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8.12) 2 | 3 | # Define a single cmake project 4 | project(preprocess) 5 | 6 | #Set for FindICU.cmake 7 | set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}) 8 | set(CMAKE_CXX_STANDARD 11) 9 | 10 | # Compile all executables into bin/ 11 | set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) 12 | 13 | # Compile all libraries into lib/ 14 | set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib) 15 | 16 | if (NOT CMAKE_BUILD_TYPE) 17 | set(CMAKE_BUILD_TYPE Release) 18 | endif() 19 | 20 | option(COMPILE_TESTS "Compile tests" OFF) 21 | if 
(COMPILE_TESTS) 22 | # Tell cmake that we want unit tests to be compiled 23 | include(CTest) 24 | enable_testing() 25 | endif() 26 | # MSVC: append /w34716 (report warning C4716 at warning level 3) to each language's own flags so existing C and C++ flags are kept. 27 | if(MSVC) 28 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /w34716") 29 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /w34716") 30 | endif() 31 | 32 | set(BOOST_LIBS program_options) 33 | if (COMPILE_TESTS) 34 | set(BOOST_LIBS ${BOOST_LIBS} unit_test_framework) 35 | endif() 36 | 37 | # We need boost for now to do program_options. 38 | find_package(Boost 1.41.0 REQUIRED COMPONENTS ${BOOST_LIBS}) 39 | 40 | find_package(ICU COMPONENTS i18n uc data io) 41 | include(CMakeDependentOption) 42 | cmake_dependent_option(USE_ICU "Build programs that use ICU" ON ICU_FOUND OFF) 43 | 44 | # Define where include files live 45 | include_directories( 46 | ${PROJECT_SOURCE_DIR} 47 | ${Boost_INCLUDE_DIRS} 48 | ${ICU_INCLUDE_DIRS} 49 | ) 50 | 51 | # Process subdirectories 52 | add_subdirectory(util) 53 | add_subdirectory(preprocess) 54 | add_subdirectory(moses) 55 | 56 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Most of the code here is licensed under the LGPL. There are exceptions which have their own licenses, listed below. See comments in those files for more details. 2 | 3 | util/murmur_hash.cc is under the MIT license. 4 | util/string_piece.hh and util/string_piece.cc are Google code. 5 | util/file.cc contains a modified implementation of mkstemp under the LGPL. 6 | FindICU.cmake is under BSD-2 clause license. 7 | util/utf8.hh contains Google code under Apache-2.0. 8 | 9 | For the rest: 10 | 11 | preprocess is free software: you can redistribute it and/or modify 12 | it under the terms of the GNU Lesser General Public License as published 13 | by the Free Software Foundation, either version 3 of the License, or 14 | (at your option) any later version. 
15 | 16 | Avenue code is distributed in the hope that it will be useful, 17 | but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | GNU Lesser General Public License for more details. 20 | 21 | You should have received a copy of the GNU Lesser General Public License 22 | along with Avenue code. If not, see . 23 | -------------------------------------------------------------------------------- /moses/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copy the Moses scripts and every nonbreaking-prefix data file into the build tree, keeping the same path relative to the parent of this directory. 2 | foreach(moses moses/tokenizer/normalize-punctuation.perl moses/tokenizer/escape-special-chars.perl moses/tokenizer/tokenizer.perl moses/tokenizer/lowercase.perl moses/tokenizer/detokenizer.perl moses/tokenizer/deescape-special-chars.perl moses/share/nonbreaking_prefixes/nonbreaking_prefix.ro moses/share/nonbreaking_prefixes/nonbreaking_prefix.sk moses/share/nonbreaking_prefixes/nonbreaking_prefix.it moses/share/nonbreaking_prefixes/nonbreaking_prefix.ru moses/share/nonbreaking_prefixes/nonbreaking_prefix.cs moses/share/nonbreaking_prefixes/nonbreaking_prefix.ca moses/share/nonbreaking_prefixes/nonbreaking_prefix.es moses/share/nonbreaking_prefixes/nonbreaking_prefix.is moses/share/nonbreaking_prefixes/README.txt moses/share/nonbreaking_prefixes/nonbreaking_prefix.pt moses/share/nonbreaking_prefixes/nonbreaking_prefix.sl moses/share/nonbreaking_prefixes/nonbreaking_prefix.pl moses/share/nonbreaking_prefixes/nonbreaking_prefix.nl moses/share/nonbreaking_prefixes/nonbreaking_prefix.sv moses/share/nonbreaking_prefixes/nonbreaking_prefix.el moses/share/nonbreaking_prefixes/nonbreaking_prefix.fr moses/share/nonbreaking_prefixes/nonbreaking_prefix.en moses/share/nonbreaking_prefixes/nonbreaking_prefix.de moses/share/nonbreaking_prefixes/nonbreaking_prefix.as moses/share/nonbreaking_prefixes/nonbreaking_prefix.bn moses/share/nonbreaking_prefixes/nonbreaking_prefix.et moses/share/nonbreaking_prefixes/nonbreaking_prefix.fi moses/share/nonbreaking_prefixes/nonbreaking_prefix.ga moses/share/nonbreaking_prefixes/nonbreaking_prefix.gu moses/share/nonbreaking_prefixes/nonbreaking_prefix.hi moses/share/nonbreaking_prefixes/nonbreaking_prefix.hu moses/share/nonbreaking_prefixes/nonbreaking_prefix.kn moses/share/nonbreaking_prefixes/nonbreaking_prefix.lt moses/share/nonbreaking_prefixes/nonbreaking_prefix.lv moses/share/nonbreaking_prefixes/nonbreaking_prefix.ml moses/share/nonbreaking_prefixes/nonbreaking_prefix.mni moses/share/nonbreaking_prefixes/nonbreaking_prefix.mr moses/share/nonbreaking_prefixes/nonbreaking_prefix.or moses/share/nonbreaking_prefixes/nonbreaking_prefix.pa moses/share/nonbreaking_prefixes/nonbreaking_prefix.ta moses/share/nonbreaking_prefixes/nonbreaking_prefix.te moses/share/nonbreaking_prefixes/nonbreaking_prefix.yue moses/share/nonbreaking_prefixes/nonbreaking_prefix.zh moses/ems/support/split-sentences.perl) 3 | configure_file("${CMAKE_CURRENT_SOURCE_DIR}/../${moses}" "${CMAKE_CURRENT_BINARY_DIR}/../${moses}" COPYONLY) 4 | endforeach() 5 | -------------------------------------------------------------------------------- 
/moses/share/nonbreaking_prefixes/README.txt: -------------------------------------------------------------------------------- 1 | The language suffix can be found here: 2 | 3 | http://www.loc.gov/standards/iso639-2/php/code_list.php 4 | 5 | This code includes data from Daniel Naber's Language Tools (czech abbreviations). 6 | This code includes data from czech wiktionary (also czech abbreviations). 7 | 8 | 9 | -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.as: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | 3 | #common exceptions 4 | # Dr 5 | ড 6 | 7 | #others 8 | 9 | 10 | #phonetics 11 | # A 12 | এ 13 | # B 14 | বি 15 | # C 16 | সি 17 | # D 18 | ডি 19 | # E 20 | ই 21 | # F 22 | এফ 23 | # G 24 | জি 25 | # H 26 | এইচ 27 | # I 28 | আম 29 | # J 30 | জে 31 | # K 32 | কে 33 | # L 34 | এল 35 | # M 36 | এম 37 | # N 38 | এন 39 | # O 40 | হে 41 | # P 42 | পি 43 | # Q 44 | কিউ 45 | # R 46 | আর 47 | # S 48 | এস 49 | # T 50 | টি 51 | # U 52 | ইউ 53 | # V 54 | ভি 55 | # W 56 | ডব্লু 57 | # X 58 | এক্স 59 | # Y 60 | ওয়াই 61 | # Z 62 | জেড 63 | 64 | #consonants 65 | 66 | -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.bn: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 
2 | 3 | #common exceptions 4 | # Dr 5 | ড 6 | 7 | #others 8 | 9 | 10 | #phonetics 11 | # A 12 | এ 13 | # B 14 | বি 15 | # C 16 | সি 17 | # D 18 | ডি 19 | # E 20 | ই 21 | # F 22 | এফ 23 | # G 24 | জি 25 | # H 26 | এইচ 27 | # I 28 | আম 29 | # J 30 | জে 31 | # K 32 | কে 33 | # L 34 | এল 35 | # M 36 | এম 37 | # N 38 | এন 39 | # O 40 | হে 41 | # P 42 | পি 43 | # Q 44 | কিউ 45 | # R 46 | আর 47 | # S 48 | এস 49 | # T 50 | টি 51 | # U 52 | ইউ 53 | # V 54 | ভি 55 | # W 56 | ডব্লু 57 | # X 58 | এক্স 59 | # Y 60 | ওয়াই 61 | # Z 62 | জেড 63 | 64 | #consonants 65 | 66 | -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.ca: -------------------------------------------------------------------------------- 1 | Dr 2 | Dra 3 | pàg 4 | p 5 | c 6 | av 7 | Sr 8 | Sra 9 | adm 10 | esq 11 | Prof 12 | S.A 13 | S.L 14 | p.e 15 | ptes 16 | Sta 17 | St 18 | pl 19 | màx 20 | cast 21 | dir 22 | nre 23 | fra 24 | admdora 25 | Emm 26 | Excma 27 | espf 28 | dc 29 | admdor 30 | tel 31 | angl 32 | aprox 33 | ca 34 | dept 35 | dj 36 | dl 37 | dt 38 | ds 39 | dg 40 | dv 41 | ed 42 | entl 43 | al 44 | i.e 45 | maj 46 | smin 47 | n 48 | núm 49 | pta 50 | A 51 | B 52 | C 53 | D 54 | E 55 | F 56 | G 57 | H 58 | I 59 | J 60 | K 61 | L 62 | M 63 | N 64 | O 65 | P 66 | Q 67 | R 68 | S 69 | T 70 | U 71 | V 72 | W 73 | X 74 | Y 75 | Z 76 | -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.cs: -------------------------------------------------------------------------------- 1 | Bc 2 | BcA 3 | Ing 4 | Ing.arch 5 | MUDr 6 | MVDr 7 | MgA 8 | Mgr 9 | JUDr 10 | PhDr 11 | RNDr 12 | PharmDr 13 | ThLic 14 | ThDr 15 | Ph.D 16 | Th.D 17 | prof 18 | doc 19 | CSc 20 | DrSc 21 | dr. h. 
c 22 | PaedDr 23 | Dr 24 | PhMr 25 | DiS 26 | abt 27 | ad 28 | a.i 29 | aj 30 | angl 31 | anon 32 | apod 33 | atd 34 | atp 35 | aut 36 | bd 37 | biogr 38 | b.m 39 | b.p 40 | b.r 41 | cca 42 | cit 43 | cizojaz 44 | c.k 45 | col 46 | čes 47 | čín 48 | čj 49 | ed 50 | facs 51 | fasc 52 | fol 53 | fot 54 | franc 55 | h.c 56 | hist 57 | hl 58 | hrsg 59 | ibid 60 | il 61 | ind 62 | inv.č 63 | jap 64 | jhdt 65 | jv 66 | koed 67 | kol 68 | korej 69 | kl 70 | krit 71 | lat 72 | lit 73 | m.a 74 | maď 75 | mj 76 | mp 77 | násl 78 | např 79 | nepubl 80 | něm 81 | no 82 | nr 83 | n.s 84 | okr 85 | odd 86 | odp 87 | obr 88 | opr 89 | orig 90 | phil 91 | pl 92 | pokrač 93 | pol 94 | port 95 | pozn 96 | př.kr 97 | př.n.l 98 | přel 99 | přeprac 100 | příl 101 | pseud 102 | pt 103 | red 104 | repr 105 | resp 106 | revid 107 | rkp 108 | roč 109 | roz 110 | rozš 111 | samost 112 | sect 113 | sest 114 | seš 115 | sign 116 | sl 117 | srv 118 | stol 119 | sv 120 | šk 121 | šk.ro 122 | špan 123 | tab 124 | t.č 125 | tis 126 | tj 127 | tř 128 | tzv 129 | univ 130 | uspoř 131 | vol 132 | vl.jm 133 | vs 134 | vyd 135 | vyobr 136 | zal 137 | zejm 138 | zkr 139 | zprac 140 | zvl 141 | n.p 142 | např 143 | než 144 | MUDr 145 | abl 146 | absol 147 | adj 148 | adv 149 | ak 150 | ak. 
sl 151 | akt 152 | alch 153 | amer 154 | anat 155 | angl 156 | anglosas 157 | arab 158 | arch 159 | archit 160 | arg 161 | astr 162 | astrol 163 | att 164 | bás 165 | belg 166 | bibl 167 | biol 168 | boh 169 | bot 170 | bulh 171 | círk 172 | csl 173 | č 174 | čas 175 | čes 176 | dat 177 | děj 178 | dep 179 | dět 180 | dial 181 | dór 182 | dopr 183 | dosl 184 | ekon 185 | epic 186 | etnonym 187 | eufem 188 | f 189 | fam 190 | fem 191 | fil 192 | film 193 | form 194 | fot 195 | fr 196 | fut 197 | fyz 198 | gen 199 | geogr 200 | geol 201 | geom 202 | germ 203 | gram 204 | hebr 205 | herald 206 | hist 207 | hl 208 | hovor 209 | hud 210 | hut 211 | chcsl 212 | chem 213 | ie 214 | imp 215 | impf 216 | ind 217 | indoevr 218 | inf 219 | instr 220 | interj 221 | ión 222 | iron 223 | it 224 | kanad 225 | katalán 226 | klas 227 | kniž 228 | komp 229 | konj 230 | 231 | konkr 232 | kř 233 | kuch 234 | lat 235 | lék 236 | les 237 | lid 238 | lit 239 | liturg 240 | lok 241 | log 242 | m 243 | mat 244 | meteor 245 | metr 246 | mod 247 | ms 248 | mysl 249 | n 250 | náb 251 | námoř 252 | neklas 253 | něm 254 | nesklon 255 | nom 256 | ob 257 | obch 258 | obyč 259 | ojed 260 | opt 261 | part 262 | pas 263 | pejor 264 | pers 265 | pf 266 | pl 267 | plpf 268 | 269 | práv 270 | prep 271 | předl 272 | přivl 273 | r 274 | rcsl 275 | refl 276 | reg 277 | rkp 278 | ř 279 | řec 280 | s 281 | samohl 282 | sg 283 | sl 284 | souhl 285 | spec 286 | srov 287 | stfr 288 | střv 289 | stsl 290 | subj 291 | subst 292 | superl 293 | sv 294 | sz 295 | táz 296 | tech 297 | telev 298 | teol 299 | trans 300 | typogr 301 | var 302 | vedl 303 | verb 304 | vl. 
jm 305 | voj 306 | vok 307 | vůb 308 | vulg 309 | výtv 310 | vztaž 311 | zahr 312 | zájm 313 | zast 314 | zejm 315 | 316 | zeměd 317 | zkr 318 | zř 319 | mj 320 | dl 321 | atp 322 | sport 323 | Mgr 324 | horn 325 | MVDr 326 | JUDr 327 | RSDr 328 | Bc 329 | PhDr 330 | ThDr 331 | Ing 332 | aj 333 | apod 334 | PharmDr 335 | pomn 336 | ev 337 | slang 338 | nprap 339 | odp 340 | dop 341 | pol 342 | st 343 | stol 344 | p. n. l 345 | před n. l 346 | n. l 347 | př. Kr 348 | po Kr 349 | př. n. l 350 | odd 351 | RNDr 352 | tzv 353 | atd 354 | tzn 355 | resp 356 | tj 357 | p 358 | br 359 | č. j 360 | čj 361 | č. p 362 | čp 363 | a. s 364 | s. r. o 365 | spol. s r. o 366 | p. o 367 | s. p 368 | v. o. s 369 | k. s 370 | o. p. s 371 | o. s 372 | v. r 373 | v z 374 | ml 375 | vč 376 | kr 377 | mld 378 | hod 379 | popř 380 | ap 381 | event 382 | rus 383 | slov 384 | rum 385 | švýc 386 | P. T 387 | zvl 388 | hor 389 | dol 390 | S.O.S -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.de: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | #no german words end in single lower-case letters, so we throw those in too. 
7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | 61 | #Roman Numerals. A dot after one of these is not a sentence break in German. 62 | I 63 | II 64 | III 65 | IV 66 | V 67 | VI 68 | VII 69 | VIII 70 | IX 71 | X 72 | XI 73 | XII 74 | XIII 75 | XIV 76 | XV 77 | XVI 78 | XVII 79 | XVIII 80 | XIX 81 | XX 82 | i 83 | ii 84 | iii 85 | iv 86 | v 87 | vi 88 | vii 89 | viii 90 | ix 91 | x 92 | xi 93 | xii 94 | xiii 95 | xiv 96 | xv 97 | xvi 98 | xvii 99 | xviii 100 | xix 101 | xx 102 | 103 | #Titles and Honorifics 104 | Adj 105 | Adm 106 | Adv 107 | Asst 108 | Bart 109 | Bldg 110 | Brig 111 | Bros 112 | Capt 113 | Cmdr 114 | Col 115 | Comdr 116 | Con 117 | Corp 118 | Cpl 119 | DR 120 | Dr 121 | Ens 122 | Gen 123 | Gov 124 | Hon 125 | Hosp 126 | Insp 127 | Lt 128 | MM 129 | MR 130 | MRS 131 | MS 132 | Maj 133 | Messrs 134 | Mlle 135 | Mme 136 | Mr 137 | Mrs 138 | Ms 139 | Msgr 140 | Op 141 | Ord 142 | Pfc 143 | Ph 144 | Prof 145 | Pvt 146 | Rep 147 | Reps 148 | Res 149 | Rev 150 | Rt 151 | Sen 152 | Sens 153 | Sfc 154 | Sgt 155 | Sr 156 | St 157 | Supt 158 | Surg 159 | 160 | #Misc symbols 161 | Mio 162 | Mrd 163 | bzw 164 | v 165 | vs 166 | usw 167 | d.h 168 | z.B 169 | u.a 170 | etc 171 | Mrd 172 | MwSt 173 | ggf 174 | d.J 175 | D.h 176 | m.E 177 | vgl 178 | I.F 179 | z.T 180 | sogen 181 | ff 182 | u.E 183 | g.U 184 | g.g.A 185 | c.-à-d 186 | Buchst 187 | u.s.w 188 | sog 189 | u.ä 190 | Std 191 | evtl 192 | Zt 193 | Chr 194 | u.U 195 | o.ä 196 | Ltd 197 | b.A 198 | z.Zt 199 | spp 200 | sen 201 | SA 202 | k.o 203 | jun 204 | i.H.v 205 | dgl 206 | dergl 207 | Co 208 | zzt 209 | usf 210 | s.p.a 211 | Dkr 212 | Corp 213 | bzgl 214 | BSE 215 | 216 | #Number 
indicators 217 | # add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it 218 | No 219 | Nos 220 | Art 221 | Nr 222 | pp 223 | ca 224 | Ca 225 | 226 | #Ordinals are done with . in German - "1." = "1st" in English 227 | 1 228 | 2 229 | 3 230 | 4 231 | 5 232 | 6 233 | 7 234 | 8 235 | 9 236 | 10 237 | 11 238 | 12 239 | 13 240 | 14 241 | 15 242 | 16 243 | 17 244 | 18 245 | 19 246 | 20 247 | 21 248 | 22 249 | 23 250 | 24 251 | 25 252 | 26 253 | 27 254 | 28 255 | 29 256 | 30 257 | 31 258 | 32 259 | 33 260 | 34 261 | 35 262 | 36 263 | 37 264 | 38 265 | 39 266 | 40 267 | 41 268 | 42 269 | 43 270 | 44 271 | 45 272 | 46 273 | 47 274 | 48 275 | 49 276 | 50 277 | 51 278 | 52 279 | 53 280 | 54 281 | 55 282 | 56 283 | 57 284 | 58 285 | 59 286 | 60 287 | 61 288 | 62 289 | 63 290 | 64 291 | 65 292 | 66 293 | 67 294 | 68 295 | 69 296 | 70 297 | 71 298 | 72 299 | 73 300 | 74 301 | 75 302 | 76 303 | 77 304 | 78 305 | 79 306 | 80 307 | 81 308 | 82 309 | 83 310 | 84 311 | 85 312 | 86 313 | 87 314 | 88 315 | 89 316 | 90 317 | 91 318 | 92 319 | 93 320 | 94 321 | 95 322 | 96 323 | 97 324 | 98 325 | 99 326 | -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.en: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | #List of titles. 
These are often followed by upper-case names, but do not indicate sentence breaks 34 | Adj 35 | Adm 36 | Adv 37 | Asst 38 | Bart 39 | Bldg 40 | Brig 41 | Bros 42 | Capt 43 | Cmdr 44 | Col 45 | Comdr 46 | Con 47 | Corp 48 | Cpl 49 | DR 50 | Dr 51 | Drs 52 | Ens 53 | Gen 54 | Gov 55 | Hon 56 | Hr 57 | Hosp 58 | Insp 59 | Lt 60 | MM 61 | MR 62 | MRS 63 | MS 64 | Maj 65 | Messrs 66 | Mlle 67 | Mme 68 | Mr 69 | Mrs 70 | Ms 71 | Msgr 72 | Op 73 | Ord 74 | Pfc 75 | Ph 76 | Prof 77 | Pvt 78 | Rep 79 | Reps 80 | Res 81 | Rev 82 | Rt 83 | Sen 84 | Sens 85 | Sfc 86 | Sgt 87 | Sr 88 | St 89 | Supt 90 | Surg 91 | 92 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 93 | v 94 | vs 95 | i.e 96 | rev 97 | e.g 98 | # rupees 99 | Rs 100 | 101 | #Numbers only. These should only induce breaks when followed by a numeric sequence 102 | # add NUMERIC_ONLY after the word for this function 103 | #This case is mostly for the english "No." which can either be a sentence of its own, or 104 | #if followed by a number, a non-breaking prefix 105 | No #NUMERIC_ONLY# 106 | Nos 107 | Art #NUMERIC_ONLY# 108 | Nr 109 | pp #NUMERIC_ONLY# 110 | 111 | #month abbreviations 112 | Jan 113 | Feb 114 | Mar 115 | Apr 116 | #May is a full word 117 | Jun 118 | Jul 119 | Aug 120 | Sep 121 | Oct 122 | Nov 123 | Dec 124 | -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.es: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | # Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm 34 | 35 | A.C 36 | Apdo 37 | Av 38 | Bco 39 | CC.AA 40 | Da 41 | Dep 42 | Dn 43 | Dr 44 | Dra 45 | EE.UU 46 | Excmo 47 | FF.CC 48 | Fil 49 | Gral 50 | J.C 51 | Let 52 | Lic 53 | N.B 54 | P.D 55 | P.V.P 56 | Prof 57 | Pts 58 | Rte 59 | S.A 60 | S.A.R 61 | S.E 62 | S.L 63 | S.R.C 64 | Sr 65 | Sra 66 | Srta 67 | Sta 68 | Sto 69 | T.V.E 70 | Tel 71 | Ud 72 | Uds 73 | V.B 74 | V.E 75 | Vd 76 | Vds 77 | a/c 78 | adj 79 | admón 80 | afmo 81 | apdo 82 | av 83 | c 84 | c.f 85 | c.g 86 | cap 87 | cm 88 | cta 89 | dcha 90 | doc 91 | ej 92 | entlo 93 | esq 94 | etc 95 | f.c 96 | gr 97 | grs 98 | izq 99 | kg 100 | km 101 | mg 102 | mm 103 | núm 104 | núm 105 | p 106 | p.a 107 | p.ej 108 | ptas 109 | pág 110 | págs 111 | pág 112 | págs 113 | q.e.g.e 114 | q.e.s.m 115 | s 116 | s.s.s 117 | vid 118 | vol 119 | -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.et: -------------------------------------------------------------------------------- 1 | nonbreaking_prefix.fi -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.fi: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT 2 | #indicate an end-of-sentence marker. Special cases are included for prefixes 3 | #that ONLY appear before 0-9 numbers. 4 | 5 | #This list is compiled from omorfi database 6 | #by Tommi A Pirinen. 
7 | 8 | 9 | #any single upper case letter followed by a period is not a sentence ender 10 | A 11 | B 12 | C 13 | D 14 | E 15 | F 16 | G 17 | H 18 | I 19 | J 20 | K 21 | L 22 | M 23 | N 24 | O 25 | P 26 | Q 27 | R 28 | S 29 | T 30 | U 31 | V 32 | W 33 | X 34 | Y 35 | Z 36 | Å 37 | Ä 38 | Ö 39 | 40 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 41 | alik 42 | alil 43 | amir 44 | apul 45 | apul.prof 46 | arkkit 47 | ass 48 | assist 49 | dipl 50 | dipl.arkkit 51 | dipl.ekon 52 | dipl.ins 53 | dipl.kielenk 54 | dipl.kirjeenv 55 | dipl.kosm 56 | dipl.urk 57 | dos 58 | erikoiseläinl 59 | erikoishammasl 60 | erikoisl 61 | erikoist 62 | ev.luutn 63 | evp 64 | fil 65 | ft 66 | hallinton 67 | hallintot 68 | hammaslääket 69 | jatk 70 | jääk 71 | kansaned 72 | kapt 73 | kapt.luutn 74 | kenr 75 | kenr.luutn 76 | kenr.maj 77 | kers 78 | kirjeenv 79 | kom 80 | kom.kapt 81 | komm 82 | konst 83 | korpr 84 | luutn 85 | maist 86 | maj 87 | Mr 88 | Mrs 89 | Ms 90 | M.Sc 91 | neuv 92 | nimim 93 | Ph.D 94 | prof 95 | puh.joht 96 | pääll 97 | res 98 | san 99 | siht 100 | suom 101 | sähköp 102 | säv 103 | toht 104 | toim 105 | toim.apul 106 | toim.joht 107 | toim.siht 108 | tuom 109 | ups 110 | vänr 111 | vääp 112 | ye.ups 113 | ylik 114 | ylil 115 | ylim 116 | ylimatr 117 | yliop 118 | yliopp 119 | ylip 120 | yliv 121 | 122 | #misc - odd period-ending items that NEVER indicate breaks (p.m. 
does NOT fall 123 | #into this category - it sometimes ends a sentence) 124 | e.g 125 | ent 126 | esim 127 | huom 128 | i.e 129 | ilm 130 | l 131 | mm 132 | myöh 133 | nk 134 | nyk 135 | par 136 | po 137 | t 138 | v 139 | -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.fr: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | # 4 | #any single upper case letter followed by a period is not a sentence ender 5 | #usually upper case letters are initials in a name 6 | #no French words end in single lower-case letters, so we throw those in too? 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | #a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | # Period-final abbreviation list for French 61 | A.C.N 62 | A.M 63 | art 64 | ann 65 | apr 66 | av 67 | auj 68 | lib 69 | B.P 70 | boul 71 | ca 72 | c.-à-d 73 | cf 74 | ch.-l 75 | chap 76 | contr 77 | C.P.I 78 | C.Q.F.D 79 | C.N 80 | C.N.S 81 | C.S 82 | dir 83 | éd 84 | e.g 85 | env 86 | al 87 | etc 88 | E.V 89 | ex 90 | fasc 91 | fém 92 | fig 93 | fr 94 | hab 95 | ibid 96 | id 97 | i.e 98 | inf 99 | LL.AA 100 | LL.AA.II 101 | LL.AA.RR 102 | LL.AA.SS 103 | L.D 104 | LL.EE 105 | LL.MM 106 | LL.MM.II.RR 107 | loc.cit 108 | masc 109 | MM 110 | ms 111 | N.B 112 | N.D.A 113 | N.D.L.R 114 | N.D.T 115 | n/réf 116 | NN.SS 117 | N.S 118 | N.D 119 | N.P.A.I 120 | p.c.c 121 | pl 122 | pp 123 | p.ex 124 | p.j 125 | P.S 126 | R.A.S 127 | R.-V 128 | R.P 129 | R.I.P 
130 | SS 131 | S.S 132 | S.A 133 | S.A.I 134 | S.A.R 135 | S.A.S 136 | S.E 137 | sec 138 | sect 139 | sing 140 | S.M 141 | S.M.I.R 142 | sq 143 | sqq 144 | suiv 145 | sup 146 | suppl 147 | tél 148 | T.S.V.P 149 | vb 150 | vol 151 | vs 152 | X.O 153 | Z.I 154 | -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.ga: -------------------------------------------------------------------------------- 1 | 2 | A 3 | B 4 | C 5 | D 6 | E 7 | F 8 | G 9 | H 10 | I 11 | J 12 | K 13 | L 14 | M 15 | N 16 | O 17 | P 18 | Q 19 | R 20 | S 21 | T 22 | U 23 | V 24 | W 25 | X 26 | Y 27 | Z 28 | Á 29 | É 30 | Í 31 | Ó 32 | Ú 33 | 34 | Uacht 35 | Dr 36 | B.Arch 37 | 38 | m.sh 39 | .i 40 | Co 41 | Cf 42 | cf 43 | i.e 44 | r 45 | Chr 46 | lch #NUMERIC_ONLY# 47 | lgh #NUMERIC_ONLY# 48 | uimh #NUMERIC_ONLY# 49 | -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.gu: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 
2 | 3 | #common exceptions 4 | # Rs 5 | રૂ 6 | # Dr 7 | ડો 8 | # Dr 9 | ડૉ 10 | # Mr 11 | શ્રી 12 | 13 | #others 14 | 15 | 16 | #phonetics 17 | # A 18 | એ 19 | # B 20 | બી 21 | # C 22 | સી 23 | # D 24 | ડી 25 | # E 26 | ઇ 27 | # F 28 | એફ 29 | # G 30 | જી 31 | # H 32 | એચ 33 | # I 34 | આઈ 35 | # J 36 | જે 37 | # K 38 | કે 39 | # L 40 | એલ 41 | # M 42 | એમ 43 | # N 44 | એન 45 | # O 46 | ઓ 47 | # P 48 | પી 49 | # Q 50 | ક્યૂ 51 | # R 52 | આર 53 | # S 54 | એસ 55 | # T 56 | ટી 57 | # U 58 | યુ 59 | # V 60 | વી 61 | # W 62 | ડબલ્યુ 63 | # X 64 | એક્સ 65 | # Y 66 | વાય 67 | # Z 68 | ઝેડ 69 | 70 | #consonants 71 | ક 72 | ખ 73 | ગ 74 | ઘ 75 | ઙ 76 | ચ 77 | છ 78 | જ 79 | ઝ 80 | ઞ 81 | ટ 82 | ઠ 83 | ડ 84 | ઢ 85 | ણ 86 | ત 87 | થ 88 | દ 89 | ધ 90 | ન 91 | પ 92 | ફ 93 | બ 94 | ભ 95 | મ 96 | ય 97 | ર 98 | લ 99 | ળ 100 | વ 101 | શ 102 | ષ 103 | સ 104 | હ 105 | 106 | -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.hi: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 
2 | 3 | #common exceptions 4 | # Rs 5 | रु 6 | # Dr 7 | डॉ 8 | # Dr 9 | डा 10 | # Mr 11 | श्री 12 | 13 | #others 14 | टीवी 15 | 16 | #phonetics 17 | # A 18 | ए 19 | ऐ 20 | # B 21 | बी 22 | # C 23 | सी 24 | # D 25 | डी 26 | # E 27 | ई 28 | # F 29 | ऐफ 30 | एफ 31 | # G 32 | जी 33 | # H 34 | ऐच 35 | एच 36 | # I 37 | आइ 38 | # J 39 | जे 40 | # K 41 | के 42 | # L 43 | ऐल 44 | एल 45 | # M 46 | ऐम 47 | एम 48 | # N 49 | ऐन 50 | एन 51 | # O 52 | ओ 53 | # P 54 | पी 55 | # Q 56 | क्यू 57 | # R 58 | आर 59 | # S 60 | ऐस 61 | एस 62 | # T 63 | टी 64 | # U 65 | यू 66 | # V 67 | वी 68 | # W 69 | डब्ल्यू 70 | # X 71 | ऐक्स 72 | एक्स 73 | # Y 74 | वाय 75 | वाई 76 | # Z 77 | ज़ैड 78 | 79 | #consonants 80 | क 81 | ख 82 | ग 83 | घ 84 | ङ 85 | च 86 | छ 87 | ज 88 | झ 89 | ञ 90 | ट 91 | ठ 92 | ड 93 | ढ 94 | ण 95 | त 96 | थ 97 | द 98 | ध 99 | न 100 | प 101 | फ 102 | ब 103 | भ 104 | म 105 | य 106 | र 107 | ल 108 | व 109 | श 110 | ष 111 | स 112 | ह 113 | 114 | -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.hu: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | Á 33 | É 34 | Í 35 | Ó 36 | Ö 37 | Ő 38 | Ú 39 | Ü 40 | Ű 41 | 42 | #List of titles. 
These are often followed by upper-case names, but do not indicate sentence breaks 43 | Dr 44 | dr 45 | kb 46 | Kb 47 | vö 48 | Vö 49 | pl 50 | Pl 51 | ca 52 | Ca 53 | min 54 | Min 55 | max 56 | Max 57 | ún 58 | Ún 59 | prof 60 | Prof 61 | de 62 | De 63 | du 64 | Du 65 | Szt 66 | St 67 | 68 | #Numbers only. These should only induce breaks when followed by a numeric sequence 69 | # add NUMERIC_ONLY after the word for this function 70 | #This case is mostly for the english "No." which can either be a sentence of its own, or 71 | #if followed by a number, a non-breaking prefix 72 | 73 | # Month name abbreviations 74 | jan #NUMERIC_ONLY# 75 | Jan #NUMERIC_ONLY# 76 | Feb #NUMERIC_ONLY# 77 | feb #NUMERIC_ONLY# 78 | márc #NUMERIC_ONLY# 79 | Márc #NUMERIC_ONLY# 80 | ápr #NUMERIC_ONLY# 81 | Ápr #NUMERIC_ONLY# 82 | máj #NUMERIC_ONLY# 83 | Máj #NUMERIC_ONLY# 84 | jún #NUMERIC_ONLY# 85 | Jún #NUMERIC_ONLY# 86 | Júl #NUMERIC_ONLY# 87 | júl #NUMERIC_ONLY# 88 | aug #NUMERIC_ONLY# 89 | Aug #NUMERIC_ONLY# 90 | Szept #NUMERIC_ONLY# 91 | szept #NUMERIC_ONLY# 92 | okt #NUMERIC_ONLY# 93 | Okt #NUMERIC_ONLY# 94 | nov #NUMERIC_ONLY# 95 | Nov #NUMERIC_ONLY# 96 | dec #NUMERIC_ONLY# 97 | Dec #NUMERIC_ONLY# 98 | 99 | # Other abbreviations 100 | tel #NUMERIC_ONLY# 101 | Tel #NUMERIC_ONLY# 102 | Fax #NUMERIC_ONLY# 103 | fax #NUMERIC_ONLY# 104 | -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.is: -------------------------------------------------------------------------------- 1 | no #NUMERIC_ONLY# 2 | No #NUMERIC_ONLY# 3 | nr #NUMERIC_ONLY# 4 | Nr #NUMERIC_ONLY# 5 | nR #NUMERIC_ONLY# 6 | NR #NUMERIC_ONLY# 7 | a 8 | b 9 | c 10 | d 11 | e 12 | f 13 | g 14 | h 15 | i 16 | j 17 | k 18 | l 19 | m 20 | n 21 | o 22 | p 23 | q 24 | r 25 | s 26 | t 27 | u 28 | v 29 | w 30 | x 31 | y 32 | z 33 | ^ 34 | í 35 | á 36 | ó 37 | æ 38 | A 39 | B 40 | C 41 | D 42 | E 43 | F 44 | G 45 | H 46 | I 47 | J 48 | K 49 | L 50 | M 51 | N 
52 | O 53 | P 54 | Q 55 | R 56 | S 57 | T 58 | U 59 | V 60 | W 61 | X 62 | Y 63 | Z 64 | ab.fn 65 | a.fn 66 | afs 67 | al 68 | alm 69 | alg 70 | andh 71 | ath 72 | aths 73 | atr 74 | ao 75 | au 76 | aukaf 77 | áfn 78 | áhrl.s 79 | áhrs 80 | ákv.gr 81 | ákv 82 | bh 83 | bls 84 | dr 85 | e.Kr 86 | et 87 | ef 88 | efn 89 | ennfr 90 | eink 91 | end 92 | e.st 93 | erl 94 | fél 95 | fskj 96 | fh 97 | f.hl 98 | físl 99 | fl 100 | fn 101 | fo 102 | forl 103 | frb 104 | frl 105 | frh 106 | frt 107 | fsl 108 | fsh 109 | fs 110 | fsk 111 | fst 112 | f.Kr 113 | ft 114 | fv 115 | fyrrn 116 | fyrrv 117 | germ 118 | gm 119 | gr 120 | hdl 121 | hdr 122 | hf 123 | hl 124 | hlsk 125 | hljsk 126 | hljv 127 | hljóðv 128 | hr 129 | hv 130 | hvk 131 | holl 132 | Hos 133 | höf 134 | hk 135 | hrl 136 | ísl 137 | kaf 138 | kap 139 | Khöfn 140 | kk 141 | kg 142 | kk 143 | km 144 | kl 145 | klst 146 | kr 147 | kt 148 | kgúrsk 149 | kvk 150 | leturbr 151 | lh 152 | lh.nt 153 | lh.þt 154 | lo 155 | ltr 156 | mlja 157 | mljó 158 | millj 159 | mm 160 | mms 161 | m.fl 162 | miðm 163 | mgr 164 | mst 165 | mín 166 | nf 167 | nh 168 | nhm 169 | nl 170 | nk 171 | nmgr 172 | no 173 | núv 174 | nt 175 | o.áfr 176 | o.m.fl 177 | ohf 178 | o.fl 179 | o.s.frv 180 | ófn 181 | ób 182 | óákv.gr 183 | óákv 184 | pfn 185 | PR 186 | pr 187 | Ritstj 188 | Rvík 189 | Rvk 190 | samb 191 | samhlj 192 | samn 193 | samn 194 | sbr 195 | sek 196 | sérn 197 | sf 198 | sfn 199 | sh 200 | sfn 201 | sh 202 | s.hl 203 | sk 204 | skv 205 | sl 206 | sn 207 | so 208 | ss.us 209 | s.st 210 | samþ 211 | sbr 212 | shlj 213 | sign 214 | skál 215 | st 216 | st.s 217 | stk 218 | sþ 219 | teg 220 | tbl 221 | tfn 222 | tl 223 | tvíhlj 224 | tvt 225 | till 226 | to 227 | umr 228 | uh 229 | us 230 | uppl 231 | útg 232 | vb 233 | Vf 234 | vh 235 | vkf 236 | Vl 237 | vl 238 | vlf 239 | vmf 240 | 8vo 241 | vsk 242 | vth 243 | þt 244 | þf 245 | þjs 246 | þgf 247 | þlt 248 | þolm 249 | þm 250 | þml 251 | þýð 252 | 
-------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.it: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 34 | Adj 35 | Adm 36 | Adv 37 | Amn 38 | Arch 39 | Asst 40 | Avv 41 | Bart 42 | Bcc 43 | Bldg 44 | Brig 45 | Bros 46 | C.A.P 47 | C.P 48 | Capt 49 | Cc 50 | Cmdr 51 | Co 52 | Col 53 | Comdr 54 | Con 55 | Corp 56 | Cpl 57 | DR 58 | Dott 59 | Dr 60 | Drs 61 | Egr 62 | Ens 63 | Gen 64 | Geom 65 | Gov 66 | Hon 67 | Hosp 68 | Hr 69 | Id 70 | Ing 71 | Insp 72 | Lt 73 | MM 74 | MR 75 | MRS 76 | MS 77 | Maj 78 | Messrs 79 | Mlle 80 | Mme 81 | Mo 82 | Mons 83 | Mr 84 | Mrs 85 | Ms 86 | Msgr 87 | N.B 88 | Op 89 | Ord 90 | P.S 91 | P.T 92 | Pfc 93 | Ph 94 | Prof 95 | Pvt 96 | RP 97 | RSVP 98 | Rag 99 | Rep 100 | Reps 101 | Res 102 | Rev 103 | Rif 104 | Rt 105 | S.A 106 | S.B.F 107 | S.P.M 108 | S.p.A 109 | S.r.l 110 | Sen 111 | Sens 112 | Sfc 113 | Sgt 114 | Sig 115 | Sigg 116 | Soc 117 | Spett 118 | Sr 119 | St 120 | Supt 121 | Surg 122 | V.P 123 | 124 | # other 125 | a.c 126 | acc 127 | all 128 | banc 129 | c.a 130 | c.c.p 131 | c.m 132 | c.p 133 | c.s 134 | c.v 135 | corr 136 | dott 137 | e.p.c 138 | ecc 139 | es 140 | fatt 141 | gg 142 | int 143 | lett 144 | ogg 145 | on 146 | p.c 147 | p.c.c 148 
| p.es 149 | p.f 150 | p.r 151 | p.v 152 | post 153 | pp 154 | racc 155 | ric 156 | s.n.c 157 | seg 158 | sgg 159 | ss 160 | tel 161 | u.s 162 | v.r 163 | v.s 164 | 165 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 166 | v 167 | vs 168 | i.e 169 | rev 170 | e.g 171 | 172 | #Numbers only. These should only induce breaks when followed by a numeric sequence 173 | # add NUMERIC_ONLY after the word for this function 174 | #This case is mostly for the english "No." which can either be a sentence of its own, or 175 | #if followed by a number, a non-breaking prefix 176 | No #NUMERIC_ONLY# 177 | Nos 178 | Art #NUMERIC_ONLY# 179 | Nr 180 | pp #NUMERIC_ONLY# 181 | -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.kn: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 
2 | 3 | #common exceptions 4 | # Rs 5 | ರೂ 6 | # Dr 7 | ಡಾ 8 | # Mr 9 | ಶ್ರೀ 10 | 11 | #others 12 | 13 | 14 | #phonetics 15 | # A 16 | ಎ 17 | # B 18 | ಬಿ 19 | # C 20 | ಸಿ 21 | # D 22 | ಡಿ 23 | # E 24 | ಇ 25 | # F 26 | ಎಫ್ 27 | # G 28 | ಜಿ 29 | # H 30 | ಹೆಚ್ 31 | ಎಚ್‌ 32 | # I 33 | ಐ 34 | # J 35 | ಜೆ 36 | # K 37 | ಕೆ 38 | # L 39 | ಎಲ್ 40 | # M 41 | ಎಂ 42 | # N 43 | ಎನ್ 44 | # O 45 | ಒ 46 | # P 47 | ಪಿ 48 | # Q 49 | ಕ್ಯೂ 50 | # R 51 | ಆರ್ 52 | # S 53 | ಎಸ್ 54 | # T 55 | ಟಿ 56 | # U 57 | ಯು 58 | # V 59 | ವಿ 60 | # W 61 | ಡಬ್ಲ್ಯೂ 62 | # X 63 | ಎಕ್ಸ್ 64 | # Y 65 | ವೈ 66 | # Z 67 | ಜೆಡ್ 68 | 69 | #consonants 70 | 71 | -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.lv: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | Ā 8 | B 9 | C 10 | Č 11 | D 12 | E 13 | Ē 14 | F 15 | G 16 | Ģ 17 | H 18 | I 19 | Ī 20 | J 21 | K 22 | Ķ 23 | L 24 | Ļ 25 | M 26 | N 27 | Ņ 28 | O 29 | P 30 | Q 31 | R 32 | S 33 | Š 34 | T 35 | U 36 | Ū 37 | V 38 | W 39 | X 40 | Y 41 | Z 42 | Ž 43 | 44 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 45 | dr 46 | Dr 47 | med 48 | prof 49 | Prof 50 | inž 51 | Inž 52 | ist.loc 53 | Ist.loc 54 | kor.loc 55 | Kor.loc 56 | v.i 57 | vietn 58 | Vietn 59 | 60 | #misc - odd period-ending items that NEVER indicate breaks (p.m. 
does NOT fall into this category - it sometimes ends a sentence) 61 | a.l 62 | t.p 63 | pārb 64 | Pārb 65 | vec 66 | Vec 67 | inv 68 | Inv 69 | sk 70 | Sk 71 | spec 72 | Spec 73 | vienk 74 | Vienk 75 | virz 76 | Virz 77 | māksl 78 | Māksl 79 | mūz 80 | Mūz 81 | akad 82 | Akad 83 | soc 84 | Soc 85 | galv 86 | Galv 87 | vad 88 | Vad 89 | sertif 90 | Sertif 91 | folkl 92 | Folkl 93 | hum 94 | Hum 95 | 96 | #Numbers only. These should only induce breaks when followed by a numeric sequence 97 | # add NUMERIC_ONLY after the word for this function 98 | #This case is mostly for the english "No." which can either be a sentence of its own, or 99 | #if followed by a number, a non-breaking prefix 100 | Nr #NUMERIC_ONLY# 101 | -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.ml: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | 3 | #common exceptions 4 | # Dr 5 | ഡോ 6 | # Mr 7 | ശ്രീ 8 | 9 | #others 10 | 11 | 12 | #phonetics 13 | # A 14 | എ 15 | # B 16 | ബി 17 | # C 18 | സി 19 | # D 20 | ഡി 21 | # E 22 | ഇ 23 | # F 24 | എഫ് 25 | # G 26 | ജി 27 | # H 28 | എച്ച് 29 | # I 30 | ഐ 31 | # J 32 | ജെ 33 | # K 34 | കെ 35 | # L 36 | എൽ 37 | # M 38 | എം 39 | # N 40 | എൻ 41 | # O 42 | ഒ 43 | # P 44 | പി 45 | # Q 46 | ക്യൂ 47 | # R 48 | ആർ 49 | # S 50 | എസ് 51 | # T 52 | ടി 53 | # U 54 | യു 55 | # V 56 | വി 57 | # W 58 | ഡബ്ല്യു 59 | # X 60 | എക്സ് 61 | # Y 62 | വൈ 63 | # Z 64 | സെഡ് 65 | 66 | #consonants 67 | 68 | -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.mni: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 
2 | 3 | #common exceptions 4 | # Dr 5 | দা 6 | 7 | #others 8 | 9 | 10 | #phonetics 11 | # A 12 | এ 13 | # B 14 | বি 15 | # C 16 | সি 17 | # D 18 | ডি 19 | # E 20 | ই 21 | # F 22 | এফ 23 | # G 24 | জি 25 | # H 26 | এইচ 27 | # I 28 | আম 29 | # J 30 | জে 31 | # K 32 | কে 33 | # L 34 | এল 35 | # M 36 | এম 37 | # N 38 | এন 39 | # O 40 | হে 41 | # P 42 | পি 43 | # Q 44 | কিউ 45 | # R 46 | আর 47 | # S 48 | এস 49 | # T 50 | টি 51 | # U 52 | ইউ 53 | # V 54 | ভি 55 | # W 56 | ডব্লু 57 | # X 58 | এক্স 59 | # Y 60 | ওয়াই 61 | # Z 62 | জেড 63 | 64 | #consonants 65 | 66 | -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.mr: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | 3 | #common exceptions 4 | # Rs 5 | रु 6 | # Dr 7 | डॉ 8 | # Dr 9 | डा 10 | # Mr 11 | श्री 12 | 13 | #others 14 | 15 | 16 | #phonetics 17 | # A 18 | ए 19 | ऐ 20 | # B 21 | बी 22 | # C 23 | सी 24 | # D 25 | डी 26 | # E 27 | ई 28 | # F 29 | ऐफ 30 | एफ 31 | # G 32 | जी 33 | # H 34 | ऐच 35 | एच 36 | # I 37 | आइ 38 | # J 39 | जे 40 | # K 41 | के 42 | # L 43 | ऐल 44 | एल 45 | # M 46 | ऐम 47 | एम 48 | # N 49 | ऐन 50 | एन 51 | # O 52 | ओ 53 | # P 54 | पी 55 | # Q 56 | क्यू 57 | # R 58 | आर 59 | # S 60 | ऐस 61 | एस 62 | # T 63 | टी 64 | # U 65 | यू 66 | # V 67 | वी 68 | # W 69 | डब्ल्यू 70 | # X 71 | ऐक्स 72 | एक्स 73 | # Y 74 | वाय 75 | वाई 76 | # Z 77 | ज़ैड 78 | 79 | #consonants 80 | क 81 | ख 82 | ग 83 | घ 84 | ङ 85 | च 86 | छ 87 | ज 88 | झ 89 | ञ 90 | ट 91 | ठ 92 | ड 93 | ढ 94 | ण 95 | त 96 | थ 97 | द 98 | ध 99 | न 100 | प 101 | फ 102 | ब 103 | भ 104 | म 105 | य 106 | र 107 | ल 108 | व 109 | श 110 | ष 111 | स 112 | ह 113 | 114 | -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.nl: 
-------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | #Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen 4 | # http://nl.wikipedia.org/wiki/Aanspreekvorm 5 | # http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs 6 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 7 | #usually upper case letters are initials in a name 8 | A 9 | B 10 | C 11 | D 12 | E 13 | F 14 | G 15 | H 16 | I 17 | J 18 | K 19 | L 20 | M 21 | N 22 | O 23 | P 24 | Q 25 | R 26 | S 27 | T 28 | U 29 | V 30 | W 31 | X 32 | Y 33 | Z 34 | 35 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 36 | bacc 37 | bc 38 | bgen 39 | c.i 40 | dhr 41 | dr 42 | dr.h.c 43 | drs 44 | drs 45 | ds 46 | eint 47 | fa 48 | Fa 49 | fam 50 | gen 51 | genm 52 | ing 53 | ir 54 | jhr 55 | jkvr 56 | jr 57 | kand 58 | kol 59 | lgen 60 | lkol 61 | Lt 62 | maj 63 | Mej 64 | mevr 65 | Mme 66 | mr 67 | mr 68 | Mw 69 | o.b.s 70 | plv 71 | prof 72 | ritm 73 | tint 74 | Vz 75 | Z.D 76 | Z.D.H 77 | Z.E 78 | Z.Em 79 | Z.H 80 | Z.K.H 81 | Z.K.M 82 | Z.M 83 | z.v 84 | 85 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 86 | #we seem to have a lot of these in dutch i.e.: i.p.v - in plaats van (in stead of) never ends a sentence 87 | a.g.v 88 | bijv 89 | bijz 90 | bv 91 | d.w.z 92 | e.c 93 | e.g 94 | e.k 95 | ev 96 | i.p.v 97 | i.s.m 98 | i.t.t 99 | i.v.m 100 | m.a.w 101 | m.b.t 102 | m.b.v 103 | m.h.o 104 | m.i 105 | m.i.v 106 | v.w.t 107 | 108 | #Numbers only. 
These should only induce breaks when followed by a numeric sequence 109 | # add NUMERIC_ONLY after the word for this function 110 | #This case is mostly for the english "No." which can either be a sentence of its own, or 111 | #if followed by a number, a non-breaking prefix 112 | Nr #NUMERIC_ONLY# 113 | Nrs 114 | nrs 115 | nr #NUMERIC_ONLY# 116 | -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.or: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | 3 | #common exceptions 4 | # Mr 5 | ରୀ 6 | 7 | #others 8 | 9 | 10 | #phonetics 11 | # A 12 | 13 | # B 14 | 15 | # C 16 | 17 | # D 18 | 19 | # E 20 | 21 | # F 22 | 23 | # G 24 | 25 | # H 26 | 27 | # I 28 | 29 | # J 30 | 31 | # K 32 | 33 | # L 34 | 35 | # M 36 | 37 | # N 38 | 39 | # O 40 | 41 | # P 42 | 43 | # Q 44 | 45 | # R 46 | 47 | # S 48 | 49 | # T 50 | 51 | # U 52 | 53 | # V 54 | 55 | # W 56 | 57 | # X 58 | 59 | # Y 60 | 61 | # Z 62 | 63 | 64 | #consonants 65 | କ 66 | ଖ 67 | ଗ 68 | ଘ 69 | ଙ 70 | ଚ 71 | ଛ 72 | ଜ 73 | ଝ 74 | ଞ 75 | ଟ 76 | ଠ 77 | ଡ 78 | ଢ 79 | ଣ 80 | ତ 81 | ଥ 82 | ଦ 83 | ଧ 84 | ନ 85 | ପ 86 | ଫ 87 | ବ 88 | ଵ 89 | ଭ 90 | ମ 91 | ଯ 92 | ୟ 93 | ର 94 | ଲ 95 | ଳ 96 | ୱ 97 | ଶ 98 | ଷ 99 | ସ 100 | ହ 101 | 102 | -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.pa: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 
2 | 3 | #common exceptions 4 | # Dr 5 | ਡਾ 6 | # Dr 7 | ਪ੍ਰੋ 8 | # Mr 9 | ਸ੍ਰੀ 10 | 11 | #others 12 | 13 | 14 | #phonetics 15 | # A 16 | ਏ 17 | # B 18 | ਬੀ 19 | # C 20 | ਸੀ 21 | # D 22 | ਡੀ 23 | # E 24 | ਈ 25 | # F 26 | ਐੱਫ 27 | # G 28 | ਜੀ 29 | # H 30 | ਐਚ 31 | # I 32 | ਆਈ 33 | # J 34 | ਜੇ 35 | # K 36 | ਕੇ 37 | # L 38 | ਐਲ 39 | # M 40 | ਐੱਮ 41 | # N 42 | ਐੱਨ 43 | # O 44 | ਓ 45 | # P 46 | ਪੀ 47 | # Q 48 | ਕੀਓ 49 | # R 50 | ਆਰ 51 | # S 52 | ਐੱਸ 53 | ਸ 54 | # T 55 | ਟੀ 56 | # U 57 | ਯੂ 58 | # V 59 | ਵੀ 60 | # W 61 | ਡਬਲਿਊ 62 | # X 63 | ਐਕ੍ਸ 64 | # Y 65 | ਵਾਈ 66 | # Z 67 | ਜ਼ੈਡ 68 | 69 | #consonants 70 | ਕ 71 | ਖ 72 | ਗ 73 | ਘ 74 | ਙ 75 | ਚ 76 | ਛ 77 | ਜ 78 | ਝ 79 | ਞ 80 | ਟ 81 | ਠ 82 | ਡ 83 | ਢ 84 | ਣ 85 | ਤ 86 | ਥ 87 | ਦ 88 | ਧ 89 | ਨ 90 | ਪ 91 | ਫ 92 | ਬ 93 | ਭ 94 | ਮ 95 | ਯ 96 | ਰ 97 | ਲ 98 | ਵ 99 | ੜ 100 | ਸ 101 | ਹ 102 | 103 | -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.pl: -------------------------------------------------------------------------------- 1 | adw 2 | afr 3 | akad 4 | al 5 | Al 6 | am 7 | amer 8 | arch 9 | art 10 | Art 11 | artyst 12 | astr 13 | austr 14 | bałt 15 | bdb 16 | bł 17 | bm 18 | br 19 | bryg 20 | bryt 21 | centr 22 | ces 23 | chem 24 | chiń 25 | chir 26 | c.k 27 | c.o 28 | cyg 29 | cyw 30 | cyt 31 | czes 32 | czw 33 | cd 34 | Cd 35 | czyt 36 | ćw 37 | ćwicz 38 | daw 39 | dcn 40 | dekl 41 | demokr 42 | det 43 | diec 44 | dł 45 | dn 46 | dot 47 | dol 48 | dop 49 | dost 50 | dosł 51 | h.c 52 | ds 53 | dst 54 | duszp 55 | dypl 56 | egz 57 | ekol 58 | ekon 59 | elektr 60 | em 61 | ew 62 | fab 63 | farm 64 | fot 65 | fr 66 | gat 67 | gastr 68 | geogr 69 | geol 70 | gimn 71 | głęb 72 | gm 73 | godz 74 | górn 75 | gosp 76 | gr 77 | gram 78 | hist 79 | hiszp 80 | hr 81 | Hr 82 | hot 83 | id 84 | in 85 | im 86 | iron 87 | jn 88 | kard 89 | kat 90 | katol 91 | k.k 92 | kk 93 | kol 94 | kl 95 | k.p.a 96 | kpc 97 | k.p.c 98 | kpt 99 | kr 100 | k.r 101 | krak 
102 | k.r.o 103 | kryt 104 | kult 105 | laic 106 | łac 107 | niem 108 | woj 109 | nb 110 | np 111 | Nb 112 | Np 113 | pol 114 | pow 115 | m.in 116 | pt 117 | ps 118 | Pt 119 | Ps 120 | cdn 121 | jw 122 | ryc 123 | rys 124 | Ryc 125 | Rys 126 | tj 127 | tzw 128 | Tzw 129 | tzn 130 | zob 131 | ang 132 | ub 133 | ul 134 | pw 135 | pn 136 | pl 137 | al 138 | k 139 | n 140 | nr #NUMERIC_ONLY# 141 | Nr #NUMERIC_ONLY# 142 | ww 143 | wł 144 | ur 145 | zm 146 | żyd 147 | żarg 148 | żyw 149 | wył 150 | bp 151 | bp 152 | wyst 153 | tow 154 | Tow 155 | o 156 | sp 157 | Sp 158 | st 159 | spółdz 160 | Spółdz 161 | społ 162 | spółgł 163 | stoł 164 | stow 165 | Stoł 166 | Stow 167 | zn 168 | zew 169 | zewn 170 | zdr 171 | zazw 172 | zast 173 | zaw 174 | zał 175 | zal 176 | zam 177 | zak 178 | zakł 179 | zagr 180 | zach 181 | adw 182 | Adw 183 | lek 184 | Lek 185 | med 186 | mec 187 | Mec 188 | doc 189 | Doc 190 | dyw 191 | dyr 192 | Dyw 193 | Dyr 194 | inż 195 | Inż 196 | mgr 197 | Mgr 198 | dh 199 | dr 200 | Dh 201 | Dr 202 | p 203 | P 204 | red 205 | Red 206 | prof 207 | prok 208 | Prof 209 | Prok 210 | hab 211 | płk 212 | Płk 213 | nadkom 214 | Nadkom 215 | podkom 216 | Podkom 217 | ks 218 | Ks 219 | gen 220 | Gen 221 | por 222 | Por 223 | reż 224 | Reż 225 | przyp 226 | Przyp 227 | śp 228 | św 229 | śW 230 | Śp 231 | Św 232 | ŚW 233 | szer 234 | Szer 235 | pkt #NUMERIC_ONLY# 236 | str #NUMERIC_ONLY# 237 | tab #NUMERIC_ONLY# 238 | Tab #NUMERIC_ONLY# 239 | tel 240 | ust #NUMERIC_ONLY# 241 | par #NUMERIC_ONLY# 242 | poz 243 | pok 244 | oo 245 | oO 246 | Oo 247 | OO 248 | r #NUMERIC_ONLY# 249 | l #NUMERIC_ONLY# 250 | s #NUMERIC_ONLY# 251 | najśw 252 | Najśw 253 | A 254 | B 255 | C 256 | D 257 | E 258 | F 259 | G 260 | H 261 | I 262 | J 263 | K 264 | L 265 | M 266 | N 267 | O 268 | P 269 | Q 270 | R 271 | S 272 | T 273 | U 274 | V 275 | W 276 | X 277 | Y 278 | Z 279 | Ś 280 | Ć 281 | Ż 282 | Ź 283 | Dz 284 | 
-------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.pt: -------------------------------------------------------------------------------- 1 | #File adapted for PT by H. Leal Fontes from the EN & DE versions published with moses-2009-04-13. Last update: 10.11.2009. 2 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 3 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 4 | 5 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 6 | #usually upper case letters are initials in a name 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | 61 | #Roman Numerals. A dot after one of these is not a sentence break in Portuguese. 62 | I 63 | II 64 | III 65 | IV 66 | V 67 | VI 68 | VII 69 | VIII 70 | IX 71 | X 72 | XI 73 | XII 74 | XIII 75 | XIV 76 | XV 77 | XVI 78 | XVII 79 | XVIII 80 | XIX 81 | XX 82 | i 83 | ii 84 | iii 85 | iv 86 | v 87 | vi 88 | vii 89 | viii 90 | ix 91 | x 92 | xi 93 | xii 94 | xiii 95 | xiv 96 | xv 97 | xvi 98 | xvii 99 | xviii 100 | xix 101 | xx 102 | 103 | #List of titles. 
These are often followed by upper-case names, but do not indicate sentence breaks 104 | Adj 105 | Adm 106 | Adv 107 | Art 108 | Ca 109 | Capt 110 | Cmdr 111 | Col 112 | Comdr 113 | Con 114 | Corp 115 | Cpl 116 | DR 117 | DRA 118 | Dr 119 | Dra 120 | Dras 121 | Drs 122 | Eng 123 | Enga 124 | Engas 125 | Engos 126 | Ex 127 | Exo 128 | Exmo 129 | Fig 130 | Gen 131 | Hosp 132 | Insp 133 | Lda 134 | MM 135 | MR 136 | MRS 137 | MS 138 | Maj 139 | Mrs 140 | Ms 141 | Msgr 142 | Op 143 | Ord 144 | Pfc 145 | Ph 146 | Prof 147 | Pvt 148 | Rep 149 | Reps 150 | Res 151 | Rev 152 | Rt 153 | Sen 154 | Sens 155 | Sfc 156 | Sgt 157 | Sr 158 | Sra 159 | Sras 160 | Srs 161 | Sto 162 | Supt 163 | Surg 164 | adj 165 | adm 166 | adv 167 | art 168 | cit 169 | col 170 | con 171 | corp 172 | cpl 173 | dr 174 | dra 175 | dras 176 | drs 177 | eng 178 | enga 179 | engas 180 | engos 181 | ex 182 | exo 183 | exmo 184 | fig 185 | op 186 | prof 187 | sr 188 | sra 189 | sras 190 | srs 191 | sto 192 | 193 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 194 | v 195 | vs 196 | i.e 197 | rev 198 | e.g 199 | 200 | #Numbers only. These should only induce breaks when followed by a numeric sequence 201 | # add NUMERIC_ONLY after the word for this function 202 | #This case is mostly for the english "No." 
which can either be a sentence of its own, or 203 | #if followed by a number, a non-breaking prefix 204 | No #NUMERIC_ONLY# 205 | Nos 206 | Art #NUMERIC_ONLY# 207 | Nr 208 | p #NUMERIC_ONLY# 209 | pp #NUMERIC_ONLY# 210 | 211 | -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.ro: -------------------------------------------------------------------------------- 1 | A 2 | B 3 | C 4 | D 5 | E 6 | F 7 | G 8 | H 9 | I 10 | J 11 | K 12 | L 13 | M 14 | N 15 | O 16 | P 17 | Q 18 | R 19 | S 20 | T 21 | U 22 | V 23 | W 24 | X 25 | Y 26 | Z 27 | dpdv 28 | etc 29 | șamd 30 | M.Ap.N 31 | dl 32 | Dl 33 | d-na 34 | D-na 35 | dvs 36 | Dvs 37 | pt 38 | Pt 39 | -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.ru: -------------------------------------------------------------------------------- 1 | # added Cyrillic uppercase letters [А-Я] 2 | # removed 000D carriage return (this is not removed by chomp in tokenizer.perl, and prevents recognition of the prefixes) 3 | # edited by Kate Young (nspaceanalysis@earthlink.net) 21 May 2013 4 | А 5 | Б 6 | В 7 | Г 8 | Д 9 | Е 10 | Ж 11 | З 12 | И 13 | Й 14 | К 15 | Л 16 | М 17 | Н 18 | О 19 | П 20 | Р 21 | С 22 | Т 23 | У 24 | Ф 25 | Х 26 | Ц 27 | Ч 28 | Ш 29 | Щ 30 | Ъ 31 | Ы 32 | Ь 33 | Э 34 | Ю 35 | Я 36 | A 37 | B 38 | C 39 | D 40 | E 41 | F 42 | G 43 | H 44 | I 45 | J 46 | K 47 | L 48 | M 49 | N 50 | O 51 | P 52 | Q 53 | R 54 | S 55 | T 56 | U 57 | V 58 | W 59 | X 60 | Y 61 | Z 62 | 0гг 63 | 1гг 64 | 2гг 65 | 3гг 66 | 4гг 67 | 5гг 68 | 6гг 69 | 7гг 70 | 8гг 71 | 9гг 72 | 0г 73 | 1г 74 | 2г 75 | 3г 76 | 4г 77 | 5г 78 | 6г 79 | 7г 80 | 8г 81 | 9г 82 | Xвв 83 | Vвв 84 | Iвв 85 | Lвв 86 | Mвв 87 | Cвв 88 | Xв 89 | Vв 90 | Iв 91 | Lв 92 | Mв 93 | Cв 94 | 0м 95 | 1м 96 | 2м 97 | 3м 98 | 4м 99 | 5м 100 | 6м 101 | 7м 102 | 8м 103 | 9м 104 | 0мм 105 | 1мм 106 | 2мм 107 | 3мм 108 | 
4мм 109 | 5мм 110 | 6мм 111 | 7мм 112 | 8мм 113 | 9мм 114 | 0см 115 | 1см 116 | 2см 117 | 3см 118 | 4см 119 | 5см 120 | 6см 121 | 7см 122 | 8см 123 | 9см 124 | 0дм 125 | 1дм 126 | 2дм 127 | 3дм 128 | 4дм 129 | 5дм 130 | 6дм 131 | 7дм 132 | 8дм 133 | 9дм 134 | 0л 135 | 1л 136 | 2л 137 | 3л 138 | 4л 139 | 5л 140 | 6л 141 | 7л 142 | 8л 143 | 9л 144 | 0км 145 | 1км 146 | 2км 147 | 3км 148 | 4км 149 | 5км 150 | 6км 151 | 7км 152 | 8км 153 | 9км 154 | 0га 155 | 1га 156 | 2га 157 | 3га 158 | 4га 159 | 5га 160 | 6га 161 | 7га 162 | 8га 163 | 9га 164 | 0кг 165 | 1кг 166 | 2кг 167 | 3кг 168 | 4кг 169 | 5кг 170 | 6кг 171 | 7кг 172 | 8кг 173 | 9кг 174 | 0т 175 | 1т 176 | 2т 177 | 3т 178 | 4т 179 | 5т 180 | 6т 181 | 7т 182 | 8т 183 | 9т 184 | 0г 185 | 1г 186 | 2г 187 | 3г 188 | 4г 189 | 5г 190 | 6г 191 | 7г 192 | 8г 193 | 9г 194 | 0мг 195 | 1мг 196 | 2мг 197 | 3мг 198 | 4мг 199 | 5мг 200 | 6мг 201 | 7мг 202 | 8мг 203 | 9мг 204 | бульв 205 | в 206 | вв 207 | г 208 | га 209 | гг 210 | гл 211 | гос 212 | д 213 | дм 214 | доп 215 | др 216 | е 217 | ед 218 | ед 219 | зам 220 | и 221 | инд 222 | исп 223 | Исп 224 | к 225 | кап 226 | кг 227 | кв 228 | кл 229 | км 230 | кол 231 | комн 232 | коп 233 | куб 234 | л 235 | лиц 236 | лл 237 | м 238 | макс 239 | мг 240 | мин 241 | мл 242 | млн 243 | млрд 244 | мм 245 | н 246 | наб 247 | нач 248 | неуд 249 | ном 250 | о 251 | обл 252 | обр 253 | общ 254 | ок 255 | ост 256 | отл 257 | п 258 | пер 259 | перераб 260 | пл 261 | пос 262 | пр 263 | просп 264 | проф 265 | р 266 | ред 267 | руб 268 | с 269 | сб 270 | св 271 | см 272 | соч 273 | ср 274 | ст 275 | стр 276 | т 277 | тел 278 | Тел 279 | тех 280 | тт 281 | туп 282 | тыс 283 | уд 284 | ул 285 | уч 286 | физ 287 | х 288 | хор 289 | ч 290 | чел 291 | шт 292 | экз 293 | э 294 | -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.sl: -------------------------------------------------------------------------------- 
1 | dr 2 | Dr 3 | itd 4 | itn 5 | št #NUMERIC_ONLY# 6 | Št #NUMERIC_ONLY# 7 | d 8 | jan 9 | Jan 10 | feb 11 | Feb 12 | mar 13 | Mar 14 | apr 15 | Apr 16 | jun 17 | Jun 18 | jul 19 | Jul 20 | avg 21 | Avg 22 | sept 23 | Sept 24 | sep 25 | Sep 26 | okt 27 | Okt 28 | nov 29 | Nov 30 | dec 31 | Dec 32 | tj 33 | Tj 34 | npr 35 | Npr 36 | sl 37 | Sl 38 | op 39 | Op 40 | gl 41 | Gl 42 | oz 43 | Oz 44 | prev 45 | dipl 46 | ing 47 | prim 48 | Prim 49 | cf 50 | Cf 51 | gl 52 | Gl 53 | A 54 | B 55 | C 56 | D 57 | E 58 | F 59 | G 60 | H 61 | I 62 | J 63 | K 64 | L 65 | M 66 | N 67 | O 68 | P 69 | Q 70 | R 71 | S 72 | T 73 | U 74 | V 75 | W 76 | X 77 | Y 78 | Z 79 | -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.sv: -------------------------------------------------------------------------------- 1 | #single upper case letter are usually initials 2 | A 3 | B 4 | C 5 | D 6 | E 7 | F 8 | G 9 | H 10 | I 11 | J 12 | K 13 | L 14 | M 15 | N 16 | O 17 | P 18 | Q 19 | R 20 | S 21 | T 22 | U 23 | V 24 | W 25 | X 26 | Y 27 | Z 28 | Å 29 | Ä 30 | Ö 31 | #misc abbreviations 32 | #If all words in text are in small case, then tex, mao, tom, maj, may be confused with names, and iaf, etc with named entities. 33 | AB 34 | VG 35 | dvs 36 | d.v.s 37 | d. v. s 38 | etc 39 | from 40 | fr.o.m 41 | fr. o. m 42 | iaf 43 | i.a.f 44 | i. a. f 45 | jfr 46 | kl 47 | kr 48 | mao 49 | m.a.o 50 | m. a. o 51 | mfl 52 | m.fl 53 | m. fl 54 | mm 55 | m.m 56 | m. m. 57 | osv 58 | o.s.v 59 | o. s. v 60 | pga 61 | p.g.a 62 | p. g. a 63 | tex 64 | t.ex 65 | t. ex 66 | #tom. is risky, as tom is a word, and can be at end of sentence. One recent text has 9 tom., and 52 tom not at end of sentence. 67 | tom 68 | t.o.m 69 | t. o. m 70 | vs 71 | adv 72 | jur 73 | kand 74 | mag 75 | fil 76 | lic 77 | prop 78 | d 79 | f 80 | s 81 | mha 82 | m.h.a 83 | m. h. 
a 84 | vol 85 | #months 86 | jan 87 | feb 88 | mar 89 | apr 90 | #maj is a full word 91 | jun 92 | jul 93 | aug 94 | sep 95 | okt 96 | nov 97 | dec 98 | -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.ta: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | 3 | #common exceptions 4 | # Rs 5 | ர 6 | # Rs 7 | ூ 8 | # Mr 9 | திரு 10 | 11 | #others 12 | 13 | 14 | #phonetics 15 | # A 16 | ஏ 17 | # B 18 | பீ 19 | # C 20 | சீ 21 | # D 22 | டீ 23 | # E 24 | ஈ 25 | # F 26 | எஃப் 27 | # G 28 | ஜீ 29 | # H 30 | எச் 31 | ஹெச் 32 | # I 33 | ஐ 34 | # J 35 | ஜே 36 | ஜை 37 | # K 38 | கே 39 | # L 40 | எல் 41 | # M 42 | எம் 43 | # N 44 | என் 45 | # O 46 | ஓ 47 | # P 48 | ப்பீ 49 | # Q 50 | கியூ 51 | # R 52 | ஆர் 53 | # S 54 | எஸ் 55 | # T 56 | ட்டீ 57 | # U 58 | யூ 59 | # V 60 | வீ 61 | # W 62 | டபிள்-யூ 63 | # X 64 | எக்ஸ் 65 | # Y 66 | வை 67 | # Z 68 | செட் 69 | 70 | #consonants 71 | 72 | -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.te: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 
2 | 3 | #common exceptions 4 | # Rs 5 | ర 6 | # Rs 7 | ూ 8 | # Mr 9 | శ్రీ 10 | 11 | #others 12 | 13 | 14 | #phonetics 15 | # A 16 | ఎ 17 | # B 18 | బి 19 | # C 20 | సి 21 | # D 22 | డి 23 | # E 24 | ఇ 25 | # F 26 | ఎఫ్ 27 | # G 28 | జి 29 | # H 30 | హెచ్‌ 31 | # I 32 | ఐ 33 | # J 34 | జె 35 | # K 36 | కె 37 | # L 38 | ఎల్ 39 | # M 40 | ఎం 41 | ఎమ్ 42 | # N 43 | ఎన్ 44 | # O 45 | ఓ 46 | # P 47 | పి 48 | # Q 49 | క్యూ 50 | # R 51 | ఆర్ 52 | # S 53 | ఎస్ 54 | # T 55 | టి 56 | # U 57 | యు 58 | # V 59 | వి 60 | # W 61 | డబ్ల్యూ 62 | # X 63 | ఎక్స్ 64 | # Y 65 | వై 66 | # Z 67 | జెడ్ 68 | 69 | #consonants 70 | 71 | -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.yue: -------------------------------------------------------------------------------- 1 | # 2 | # Cantonese (Chinese) 3 | # 4 | # Anything in this file, followed by a period, 5 | # does NOT indicate an end-of-sentence marker. 6 | # 7 | # English/Euro-language given-name initials (appearing in 8 | # news, periodicals, etc.) 9 | A 10 | Ā 11 | B 12 | C 13 | Č 14 | D 15 | E 16 | Ē 17 | F 18 | G 19 | Ģ 20 | H 21 | I 22 | Ī 23 | J 24 | K 25 | Ķ 26 | L 27 | Ļ 28 | M 29 | N 30 | Ņ 31 | O 32 | P 33 | Q 34 | R 35 | S 36 | Š 37 | T 38 | U 39 | Ū 40 | V 41 | W 42 | X 43 | Y 44 | Z 45 | Ž 46 | 47 | # Numbers only. These should only induce breaks when followed by 48 | # a numeric sequence. 49 | # Add NUMERIC_ONLY after the word for this function. This case is 50 | # mostly for the english "No." which can either be a sentence of its 51 | # own, or if followed by a number, a non-breaking prefix. 
52 | No #NUMERIC_ONLY# 53 | Nr #NUMERIC_ONLY# 54 | -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.zh: -------------------------------------------------------------------------------- 1 | # 2 | # Mandarin (Chinese) 3 | # 4 | # Anything in this file, followed by a period, 5 | # does NOT indicate an end-of-sentence marker. 6 | # 7 | # English/Euro-language given-name initials (appearing in 8 | # news, periodicals, etc.) 9 | A 10 | Ā 11 | B 12 | C 13 | Č 14 | D 15 | E 16 | Ē 17 | F 18 | G 19 | Ģ 20 | H 21 | I 22 | Ī 23 | J 24 | K 25 | Ķ 26 | L 27 | Ļ 28 | M 29 | N 30 | Ņ 31 | O 32 | P 33 | Q 34 | R 35 | S 36 | Š 37 | T 38 | U 39 | Ū 40 | V 41 | W 42 | X 43 | Y 44 | Z 45 | Ž 46 | 47 | # Numbers only. These should only induce breaks when followed by 48 | # a numeric sequence. 49 | # Add NUMERIC_ONLY after the word for this function. This case is 50 | # mostly for the english "No." which can either be a sentence of its 51 | # own, or if followed by a number, a non-breaking prefix. 52 | No #NUMERIC_ONLY# 53 | Nr #NUMERIC_ONLY# 54 | -------------------------------------------------------------------------------- /moses/tokenizer/deescape-special-chars.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 
use warnings;
use strict;

# (continuation of moses/tokenizer/deescape-special-chars.perl)
# Undo Moses escaping: read lines from STDIN and turn the entities produced by
# escape-special-chars.perl (plus the legacy &bar; / &bra; / &ket; forms) back
# into the literal characters, then print each line unchanged otherwise.
while(<STDIN>) {
  s/\&bar;/\|/g;   # factor separator (legacy)
  s/\&#124;/\|/g;  # factor separator
  s/\&lt;/\</g;    # xml
  s/\&gt;/\>/g;    # xml
  s/\&bra;/\[/g;   # syntax non-terminal (legacy)
  s/\&ket;/\]/g;   # syntax non-terminal (legacy)
  s/\&quot;/\"/g;  # xml
  s/\&apos;/\'/g;  # xml
  s/\&#91;/\[/g;   # syntax non-terminal
  s/\&#93;/\]/g;   # syntax non-terminal
  s/\&amp;/\&/g;   # escape escape (last, so "&amp;lt;" decodes to "&lt;" and not "<")
  print $_;
}

# --------------------------------------------------------------------------------
# /moses/tokenizer/escape-special-chars.perl:
# --------------------------------------------------------------------------------
#!/usr/bin/env perl
#
# This file is part of moses.  Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

use warnings;
use strict;

# Escape characters that have a special meaning to Moses.  Reads lines from
# STDIN, normalizes whitespace, replaces the special characters with entities
# and prints one output line per input line.
while(<STDIN>) {
  # Remove only the line terminator.  chop would delete the last real
  # character of a final line that has no trailing newline.
  chomp;

  # avoid general madness
  s/[\000-\037]//g;  # drop control characters
  s/\s+/ /g;         # collapse whitespace runs to a single space
  s/^ //g;           # trim leading space
  s/ $//g;           # trim trailing space

  # special characters in moses
  s/\&/\&amp;/g;   # escape escape (first, so entities added below stay intact)
  s/\|/\&#124;/g;  # factor separator
  s/\</\&lt;/g;    # xml
  s/\>/\&gt;/g;    # xml
  s/\'/\&apos;/g;  # xml
  s/\"/\&quot;/g;  # xml
  s/\[/\&#91;/g;   # syntax non-terminal
  s/\]/\&#93;/g;   # syntax non-terminal

  # restore xml instructions
  s/\&lt;(\S+) translation=&quot;(.+?)&quot;&gt; (.+?) &lt;\/(\S+)&gt;/\<$1 translation=\"$2\"> $3 <\/$4>/g;
  print $_."\n";
}

# --------------------------------------------------------------------------------
# /moses/tokenizer/lowercase.perl:
# --------------------------------------------------------------------------------
#!/usr/bin/env perl
#
# This file is part of moses.  Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
5 | 6 | use warnings; 7 | use strict; 8 | 9 | binmode(STDIN, ":utf8"); 10 | binmode(STDOUT, ":utf8"); 11 | 12 | while() { 13 | print lc($_); 14 | } 15 | -------------------------------------------------------------------------------- /moses/tokenizer/normalize-punctuation.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 5 | 6 | use warnings; 7 | use strict; 8 | 9 | my $language = "en"; 10 | my $PENN = 0; 11 | 12 | while (@ARGV) { 13 | $_ = shift; 14 | /^-b$/ && ($| = 1, next); # not buffered (flush each line) 15 | /^-l$/ && ($language = shift, next); 16 | /^[^\-]/ && ($language = $_, next); 17 | /^-penn$/ && ($PENN = 1, next); 18 | } 19 | 20 | while() { 21 | s/\r//g; 22 | # remove extra spaces 23 | s/\(/ \(/g; 24 | s/\)/\) /g; s/ +/ /g; 25 | s/\) ([\.\!\:\?\;\,])/\)$1/g; 26 | s/\( /\(/g; 27 | s/ \)/\)/g; 28 | s/(\d) \%/$1\%/g; 29 | s/ :/:/g; 30 | s/ ;/;/g; 31 | # normalize unicode punctuation 32 | if ($PENN == 0) { 33 | s/\`/\'/g; 34 | s/\'\'/ \" /g; 35 | } 36 | 37 | s/„/\"/g; 38 | s/“/\"/g; 39 | s/”/\"/g; 40 | s/–/-/g; 41 | s/—/ - /g; s/ +/ /g; 42 | s/´/\'/g; 43 | s/([a-z])‘([a-z])/$1\'$2/gi; 44 | s/([a-z])’([a-z])/$1\'$2/gi; 45 | s/‘/\"/g; 46 | s/‚/\"/g; 47 | s/’/\"/g; 48 | s/''/\"/g; 49 | s/´´/\"/g; 50 | s/…/.../g; 51 | # French quotes 52 | s/ « / \"/g; 53 | s/« /\"/g; 54 | s/«/\"/g; 55 | s/ » /\" /g; 56 | s/ »/\"/g; 57 | s/»/\"/g; 58 | # handle pseudo-spaces 59 | s/ \%/\%/g; 60 | s/nº /nº /g; 61 | s/ :/:/g; 62 | s/ ºC/ ºC/g; 63 | s/ cm/ cm/g; 64 | s/ \?/\?/g; 65 | s/ \!/\!/g; 66 | s/ ;/;/g; 67 | s/, /, /g; s/ +/ /g; 68 | 69 | # English "quotation," followed by comma, style 70 | if ($language eq "en") { 71 | s/\"([,\.]+)/$1\"/g; 72 | } 73 | # Czech is confused 74 | elsif ($language eq "cs" || $language eq "cz") { 75 | } 76 | # 
German/Spanish/French "quotation", followed by comma, style 77 | else { 78 | s/,\"/\",/g; 79 | s/(\.+)\"(\s*[^<])/\"$1$2/g; # don't fix period at end of sentence 80 | } 81 | 82 | 83 | if ($language eq "de" || $language eq "es" || $language eq "cz" || $language eq "cs" || $language eq "fr") { 84 | s/(\d) (\d)/$1,$2/g; 85 | } 86 | else { 87 | s/(\d) (\d)/$1.$2/g; 88 | } 89 | print $_; 90 | } 91 | -------------------------------------------------------------------------------- /preprocess/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (NOT MSVC) 2 | set(THREADS pthread) 3 | endif() 4 | 5 | add_library(fields STATIC fields.cc) 6 | add_library(captive_child STATIC captive_child.cc) 7 | add_library(warc STATIC warc.cc) 8 | add_library(base64 STATIC base64.cc) 9 | 10 | # Explicitly list the executable files to be compiled 11 | set(EXE_LIST 12 | b64filter 13 | base64_number 14 | cache 15 | commoncrawl_dedupe 16 | dedupe 17 | docenc 18 | foldfilter 19 | gigaword_unwrap 20 | idf 21 | mmhsum 22 | order_independent_hash 23 | remove_invalid_utf8 24 | remove_invalid_utf8_base64 25 | remove_long_lines 26 | shard 27 | substitute 28 | subtract_lines 29 | vocab 30 | warc_parallel 31 | ) 32 | 33 | set(ICU_EXE_LIST 34 | apply_case 35 | truecase 36 | train_case 37 | process_unicode 38 | simple_cleaning 39 | ) 40 | if(USE_ICU) 41 | set(EXE_LIST ${EXE_LIST} ${ICU_EXE_LIST}) 42 | endif(USE_ICU) 43 | 44 | set(PREPROCESS_LIBS preprocess_util ${Boost_LIBRARIES} ${THREADS}) 45 | 46 | foreach(exe ${EXE_LIST}) 47 | add_executable(${exe} ${exe}_main.cc) 48 | target_link_libraries(${exe} ${PREPROCESS_LIBS}) 49 | set_target_properties(${exe} PROPERTIES FOLDER executables) 50 | endforeach(exe) 51 | 52 | target_link_libraries(b64filter ${PREPROCESS_LIBS} base64 captive_child) 53 | target_link_libraries(base64_number ${PREPROCESS_LIBS} base64 captive_child) 54 | target_link_libraries(cache ${PREPROCESS_LIBS} fields captive_child) 55 | 
target_link_libraries(dedupe ${PREPROCESS_LIBS} fields) 56 | target_link_libraries(docenc ${PREPROCESS_LIBS} base64) 57 | target_link_libraries(foldfilter ${PREPROCESS_LIBS} captive_child) 58 | target_link_libraries(remove_invalid_utf8_base64 ${PREPROCESS_LIBS} base64) 59 | target_link_libraries(shard ${PREPROCESS_LIBS} fields) 60 | target_link_libraries(simple_cleaning ${PREPROCESS_LIBS} fields) 61 | target_link_libraries(substitute ${PREPROCESS_LIBS} fields) 62 | target_link_libraries(warc_parallel ${PREPROCESS_LIBS} warc captive_child) 63 | 64 | if(USE_ICU) 65 | foreach(exe ${ICU_EXE_LIST}) 66 | target_link_libraries(${exe} preprocess_icu) 67 | endforeach(exe) 68 | endif(USE_ICU) 69 | 70 | foreach(script text.sh gigaword_extract.sh resplit.sh unescape_html.perl heuristics.perl) 71 | configure_file(${script} ../bin/${script} COPYONLY) 72 | endforeach() 73 | -------------------------------------------------------------------------------- /preprocess/apply_case_main.cc: -------------------------------------------------------------------------------- 1 | #include "util/file_stream.hh" 2 | #include "util/file_piece.hh" 3 | #include "util/murmur_hash.hh" 4 | #include "util/mutable_vocab.hh" 5 | #include "util/tokenize_piece.hh" 6 | #include "util/utf8.hh" 7 | #include "util/utf8_icu.hh" 8 | 9 | #include 10 | 11 | #define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE 12 | #include 13 | 14 | namespace { 15 | void SplitLine(util::FilePiece &from, std::vector &to) { 16 | to.clear(); 17 | for (util::TokenIter i(from.ReadLine(), ' '); i; ++i) { 18 | to.push_back(*i); 19 | } 20 | } 21 | 22 | bool SameLine(util::FilePiece &f) { 23 | while (true) { 24 | switch(f.peek()) { 25 | case '\n': 26 | f.get(); 27 | return false; 28 | case ' ': 29 | case '\t': 30 | f.get(); 31 | continue; 32 | default: 33 | return true; 34 | } 35 | } 36 | } 37 | } // namespace 38 | 39 | int main(int argc, char *argv[]) { 40 | if (argc != 5) { 41 | std::cerr << argv[0] << " alignment source target model" << 
std::endl; 42 | return 1; 43 | } 44 | util::FilePiece align(argv[1]), source_file(argv[2]), target_file(argv[3]), model(argv[4]); 45 | 46 | util::MutableVocab vocab; 47 | std::unordered_map best; 48 | while (true) { 49 | uint64_t key; 50 | try { 51 | key = model.ReadULong(); 52 | } catch (const util::EndOfFileException &e) { break; } 53 | uint64_t max_count = 0; 54 | util::StringPiece best_word; 55 | for (util::TokenIter pair(model.ReadLine(), '\t'); pair; ++pair) { 56 | util::TokenIter spaces(*pair, ' '); 57 | util::StringPiece word(*spaces); 58 | uint64_t count = boost::lexical_cast(*++spaces); 59 | if (count > max_count) { 60 | max_count = count; 61 | best_word = word; 62 | } 63 | best[key] = vocab.FindOrInsert(best_word); 64 | } 65 | } 66 | 67 | std::cerr << "Read model." << std::endl; 68 | 69 | std::vector source_words, target_words; 70 | std::string lowered; 71 | util::FileStream out(1); 72 | for (std::size_t line = 0; ; ++line) { 73 | try { 74 | SplitLine(source_file, source_words); 75 | } catch (const util::EndOfFileException &e) { break; } 76 | SplitLine(target_file, target_words); 77 | align.ReadULong(); 78 | UTIL_THROW_IF2("|||" != align.ReadDelimited(), "Expected |||"); 79 | while (SameLine(align)) { 80 | unsigned long first = align.ReadULong(); 81 | UTIL_THROW_IF2(align.get() != '-', "Bad alignment"); 82 | UTIL_THROW_IF2(align.peek() < '0' || align.peek() > '9', "Expected number for alignment, not " << align.peek()); 83 | unsigned long second = align.ReadULong(); 84 | UTIL_THROW_IF2(first >= source_words.size(), "Index " << first << " too high for source text at line " << line << " which has size " << source_words.size()); 85 | UTIL_THROW_IF2(second >= target_words.size(), "Index " << second << " too high for target text at line " << line << " which has size " << target_words.size()); 86 | util::ToLower(target_words[second], lowered); 87 | util::StringPiece source(source_words[first]); 88 | uint64_t key = util::MurmurHash64A(lowered.data(), 
lowered.size(), util::MurmurHash64A(source.data(), source.size())); 89 | std::unordered_map::const_iterator found = best.find(key); 90 | if (found != best.end()) { 91 | target_words[second] = vocab.String(found->second); 92 | } 93 | } 94 | std::vector::const_iterator i = target_words.begin(); 95 | if (i != target_words.end()) out << *i; 96 | for (++i; i != target_words.end(); ++i) { 97 | out << ' ' << *i; 98 | } 99 | out << '\n'; 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /preprocess/b64filter_main.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "preprocess/base64.hh" 4 | #include "preprocess/captive_child.hh" 5 | #include "util/exception.hh" 6 | #include "util/file_stream.hh" 7 | #include "util/file_piece.hh" 8 | #include "util/pcqueue.hh" 9 | 10 | 11 | namespace { 12 | 13 | struct Document { 14 | size_t line_cnt; 15 | bool has_trailing_newline; 16 | }; 17 | 18 | } // namespace 19 | 20 | int main(int argc, char **argv) { 21 | if (argc < 2) { 22 | std::cerr << "usage: " << argv[0] << " command [command-args...]\n"; 23 | return 1; 24 | } 25 | 26 | util::UnboundedSingleQueue line_cnt_queue; 27 | 28 | util::scoped_fd child_in_fd, child_out_fd; 29 | 30 | pid_t child = preprocess::Launch(argv + 1, child_in_fd, child_out_fd); 31 | 32 | std::thread feeder([&child_in_fd, &line_cnt_queue]() { 33 | util::FilePiece in(STDIN_FILENO); 34 | util::FileStream child_in(child_in_fd.release()); 35 | 36 | // Decoded document buffer 37 | std::string doc; 38 | 39 | for (util::StringPiece line : in) { 40 | preprocess::base64_decode(line, doc); 41 | 42 | // Description of the document 43 | Document doc_desc{ 44 | .line_cnt = 0, 45 | .has_trailing_newline = doc.back() == '\n', 46 | }; 47 | 48 | // Make the the document end with a new line. 
This to make sure 49 | // the next doc we send to the child will be on its own line and the 50 | // line_cnt is correct. 51 | if (!doc_desc.has_trailing_newline) 52 | doc.push_back('\n'); 53 | 54 | doc_desc.line_cnt = count(doc.cbegin(), doc.cend(), '\n'); 55 | 56 | // Send line count first to the reader, so it can start reading as 57 | // soon as we start feeding the document to the child. 58 | line_cnt_queue.Produce(std::move(doc_desc)); 59 | 60 | // Feed the document to the child. 61 | // Might block because it can cause a flush. 62 | child_in << doc; 63 | } 64 | 65 | // Tell the reader to stop 66 | line_cnt_queue.Produce(Document{ 67 | .line_cnt = 0, 68 | .has_trailing_newline = false 69 | }); 70 | 71 | // Flush (blocks). The FileStream destructor closes. 72 | child_in.flush(); 73 | }); 74 | 75 | std::thread reader([&child_out_fd, &line_cnt_queue]() { 76 | util::FileStream out(STDOUT_FILENO); 77 | util::FilePiece child_out(child_out_fd.release()); 78 | 79 | size_t doc_cnt = 0; 80 | Document document; 81 | std::string doc; 82 | 83 | while (line_cnt_queue.Consume(document).line_cnt > 0) { 84 | ++doc_cnt; 85 | 86 | doc.clear(); 87 | doc.reserve(document.line_cnt * 4096); // 4096 is not a typical line length 88 | 89 | try { 90 | while (document.line_cnt-- > 0) { 91 | util::StringPiece line(child_out.ReadLine()); 92 | doc.append(line.data(), line.length()); 93 | 94 | // ReadLine eats line endings. Between lines we definitely 95 | // need to add them back. Whether we add the last one depends 96 | // on whether the original document had a trailing newline. 
97 | if (document.line_cnt > 0 || document.has_trailing_newline) 98 | doc.push_back('\n'); 99 | } 100 | } catch (util::EndOfFileException &e) { 101 | UTIL_THROW(util::Exception, "Sub-process stopped producing while expecting more lines while processing document " << doc_cnt); 102 | } 103 | 104 | std::string encoded_doc; 105 | preprocess::base64_encode(doc, encoded_doc); 106 | out << encoded_doc << '\n'; 107 | } 108 | 109 | // Assert that we have consumed all the output of the child program. 110 | try { 111 | // peek() should now fail on an end of file, the loop above should 112 | // already have consumed all output that's there. 113 | child_out.peek(); 114 | 115 | UTIL_THROW(util::Exception, "sub-process is producing more output than it was given input"); 116 | } catch (util::EndOfFileException &e) { 117 | // Good! 118 | } 119 | }); 120 | 121 | int retval = preprocess::Wait(child); 122 | 123 | feeder.join(); 124 | reader.join(); 125 | 126 | return retval; 127 | } 128 | -------------------------------------------------------------------------------- /preprocess/base64.cc: -------------------------------------------------------------------------------- 1 | #include "base64.hh" 2 | #include 3 | #include 4 | #include "util/exception.hh" 5 | 6 | namespace preprocess { 7 | 8 | namespace { 9 | 10 | char const *TABLE = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; 11 | 12 | int const INV_TABLE[256] = { 13 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 14 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63, 16 | 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1, 17 | -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 18 | 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, 19 | -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 20 | 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1 21 | -1, -1, -1, 
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 22 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 23 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 24 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 25 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 26 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 27 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 28 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 29 | }; 30 | 31 | size_t count_padding(const util::StringPiece &in) { 32 | const char *data = in.data(); 33 | 34 | for (int32_t i = 1; i <= in.size(); ++i) 35 | if (data[in.size() - i] != '=') 36 | return i - 1; 37 | 38 | return in.size(); 39 | } 40 | 41 | } // namespace 42 | 43 | void base64_encode(const util::StringPiece &in, std::string &out) { 44 | out.clear(); 45 | out.reserve(4 * ((in.size() + 2) / 3)); 46 | 47 | int val = 0, valb = -6; 48 | 49 | for (const unsigned char *c = reinterpret_cast(in.data()); c != reinterpret_cast(in.data()) + in.size(); ++c) { 50 | val = (val << 8) + *c; 51 | valb += 8; 52 | while (valb >= 0) { 53 | out.push_back(TABLE[(val >> valb) & 0x3F]); 54 | valb -= 6; 55 | } 56 | } 57 | 58 | if (valb >- 6) 59 | out.push_back(TABLE[((val << 8) >> (valb + 8)) & 0x3F]); 60 | 61 | while (out.size() % 4) 62 | out.push_back('='); 63 | } 64 | 65 | void base64_decode(const util::StringPiece &in, std::string &out) { 66 | out.clear(); 67 | 68 | // Reserve worst case scenario memory 69 | out.reserve(in.size() * 3 / 4 - count_padding(in)); 70 | 71 | int val = 0, valb = -8; 72 | for (const unsigned char *c = reinterpret_cast(in.data()); c != reinterpret_cast(in.data()) + in.size(); ++c) { 73 | // Padding reached 74 | if (*c == '=') 75 | break; 76 | 77 | UTIL_THROW_IF(INV_TABLE[*c] == -1, util::Exception, "Cannot interpret character '" << *c << "' as part of base64"); 78 | 79 | val = (val << 6) + INV_TABLE[*c]; 80 | valb += 6; 
81 | if (valb >= 0) { 82 | out.push_back(char((val >> valb) & 0xFF)); 83 | valb -= 8; 84 | } 85 | } 86 | } 87 | 88 | } // namespace preprocess 89 | -------------------------------------------------------------------------------- /preprocess/base64.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include "util/string_piece.hh" 4 | 5 | namespace preprocess { 6 | 7 | void base64_encode(const util::StringPiece &in, std::string &out); 8 | 9 | void base64_decode(const util::StringPiece &in, std::string &out); 10 | 11 | } // namespace preprocess 12 | -------------------------------------------------------------------------------- /preprocess/base64_number_main.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include "util/file_piece.hh" 3 | #include "util/file_stream.hh" 4 | #include "util/tokenize_piece.hh" 5 | #include "preprocess/base64.hh" 6 | 7 | #include 8 | #include 9 | 10 | int main(int argc, char *argv[]) { 11 | std::string out; 12 | util::FileStream writing(1); 13 | uint64_t line_number = 0; 14 | for (util::StringPiece l : util::FilePiece(0)) { 15 | preprocess::base64_decode(l, out); 16 | std::replace(out.begin(), out.end(), '\t', ' '); 17 | for (util::TokenIter line(out, '\n'); line; ++line) { 18 | writing << *line << '\t' << line_number << '\n'; 19 | } 20 | ++line_number; 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /preprocess/captive_child.cc: -------------------------------------------------------------------------------- 1 | #include "preprocess/captive_child.hh" 2 | 3 | #include "util/exception.hh" 4 | #include "util/file.hh" 5 | 6 | #include 7 | #ifdef __linux__ 8 | #include 9 | #endif 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include 16 | 17 | namespace preprocess { 18 | 19 | namespace { 20 | void Pipe(util::scoped_fd &first, util::scoped_fd &second) { 
21 | int fds[2]; 22 | UTIL_THROW_IF(pipe(fds), util::ErrnoException, "Creating pipe failed"); 23 | first.reset(fds[0]); 24 | second.reset(fds[1]); 25 | } 26 | } // namespace 27 | 28 | pid_t Launch(char *argv[], util::scoped_fd &in, util::scoped_fd &out) { 29 | util::scoped_fd process_in, process_out; 30 | Pipe(process_in, in); 31 | Pipe(out, process_out); 32 | 33 | // Using self-pipe trick to check whether execvp did not fail: Set up a pipe 34 | // with FD_CLOEXEC (close on successful exec). In case of failure, we'll 35 | // write something to the pipe and close it manually. Then, in the parent we 36 | // can wait till the pipe is closed: either execvp succeeded and we read 37 | // nothing or we read our error code and throw an exception in the parent. 38 | // (See https://stackoverflow.com/a/1586277) 39 | util::scoped_fd status_in, status_out; 40 | Pipe(status_in, status_out); 41 | UTIL_THROW_IF(fcntl(status_out.get(), F_SETFD, fcntl(status_out.get(), F_GETFD) | FD_CLOEXEC), util::ErrnoException, "fcntl failed"); 42 | 43 | pid_t pid = fork(); 44 | UTIL_THROW_IF(pid == -1, util::ErrnoException, "Fork failed"); 45 | if (pid == 0) { 46 | // Inside child process. 
47 | #ifdef __linux__ 48 | prctl(PR_SET_PDEATHSIG, SIGTERM); 49 | #endif 50 | UTIL_THROW_IF(-1 == dup2(process_in.get(), STDIN_FILENO), util::ErrnoException, "dup2 failed for process stdin from " << process_in.get()); 51 | UTIL_THROW_IF(-1 == dup2(process_out.get(), STDOUT_FILENO), util::ErrnoException, "dup2 failed for process stdout from " << process_out.get()); 52 | in.reset(); 53 | out.reset(); 54 | status_in.reset(); 55 | execvp(argv[0], argv); 56 | // Oh no, execvp failed, write error to parent 57 | write(status_out.get(), &errno, sizeof(int)); 58 | std::abort(); 59 | } 60 | status_out.reset(); 61 | 62 | // Wait on child to signal successful execvp or error 63 | int count, err; 64 | while ((count = read(*status_in, &err, sizeof(errno))) == -1) 65 | if (errno != EAGAIN && errno != EINTR) 66 | break; 67 | 68 | UTIL_THROW_IF(count != 0, util::Exception, "child's execvp failed: " << strerror(err)); 69 | 70 | // Parent closes parts it doesn't need in destructors. 71 | return pid; 72 | } 73 | 74 | int Wait(pid_t child) { 75 | int status; 76 | UTIL_THROW_IF(-1 == waitpid(child, &status, 0), util::ErrnoException, "waitpid for child failed"); 77 | if (WIFEXITED(status)) { 78 | return WEXITSTATUS(status); 79 | } else { 80 | return 256; 81 | } 82 | } 83 | 84 | } // namespace preprocess 85 | 86 | -------------------------------------------------------------------------------- /preprocess/captive_child.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace util { class scoped_fd; } 6 | 7 | namespace preprocess { 8 | 9 | // Launch a child process. The child's stdin and stdout pipes will be returned as in and out. 10 | pid_t Launch(char *argv[], util::scoped_fd &in, util::scoped_fd &out); 11 | 12 | // Wait for a child to finish and return an appropriate status for it. 
13 | int Wait(pid_t child); 14 | 15 | } // namespace preprocess 16 | -------------------------------------------------------------------------------- /preprocess/commoncrawl_dedupe_main.cc: -------------------------------------------------------------------------------- 1 | // Tool to convert raw CommonCrawl files into deduplicated files. 2 | // Strips leading and trailing spaces. 3 | // Removes document delimiter lines (those that begin with df6fa1abb58549287111ba8d776733e9). 4 | // Removes duplicate lines. 5 | // Removes any line that contains invalid UTF-8. 6 | // 7 | #include "util/file_stream.hh" 8 | #include "util/file_piece.hh" 9 | #include "util/murmur_hash.hh" 10 | #include "util/probing_hash_table.hh" 11 | #include "util/scoped.hh" 12 | #include "util/utf8.hh" 13 | 14 | #include 15 | 16 | #include 17 | 18 | namespace { 19 | 20 | // Hash table with 64-bit keys. 21 | struct Entry { 22 | typedef uint64_t Key; 23 | uint64_t key; 24 | uint64_t GetKey() const { return key; } 25 | void SetKey(uint64_t to) { key = to; } 26 | }; 27 | 28 | typedef util::AutoProbing Table; 29 | 30 | // Use 64-bit MurmurHash in the hash table. 31 | bool IsNewLine(Table &table, util::StringPiece l) { 32 | Table::MutableIterator it; 33 | Entry entry; 34 | entry.key = util::MurmurHashNative(l.data(), l.size(), 1); 35 | return !table.FindOrInsert(entry, it); 36 | } 37 | 38 | // Remove leading and trailing space characters. 
39 | util::StringPiece StripSpaces(util::StringPiece ret) { 40 | while (ret.size() && util::kSpaces[static_cast(*ret.data())]) { 41 | ret = util::StringPiece(ret.data() + 1, ret.size() - 1); 42 | } 43 | while (ret.size() && util::kSpaces[static_cast(ret.data()[ret.size() - 1])]) { 44 | ret = util::StringPiece(ret.data(), ret.size() - 1); 45 | } 46 | return ret; 47 | } 48 | 49 | 50 | } // namespace 51 | 52 | int main(int argc, char *argv[]) { 53 | if (argc > 2 || (argc == 2 && (!strcmp("-h", argv[1]) || !strcmp("--help", argv[1])))) { 54 | std::cerr << "Usage: " << argv[0] << " file_to_remove\nLines that appear in file_to_remove will be excluded from the output.\n" << std::endl; 55 | return 1; 56 | } 57 | try { 58 | Table table; 59 | util::StringPiece l; 60 | 61 | // If there's a file to remove lines from, add it to the hash table of lines. 62 | if (argc == 2) { 63 | util::FilePiece removing(argv[1]); 64 | while (removing.ReadLineOrEOF(l)) { 65 | IsNewLine(table, StripSpaces(l)); 66 | } 67 | } 68 | 69 | // This is the beginning of a line that delimits documents in the raw files. 70 | const util::StringPiece remove_line("df6fa1abb58549287111ba8d776733e9"); 71 | util::FileStream out(1); 72 | util::FilePiece in(0, "stdin", &std::cerr); 73 | while (in.ReadLineOrEOF(l)) { 74 | l = StripSpaces(l); 75 | // A line passes if: 76 | // It does not begin with the magic document delimiter. 77 | // Its 64-bit hash has not been seen before. 78 | // and it is valid UTF-8. 
79 | if (!starts_with(l, remove_line) && IsNewLine(table, l) && util::IsUTF8(l)) { 80 | out << l << '\n'; 81 | } 82 | } 83 | } 84 | catch (const std::exception &e) { 85 | std::cerr << e.what() << std::endl; 86 | return 1; 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /preprocess/dedupe_main.cc: -------------------------------------------------------------------------------- 1 | #include "fields.hh" 2 | #include "parallel.hh" 3 | #include "util/murmur_hash.hh" 4 | #include "util/probing_hash_table.hh" 5 | #include "util/scoped.hh" 6 | 7 | #include 8 | #include 9 | 10 | #include 11 | 12 | #include 13 | 14 | namespace preprocess { 15 | namespace { 16 | 17 | struct Options { 18 | std::vector key_fields; 19 | char delim; 20 | std::vector files; 21 | }; 22 | 23 | void ParseArgs(int argc, char *argv[], Options &out) { 24 | namespace po = boost::program_options; 25 | po::options_description desc("Deduplication settings"); 26 | std::string fields; 27 | 28 | desc.add_options() 29 | ("help,h", po::bool_switch(), "Show this help message") 30 | ("fields,f", po::value(&fields)->default_value("1-"), "Fields to use for key like cut -f") 31 | ("delim,d", po::value(&out.delim)->default_value('\t'), "Field delimiter") 32 | ("parallel,p", po::value(&out.files)->multitoken(), "Filter parallel data using four files: in_en in_fr out_en out_fr"); 33 | po::positional_options_description pd; 34 | pd.add("parallel", -1); 35 | 36 | po::variables_map vm; 37 | po::store(po::command_line_parser(argc, argv).options(desc).positional(pd).run(), vm); 38 | if (vm["help"].as() || (!out.files.empty() && out.files.size() != 4)) { 39 | std::cerr << 40 | "Deduplicate lines in a file.\n" 41 | "Only 64-bit hashes are kept. In the event of a hash collision, a unique line\n" 42 | "will be removed.\n" 43 | "By default the entire line is used as the key for equality. Using -f and -d\n" 44 | "similar to cut, the key can be restricted to some columns. 
The line containing\n" 45 | "the first instance of the key is preserved, while the rest are removed.\n" << 46 | desc << 47 | "Deduplicate lines in a file: " << argv[0] << " out\n" 48 | "Deduplicate parallel data, removing if either side is non-unique " << argv[0] << " -p in_en in_fr out_en out_fr\n"; 49 | exit(1); 50 | } 51 | po::notify(vm); 52 | 53 | ParseFields(fields.c_str(), out.key_fields); 54 | DefragmentFields(out.key_fields); 55 | } 56 | 57 | struct Entry { 58 | typedef uint64_t Key; 59 | uint64_t key; 60 | uint64_t GetKey() const { return key; } 61 | void SetKey(uint64_t to) { key = to; } 62 | }; 63 | 64 | class Dedupe { 65 | public: 66 | bool operator()(const util::StringPiece &line) { 67 | return (*this)(util::MurmurHashNative(line.data(), line.size(), 1)); 68 | } 69 | 70 | bool operator()(uint64_t key) { 71 | Entry entry; 72 | entry.key = key; 73 | Table::MutableIterator it; 74 | return !table_.FindOrInsert(entry, it); 75 | } 76 | 77 | private: 78 | typedef util::AutoProbing Table; 79 | Table table_; 80 | }; 81 | 82 | class FieldDedupe : public Dedupe { 83 | public: 84 | explicit FieldDedupe(const Options &options) 85 | : key_fields_(options.key_fields), delim_(options.delim) {} 86 | 87 | bool operator()(const util::StringPiece &line) { 88 | HashCallback hasher(1); 89 | RangeFields(line, key_fields_, delim_, hasher); 90 | return (*static_cast(this))(hasher.Hash()); 91 | } 92 | 93 | private: 94 | std::vector key_fields_; 95 | char delim_; 96 | }; 97 | 98 | } // namespace 99 | } // namespace preprocess 100 | 101 | int main(int argc, char *argv[]) { 102 | preprocess::Options options; 103 | ParseArgs(argc, argv, options); 104 | 105 | if (options.key_fields.size() == 1 && options.key_fields[0].begin == 0 && options.key_fields[0].end == preprocess::FieldRange::kInfiniteEnd) { 106 | return preprocess::FilterParallel(options.files); 107 | } else { 108 | return preprocess::FilterParallel(options.files, options); 109 | } 110 | } 111 | 
-------------------------------------------------------------------------------- /preprocess/fields.cc: -------------------------------------------------------------------------------- 1 | #include "preprocess/fields.hh" 2 | #include "util/exception.hh" 3 | 4 | #include 5 | 6 | #include 7 | 8 | namespace preprocess { 9 | 10 | namespace { 11 | unsigned int ConsumeInt(const char *&arg) { 12 | char *end; 13 | unsigned int ret = strtoul(arg, &end, 10); 14 | UTIL_THROW_IF(end == arg, util::Exception, "Expected field " << arg << " to begin with a number."); 15 | arg = end; 16 | return ret; 17 | } 18 | } // namespace 19 | 20 | void ParseFields(const char *arg, std::vector &indices) { 21 | FieldRange add; 22 | while (*arg) { 23 | if (*arg == '-') { 24 | add.begin = 0; 25 | } else { 26 | // -1 because cut is 1-indexed. 27 | add.begin = ConsumeInt(arg) - 1; 28 | } 29 | switch (*arg) { 30 | case ',': case 0: 31 | add.end = add.begin + 1; 32 | break; 33 | case '-': 34 | ++arg; 35 | if (*arg == 0 || *arg == ',') { 36 | // 5- 37 | add.end = FieldRange::kInfiniteEnd; 38 | } else { 39 | // 5-6 40 | add.end = ConsumeInt(arg); 41 | UTIL_THROW_IF(add.end <= add.begin, util::Exception, "Empty range [" << add.begin << ", " << add.end << ")"); 42 | } 43 | break; 44 | default: 45 | UTIL_THROW(util::Exception, "Expected , - or string end after number in " << arg); 46 | } 47 | // Swallow , 48 | if (*arg == ',') { 49 | ++arg; 50 | } 51 | indices.push_back(add); 52 | } 53 | } 54 | 55 | void DefragmentFields(std::vector &indices) { 56 | std::sort(indices.begin(), indices.end()); 57 | for (unsigned int i = 1; i < indices.size();) { 58 | UTIL_THROW_IF(indices[i-1].end > indices[i].begin, util::Exception, "Overlapping index ranges"); 59 | if (indices[i-1].end == indices[i].begin) { 60 | indices[i-1].end = indices[i].end; 61 | indices.erase(indices.begin() + i); 62 | } else { 63 | ++i; 64 | } 65 | } 66 | } 67 | 68 | } // namespace preprocess 69 | 
-------------------------------------------------------------------------------- /preprocess/fields.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "util/string_piece.hh" 4 | #include "util/murmur_hash.hh" 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | namespace preprocess { 11 | 12 | // [begin, end) as is the custom of our people. 13 | struct FieldRange { 14 | // Note that end can be the maximum integer. 15 | unsigned int begin, end; 16 | bool operator<(const FieldRange &other) const { 17 | return begin < other.begin; 18 | } 19 | static const unsigned int kInfiniteEnd = std::numeric_limits::max(); 20 | }; 21 | 22 | // Parse the cut-style 1-3,9,12- representation of fields. 23 | void ParseFields(const char *arg, std::vector &indices); 24 | 25 | // Sort and combine field ranges into smaller ones. 26 | void DefragmentFields(std::vector &indices); 27 | 28 | // Do a callback with each individual field that was selected. 29 | template inline bool IndividualFields(util::StringPiece str, const std::vector &indices, char delim, Functor &callback) { 30 | const char *begin = str.data(); 31 | const char *const end = str.data() + str.size(); 32 | unsigned int index = 0; 33 | for (const FieldRange f : indices) { 34 | for (; index < f.begin; ++index) { 35 | begin = std::find(begin, end, delim) + 1; 36 | if (begin >= end) return true; 37 | } 38 | for (; index < f.end; ++index) { 39 | const char *found = std::find(begin, end, delim); 40 | if (!callback(util::StringPiece(begin, found - begin))) { 41 | return false; 42 | } 43 | begin = found + 1; 44 | if (begin >= end) return true; 45 | } 46 | } 47 | return true; 48 | } 49 | 50 | // Do a callback with ranges of fields. 
// indices is walked in order with a single forward scan of str, so it is
// expected to be sorted and non-overlapping.  Each range is delivered to
// callback as one util::StringPiece that still contains the delimiters
// between its fields.  A range ending in FieldRange::kInfiniteEnd, or one
// that runs off the end of str, is delivered up to the end of str and stops
// the scan.
template <class Functor> inline void RangeFields(util::StringPiece str, const std::vector<FieldRange> &indices, char delim, Functor &callback) {
  const char *begin = str.data();
  const char *const end = str.data() + str.size();
  unsigned int index = 0;
  for (const FieldRange &f : indices) {
    // Skip fields in front of this range.
    for (; index < f.begin; ++index) {
      const char *found = std::find(begin, end, delim);
      // Nothing follows: either no delimiter is left or it is the last byte.
      // Testing the distance avoids forming a pointer beyond end.
      if (end - found <= 1) return;
      begin = found + 1;
    }
    if (f.end == FieldRange::kInfiniteEnd) {
      callback(util::StringPiece(begin, end - begin));
      return;
    }
    const char *old_begin = begin;
    for (; index < f.end; ++index) {
      const char *found = std::find(begin, end, delim);
      if (end - found <= 1) {
        // The string ended inside the range: hand over what is there.
        callback(util::StringPiece(old_begin, end - old_begin));
        return;
      }
      begin = found + 1;
    }
    // begin is one past the delimiter that closes the range; drop that delimiter.
    callback(util::StringPiece(old_begin, begin - old_begin - 1));
  }
  return;
}

// This is called with the parts of the input that relate to the key.
// Each call chains the previous hash in as the seed of the next MurmurHash.
class HashCallback {
  public:
    explicit HashCallback(uint64_t seed = 47849374332489ULL) : hash_(seed) /* Be different from deduper */ {}

    void operator()(util::StringPiece key) {
      hash_ = util::MurmurHashNative(key.data(), key.size(), hash_);
    }

    uint64_t Hash() const { return hash_; }

  private:
    uint64_t hash_;
};

} // namespace preprocess
--------------------------------------------------------------------------------
/preprocess/gigaword_extract.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#Extract sentences from gigaword but don't process them
set -e -o pipefail
BINDIR="$(dirname "$0")"
#The single argument is a two-letter language code.
if [ ${#1} != 2 ]; then
  echo "Expected language on the command line." 1>&2
  exit 1
fi
#NOTE(review): the fgrep pattern was lost in extraction; "<P>" is reconstructed, confirm against the repository.
"$BINDIR"/gigaword_unwrap | "$BINDIR"/../moses/ems/support/split-sentences.perl -l "$1" |fgrep -v "<P>"

--------------------------------------------------------------------------------
/preprocess/heuristics.perl:
--------------------------------------------------------------------------------
#!/usr/bin/env perl
#More preprocessing.  This assumes that process_unicode is run with at minimum --flatten 1 --normalize 1 first.
#Reads lines from stdin and writes the cleaned lines to stdout.
#Usage: heuristics.perl [-l language]   (language defaults to en)

use strict;
use utf8;

binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";

my $language = "en";

#Only -l is recognised; anything else on the command line is ignored.
while (@ARGV) {
  $_ = shift;
  /^-l$/ && ($language = shift, next);
}

while(my $eline = <STDIN>)
{
  chomp $eline;
  #Pad so that the space-anchored rules also match at the line edges.
  $eline = " $eline ";

  #Normalize long chains of underscores to just two.
  $eline =~ s/_\s*_[\s_]*/ __ /g;

  #Silja dropped * entirely.  I keep one.  Bullet points are converted to * by a Chris Dyer rule in process_unicode.
  $eline =~ s/\*\s*\*[\s\*]*/ * /g;
  #Silja, originally for prepgigaword-silja.pl
  $eline =~ s/#+//g;
  $eline =~ s/[\!]+/!/g;
  $eline =~ s/!([^ ])/! $1/g;
  $eline =~ s/\.([^\s\d.])/. $1/g;
  $eline =~ s/\+(\D)/+ $1/g;
  $eline =~ s/(\D)\+/$1 +/g;
  $eline =~ s/,(\D)/, $1/g;
  $eline =~ s/(\s)-([^\s\d\-])/$1- $2/g;
  $eline =~ s/^ *-- *//g;
  #The next rule was botching ellipses. . .
  #$eline =~ s/\.\./ . /g;

  #Greg
  #Gigaword apw does this.
  $eline =~ s/ dlrs / \$ /g;
  #Language codes are strings and must be compared with eq/ne.  Numeric ==
  #turns both sides into 0, which made every language test below true (and
  #the "not de" test false) whatever -l said.
  if ($language eq "fr") {
    $eline =~ s/([^ -]+)-t-(je|j'|tu|il|elle|on|nous|vous|ils|elles|me|m'|te|t'|le|l'|la|les|lui|leur|moi|toi|eux|elles|ce|c'|ça|ceci|cela|qui|ci|là) /\1 -t-\2 /gi;
    $eline =~ s/([^ -]+)-(je|j'|tu|il|elle|on|nous|vous|ils|elles|me|m'|te|t'|le|l'|la|les|lui|leur|moi|toi|eux|elles|ce|c'|ça|ceci|cela|qui|ci|là) /\1 -\2 /gi;
    $eline =~ s/\s+(qu|c|d|l|j|s|n|m|lorsqu|puisqu)\s+'\s+/ \1' /gi;
    $eline =~ s/\s+aujourd\s*'\s*hui\s+/ aujourd'hui /gi;
  }

  #Chris Dyer, t2.perl
  if ($language eq "en") {
    $eline =~ s/ élite / elite /gi;
    $eline =~ s/ (s|at) & (t|p) / $1&$2 /ig;
    $eline =~ s/ (full|half|part) - (time) / $1-$2 /ig;
    $eline =~ s/ (vis|viz) - (.|..) - (vis|viz) / vis-à-vis /ig;
    $eline =~ s/ (short|long|medium|one|half|two|on|off|in|post|ex|multi|de|mid|co|inter|intra|anti|re|pre|e|non|pro|self) - / $1- /ig;

    #kheafiel
    $eline =~ s/ (ca|are|do|could|did|does|do|had|has|have|is|must|need|should|was|were|wo|would)n 't / \1n't /gi;
  }
  $eline =~ s/ ([AaEe][Ll]) - / \1-/g;

  if ($language ne "de") {
    #Take out any "words" that are longer than 50 chars
    $eline =~ s/\S{50,}/-/g;
  }

  $eline =~ s/\.\s*\.\s*\.\s*[\.\s]*/ ... /g;
  $eline =~ s/!\s*![!\s]*/ ! /g;
  $eline =~ s/\?\s*\?[\?\s]*/ ? /g;
  $eline =~ s/ ' s / 's /g;
  #cut multiple hyphens down to one and space separate it (single hyphens are not space separated)
  $eline =~ s/([^-])--+([^-])/$1 - $2/g;

  #Delete excess spaces:
  $eline =~ s/\s+/ /g;
  $eline =~ s/^\s+//;
  $eline =~ s/\s+$//;

  print "$eline\n";
}

-------------------------------------------------------------------------------- /preprocess/idf_main.cc: -------------------------------------------------------------------------------- 1 | /* Computes inverse document frequency for each token seen in the input. A document is a line.
*/ 2 | #include "util/file_piece.hh" 3 | #include "util/murmur_hash.hh" 4 | #include "util/pool.hh" 5 | #include "util/probing_hash_table.hh" 6 | #include "util/tokenize_piece.hh" 7 | #include "util/file_stream.hh" 8 | 9 | #include 10 | #include 11 | 12 | struct Entry { 13 | typedef uint64_t Key; 14 | uint64_t hash; 15 | 16 | uint64_t GetKey() const { return hash; } 17 | void SetKey(uint64_t to) { hash = to; } 18 | 19 | // Should be allocated from pool to ensure survival. 20 | util::StringPiece str; 21 | 22 | uint64_t document_count; 23 | }; 24 | 25 | int main() { 26 | uint64_t documents = 0; 27 | util::Pool strings; 28 | util::AutoProbing words; 29 | Entry ent; 30 | ent.document_count = 1; 31 | for (util::StringPiece line : util::FilePiece(0)) { 32 | ++documents; 33 | std::unordered_set seen_in_line; 34 | for (util::TokenIter it(line, util::kSpaces); it; ++it) { 35 | ent.hash = util::MurmurHashNative(it->data(), it->size()); 36 | if (seen_in_line.insert(ent.hash).second) { 37 | // Newly seen in this line. 
38 | util::AutoProbing::MutableIterator words_it; 39 | if (words.FindOrInsert(ent, words_it)) { 40 | ++(words_it->document_count); 41 | } else { 42 | char *data = static_cast(strings.Allocate(it->size())); 43 | memcpy(data, it->data(), it->size()); 44 | words_it->str = util::StringPiece(data, it->size()); 45 | } 46 | } 47 | } 48 | } 49 | double documents_log = std::log(static_cast(documents)); 50 | util::FileStream out(1); 51 | for (util::AutoProbing::ConstIterator i = words.RawBegin(); i != words.RawEnd(); ++i) { 52 | if (i->GetKey()) { 53 | double count = static_cast(i->document_count); 54 | double idf = documents_log - std::log(count); 55 | out << i->str << ' ' << idf << '\n'; 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /preprocess/mmhsum_main.cc: -------------------------------------------------------------------------------- 1 | #include "util/murmur_hash.hh" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | int main(int argc, char *argv[]) { 9 | if (argc > 1) { 10 | std::cerr << "Usage: [stdin] " << argv[0] << std::endl; 11 | return 1; 12 | } 13 | 14 | constexpr size_t bufferSize = 1024*1024; 15 | std::vector buffer(bufferSize); 16 | uint64_t chained_hash = 0; 17 | 18 | while (std::cin) 19 | { 20 | std::cin.read(&buffer[0], bufferSize); 21 | if(std::cin.bad()){ 22 | std::cerr << "Error trying to read from stdin\n"; 23 | return 1; 24 | } 25 | size_t count = std::cin.gcount(); 26 | if (!count) 27 | break; 28 | chained_hash = util::MurmurHashNative(&buffer[0], count, chained_hash); 29 | } 30 | std::cout << std::hex << chained_hash << '\n'; 31 | } 32 | -------------------------------------------------------------------------------- /preprocess/order_independent_hash_main.cc: -------------------------------------------------------------------------------- 1 | #include "util/murmur_hash.hh" 2 | #include "util/file_piece.hh" 3 | 4 | int main() { 5 | uint64_t sum = 0; 6 | for 
(util::StringPiece line : util::FilePiece(0)) { 7 | sum += util::MurmurHash64A(line.data(), line.size()); 8 | } 9 | std::cout << sum << std::endl; 10 | } 11 | -------------------------------------------------------------------------------- /preprocess/parallel.hh: -------------------------------------------------------------------------------- 1 | #ifndef PREPROCESS_PARALLEL__ 2 | #define PREPROCESS_PARALLEL__ 3 | 4 | #include "util/file_stream.hh" 5 | #include "util/file_piece.hh" 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | 13 | namespace preprocess { 14 | 15 | template int FilterParallel(const std::vector &files, PassArguments&&... pass_construct) { 16 | uint64_t input = 0, output = 0; 17 | if (files.empty()) { 18 | Pass pass(std::forward(pass_construct)...); 19 | util::StringPiece line; 20 | util::FilePiece in(0, NULL, &std::cerr); 21 | util::FileStream out(1); 22 | while (true) { 23 | try { 24 | line = in.ReadLine(); 25 | } catch (const util::EndOfFileException &e) { break; } 26 | ++input; 27 | if (pass(line)) { 28 | out << line << '\n'; 29 | ++output; 30 | } 31 | } 32 | } else if (files.size() == 4) { 33 | Pass pass0(std::forward(pass_construct)...), pass1(std::forward(pass_construct)...); 34 | util::StringPiece line0, line1; 35 | util::FilePiece in0(files[0].c_str(), &std::cerr), in1(files[1].c_str()); 36 | util::FileStream out0(util::CreateOrThrow(files[2].c_str())), out1(util::CreateOrThrow(files[3].c_str())); 37 | while (true) { 38 | try { 39 | line0 = in0.ReadLine(); 40 | } catch (const util::EndOfFileException &e) { break; } 41 | line1 = in1.ReadLine(); 42 | ++input; 43 | if (pass0(line0) && pass1(line1)) { 44 | out0 << line0 << '\n'; 45 | out1 << line1 << '\n'; 46 | ++output; 47 | } 48 | } 49 | try { 50 | line1 = in1.ReadLine(); 51 | std::cerr << "Input is not balaced: " << files[1] << " has " << line1 << std::endl; 52 | return 2; 53 | } catch (const util::EndOfFileException &e) {} 54 | } else { 55 | std::cerr << 56 | "To filter 
from stdin to stdout, run without an argument.\n" 57 | "To filter parallel files, run in0 in1 out0 out1\n"; 58 | return 1; 59 | } 60 | std::cerr << "Kept " << output << " / " << input << " = " << (static_cast(output) / static_cast(input)) << std::endl; 61 | return 0; 62 | } 63 | 64 | } // namespace preprocess 65 | #endif 66 | -------------------------------------------------------------------------------- /preprocess/process_unicode_main.cc: -------------------------------------------------------------------------------- 1 | #include "util/utf8.hh" 2 | #include "util/utf8_icu.hh" 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | using U_ICU_NAMESPACE::UnicodeString; 16 | 17 | namespace { 18 | struct Options { 19 | std::string language; 20 | bool lower; 21 | bool flatten; 22 | bool normalize; 23 | }; 24 | void ParseArgs(int argc, char *argv[], Options &out) { 25 | namespace po = boost::program_options; 26 | po::options_description desc("Unicode treatment options"); 27 | desc.add_options() 28 | ("language,l", po::value(&out.language)->default_value("en"), "Language (only applies to flatten)") 29 | ("lower", po::bool_switch(&out.lower)->default_value(false), "Convert to lowercase") 30 | ("flatten", po::bool_switch(&out.flatten)->default_value(false), "Canonicalize some characters for English") 31 | ("normalize", po::bool_switch(&out.normalize)->default_value(false), "Normalize Unicode format"); 32 | po::variables_map vm; 33 | po::store(po::parse_command_line(argc, argv, desc), vm); 34 | po::notify(vm); 35 | } 36 | } // namespace 37 | 38 | int main(int argc, char *argv[]) { 39 | Options opt; 40 | ParseArgs(argc, argv, opt); 41 | util::Flatten flatten(opt.language); 42 | std::string line, normalized; 43 | UnicodeString str[2]; 44 | UnicodeString *cur = &str[0], *tmp = &str[1]; 45 | while (getline(std::cin, line)) { 46 | *cur = UnicodeString::fromUTF8(line); 47 | if (opt.lower) { 48 | 
cur->toLower(); 49 | } 50 | if (opt.flatten) { 51 | flatten.Apply(*cur, *tmp); 52 | std::swap(cur, tmp); 53 | } 54 | if (opt.normalize) { 55 | util::Normalize(*cur, *tmp); 56 | std::swap(cur, tmp); 57 | } 58 | std::cout << *str << '\n'; 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /preprocess/remove_invalid_utf8_base64_main.cc: -------------------------------------------------------------------------------- 1 | #include "util/file_stream.hh" 2 | #include "util/file_piece.hh" 3 | #include "util/utf8.hh" 4 | 5 | #include "base64.hh" 6 | 7 | int main() { 8 | util::FilePiece in(0); 9 | util::FileStream out(1); 10 | util::StringPiece line; 11 | std::string decoded; 12 | std::string empty_base64; 13 | preprocess::base64_encode("", empty_base64); 14 | while (in.ReadLineOrEOF(line)) { 15 | preprocess::base64_decode(line, decoded); 16 | if (util::IsUTF8(decoded)) { 17 | out << line << '\n'; 18 | } else { 19 | out << empty_base64 << '\n'; 20 | } 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /preprocess/remove_invalid_utf8_main.cc: -------------------------------------------------------------------------------- 1 | #include "util/file_stream.hh" 2 | #include "util/file_piece.hh" 3 | #include "util/utf8.hh" 4 | 5 | int main() { 6 | util::FilePiece in(0); 7 | util::FileStream out(1); 8 | util::StringPiece line; 9 | while (in.ReadLineOrEOF(line)) { 10 | if (util::IsUTF8(line)) { 11 | out << line << '\n'; 12 | } 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /preprocess/remove_long_lines_main.cc: -------------------------------------------------------------------------------- 1 | #include "util/file_stream.hh" 2 | #include "util/file_piece.hh" 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | int main(int argc, char *argv[]) { 10 | std::size_t limit; 11 | if (argc == 1) { 12 | limit = 2000; 13 | } else if (argc == 
2) { 14 | limit = boost::lexical_cast(argv[1]); 15 | } else { 16 | std::cerr << "Usage: " << argv[0] << " [length limit in bytes]" << std::endl; 17 | return 1; 18 | } 19 | util::FilePiece f(0, NULL, &std::cerr); 20 | util::FileStream out(1); 21 | try { 22 | while (true) { 23 | util::StringPiece l = f.ReadLine(); 24 | if (l.size() <= limit) { 25 | out << l << '\n'; 26 | } 27 | } 28 | } catch (const util::EndOfFileException &e) {} 29 | } 30 | -------------------------------------------------------------------------------- /preprocess/resplit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e -o pipefail 3 | BINDIR="$(dirname "$0")" 4 | #Argument 1 is language 5 | l="$1" 6 | if [ ${#l} == 0 ]; then 7 | echo "Argument is language" 1>&2 8 | exit 1 9 | fi 10 | sed 's/^/

\n/' | $BINDIR/../moses/ems/support/split-sentences.perl -l $1 |fgrep -vx "

" 11 | -------------------------------------------------------------------------------- /preprocess/shard_main.cc: -------------------------------------------------------------------------------- 1 | #include "preprocess/fields.hh" 2 | #include "util/buffered_stream.hh" 3 | #include "util/threaded_buffered_stream.hh" 4 | #include "util/file_piece.hh" 5 | #include "util/fixed_array.hh" 6 | #include "util/murmur_hash.hh" 7 | 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | 14 | namespace preprocess { 15 | 16 | struct Options { 17 | std::vector key_fields; 18 | char delim; 19 | std::vector outputs; 20 | util::WriteCompressed::Compression compression; 21 | }; 22 | 23 | void ParseArgs(int argc, char *argv[], Options &out) { 24 | namespace po = boost::program_options; 25 | po::options_description desc("Arguments"); 26 | std::string fields; 27 | std::string prefix; 28 | std::string compression_string; 29 | unsigned int number; 30 | 31 | desc.add_options() 32 | ("help,h", po::bool_switch(), "Show this help message") 33 | ("fields,f", po::value(&fields)->default_value("1-"), "Fields to use for key like cut -f") 34 | ("delim,d", po::value(&out.delim)->default_value('\t'), "Field delimiter") 35 | ("prefix,p", po::value(&prefix), "Prefix and count of outputs") 36 | ("number,n", po::value(&number), "Number of shards") 37 | ("output,o", po::value(&out.outputs)->multitoken(), "Output file names (or just list them without -o)") 38 | ("compress,c", po::value(&compression_string)->default_value("none"), "Compression. 
One of none, gzip, or bzip2"); 39 | 40 | po::positional_options_description pd; 41 | pd.add("output", -1); 42 | 43 | po::variables_map vm; 44 | po::store(po::command_line_parser(argc, argv).options(desc).positional(pd).run(), vm); 45 | if (argc == 1 || vm["help"].as()) { 46 | std::cerr << 47 | "Shards stdin into multiple files by the hash of the key.\n" << 48 | "Output is specified as --prefix prefix --number n or just listing file names.\n" << 49 | desc << 50 | "Examples:\n" << 51 | argv[0] << " a b #Shards stdin to files a and b using the whole line as key.\n" << 52 | argv[0] << " a b c #Shards stdin to files a, b, and c using the whole line as key.\n" << 53 | argv[0] << " -f 1 a b #Shards stdin to files a and b using tab-delimited field 1.\n" << 54 | argv[0] << " -d ' ' -f 1 a b #Shards stdin to files a and b using space-delimited field 1." << std::endl; 55 | exit(1); 56 | } 57 | po::notify(vm); 58 | 59 | ParseFields(fields.c_str(), out.key_fields); 60 | DefragmentFields(out.key_fields); 61 | 62 | if (out.outputs.empty()) { 63 | UTIL_THROW_IF2(!vm.count("prefix"), "Specify outputs using --outputs or e.g. --prefix pre --number 2"); 64 | UTIL_THROW_IF2(!vm.count("number"), "--prefix specified but we need to know how many shards with -n"); 65 | // How many digits will be in the 0-indexed representation? 
66 | unsigned int digits = 0; 67 | for (unsigned int compare = number - 1; compare; ++digits, compare /= 10) {} 68 | std::ostringstream stream; 69 | stream << std::setfill('0') << std::setw(digits); 70 | for (unsigned int i = 0; i < number; ++i) { 71 | stream << std::setw(digits) << i; 72 | out.outputs.push_back(prefix + stream.str()); 73 | stream.str(std::string()); 74 | stream.clear(); 75 | } 76 | } else { 77 | UTIL_THROW_IF2(vm.count("prefix"), "Specify --prefix or --output"); 78 | UTIL_THROW_IF2(vm.count("number") && number != out.outputs.size(), "Number of outputs does not match"); 79 | } 80 | if (compression_string == "none") { 81 | out.compression = util::WriteCompressed::NONE; 82 | } else if (compression_string == "gzip") { 83 | out.compression = util::WriteCompressed::GZIP; 84 | } else if (compression_string == "bzip2") { 85 | out.compression = util::WriteCompressed::BZIP; 86 | } else { 87 | UTIL_THROW(util::Exception, "Unknown compression algorithm " << compression_string); 88 | } 89 | } 90 | 91 | } // namespace preprocess 92 | 93 | int main(int argc, char *argv[]) { 94 | preprocess::Options options; 95 | preprocess::ParseArgs(argc, argv, options); 96 | uint64_t shard_count = options.outputs.size(); 97 | 98 | util::FilePiece in(0); 99 | util::StringPiece line; 100 | util::FixedArray > out(options.outputs.size()); 101 | std::string output(argv[1]); 102 | for (const std::string &o : options.outputs) { 103 | out.push_back(util::CreateOrThrow(o.c_str()), options.compression); 104 | } 105 | while (in.ReadLineOrEOF(line)) { 106 | preprocess::HashCallback cb; 107 | preprocess::RangeFields(line, options.key_fields, options.delim, cb); 108 | out[cb.Hash() % shard_count] << line << '\n'; 109 | } 110 | return 0; 111 | } 112 | -------------------------------------------------------------------------------- /preprocess/substitute_main.cc: -------------------------------------------------------------------------------- 1 | #include "preprocess/fields.hh" 2 | #include 
"util/file_stream.hh" 3 | #include "util/file_piece.hh" 4 | #include "util/murmur_hash.hh" 5 | #include "util/pool.hh" 6 | #include "util/probing_hash_table.hh" 7 | #include 8 | 9 | struct Entry { 10 | typedef uint64_t Key; 11 | Key key; 12 | uint64_t GetKey() const { return key; } 13 | void SetKey(uint64_t to) { key = to; } 14 | util::StringPiece value; 15 | }; 16 | 17 | class RecordCallback { 18 | public: 19 | RecordCallback(util::StringPiece *to) : i_(to) {} 20 | 21 | void operator()(util::StringPiece str) { 22 | *(i_++) = str; 23 | } 24 | 25 | const util::StringPiece *Position() const { return i_; } 26 | 27 | private: 28 | util::StringPiece *i_; 29 | }; 30 | 31 | int main() { 32 | std::vector fields; 33 | fields.resize(4); 34 | util::StringPiece segments[4]; 35 | fields[0].begin = 0; 36 | fields[0].end = 2; 37 | util::StringPiece &sentences = segments[1]; 38 | fields[1].begin = 2; 39 | fields[1].end = 4; 40 | util::StringPiece &value = segments[2]; 41 | fields[2].begin = 4; 42 | fields[2].end = 5; 43 | util::StringPiece &after = segments[3]; 44 | fields[3].begin = 5; 45 | fields[3].end = preprocess::FieldRange::kInfiniteEnd; 46 | 47 | util::Pool string_pool; 48 | util::FileStream out(1); 49 | 50 | typedef util::AutoProbing Table; 51 | Table table; 52 | for (util::StringPiece line : util::FilePiece(0)) { 53 | RecordCallback cb(segments); 54 | preprocess::RangeFields(line, fields, '\t', cb); 55 | UTIL_THROW_IF2(cb.Position() != segments + 4, "Did not get all fields in line " << line); 56 | Entry entry; 57 | entry.key = util::MurmurHashNative(sentences.data(), sentences.size()); 58 | Table::MutableIterator it; 59 | if (table.FindOrInsert(entry, it)) { 60 | out << util::StringPiece(line.data(), sentences.data() + sentences.size() - line.data()); 61 | out << '\t' << it->value << '\t'; 62 | out << after; 63 | } else { 64 | char *mem = static_cast(memcpy(string_pool.Allocate(value.size()), value.data(), value.size())); 65 | it->value = util::StringPiece(mem, 
value.size()); 66 | out << line; 67 | } 68 | out << '\n'; 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /preprocess/subtract_lines_main.cc: -------------------------------------------------------------------------------- 1 | #include "util/file_piece.hh" 2 | #include "util/file_stream.hh" 3 | #include "util/murmur_hash.hh" 4 | #include "util/probing_hash_table.hh" 5 | 6 | #include 7 | 8 | struct Entry { 9 | typedef uint64_t Key; 10 | uint64_t key; 11 | uint64_t GetKey() const { return key; } 12 | void SetKey(uint64_t to) { key = to; } 13 | }; 14 | 15 | int main(int argc, char *argv[]) { 16 | if (argc != 2) { 17 | std::cerr << "Usage: " << argv[0] << " subtract output\n" 18 | "Copies from stdin to stdout, skipping lines that appear in `subtract`.\n" 19 | "The subtraction is approximate, based on the hash of the line.\n" 20 | "This is set subtraction. All copies of a line are removed.\n"; 21 | return 1; 22 | } 23 | util::AutoProbing table; 24 | // Load subtraction into table. 25 | for (util::StringPiece line : util::FilePiece(argv[1])) { 26 | Entry entry; 27 | entry.key = util::MurmurHashNative(line.data(), line.size(), 1); 28 | util::AutoProbing::MutableIterator it; 29 | table.FindOrInsert(entry, it); 30 | } 31 | util::FileStream out(1); 32 | for (util::StringPiece line : util::FilePiece(0)) { 33 | uint64_t key = util::MurmurHashNative(line.data(), line.size(), 1); 34 | util::AutoProbing::ConstIterator it; 35 | if (!table.Find(key, it)) { 36 | out << line << '\n'; 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /preprocess/tests/cache/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . 
"$(dirname "$0")"/../vars 3 | diff <("$BIN"/cache cat <"$CUR"/input) "$CUR"/input 4 | diff <("$BIN"/cache -t " " -k 1 cat <"$CUR"/input) "$CUR"/space_expected 5 | 6 | -------------------------------------------------------------------------------- /preprocess/tests/cache/space_ref.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | lines = {} 4 | for l in sys.stdin: 5 | key = l[0:-1].split(' ')[0] 6 | if key in lines: 7 | sys.stdout.write(lines[key]) 8 | else: 9 | lines[key] = l 10 | sys.stdout.write(l) 11 | -------------------------------------------------------------------------------- /preprocess/tests/dedupe/columns: -------------------------------------------------------------------------------- 1 | 1 a 2 | 2 a 3 | 3 b 4 | 4 a 5 | 5 a 6 | 6 b 7 | 7 b 8 | -------------------------------------------------------------------------------- /preprocess/tests/dedupe/columns.out: -------------------------------------------------------------------------------- 1 | 1 a 2 | 3 b 3 | -------------------------------------------------------------------------------- /preprocess/tests/dedupe/ref.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | lines = set() 4 | for l in sys.stdin: 5 | if l not in lines: 6 | lines.add(l) 7 | sys.stdout.write(l) 8 | -------------------------------------------------------------------------------- /preprocess/tests/dedupe/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . 
"$(dirname "$0")"/../vars 3 | diff <("$BIN/dedupe" <"$CUR/input") "$CUR/expected" 4 | "$BIN"/dedupe "$CUR"/input <(rev "$CUR"/input) "$TMP"/output0 "$TMP"/output1 5 | diff "$CUR"/expected "$TMP"/output0 6 | diff <(rev "$CUR"/expected) "$TMP"/output1 7 | rm "$TMP"/output0 "$TMP"/output1 8 | diff <("$BIN"/dedupe -f 2 -d " " <"$CUR"/columns) "$CUR"/columns.out 9 | -------------------------------------------------------------------------------- /preprocess/tests/foldfilter/input: -------------------------------------------------------------------------------- 1 | ../../../COPYING -------------------------------------------------------------------------------- /preprocess/tests/foldfilter/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . "$(dirname "$0")"/../vars 3 | #GPL has short columns 4 | diff <("$BIN/foldfilter" cat <"$CUR"/input) "$CUR"/input 5 | diff <("$BIN/foldfilter" -w 10 cat <"$CUR"/input) "$CUR"/input 6 | "$BIN/foldfilter" -w 10 tee "$TMP/fold10" <"$CUR"/input >/dev/null 7 | # Line breaks are not great with leading space but it does work 8 | diff "$TMP/fold10" "$CUR/fold10.expected" 9 | rm "$TMP/fold10" 10 | -------------------------------------------------------------------------------- /preprocess/tests/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CURRENT="$(dirname "$0")" 3 | set -eo pipefail 4 | for i in "$CURRENT"/*/; do 5 | "${i}"run.sh || echo "FAILURE: ${i}" 1>&2 6 | done 7 | -------------------------------------------------------------------------------- /preprocess/tests/shard/input: -------------------------------------------------------------------------------- 1 | ../../../README.md -------------------------------------------------------------------------------- /preprocess/tests/shard/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . 
"$(dirname "$0")"/../vars 3 | "$BIN/shard" "$TMP"/test_a "$TMP"/test_b <"$CUR"/input 4 | diff <(sort "$TMP"/test_a "$TMP"/test_b) <(sort "$CUR"/input) 5 | "$BIN/shard" --prefix "$TMP"/test --number 4 <"$CUR"/input 6 | diff <(sort "$TMP"/test{0,1,2,3}) <(sort "$CUR"/input) 7 | "$BIN/shard" --prefix "$TMP"/test -c gzip --number 4 <"$CUR"/input 8 | diff <(zcat "$TMP"/test{0,1,2,3} |sort) <(sort "$CUR"/input) 9 | "$BIN/shard" --prefix "$TMP"/test -c bzip2 --number 4 <"$CUR"/input 10 | diff <(bzcat "$TMP"/test{0,1,2,3} |sort) <(sort "$CUR"/input) 11 | rm "$TMP"/test_a "$TMP"/test_b "$TMP"/test{0,1,2,3} 12 | -------------------------------------------------------------------------------- /preprocess/tests/vars: -------------------------------------------------------------------------------- 1 | set -eo pipefail 2 | CUR="$(dirname "$0")" 3 | BIN="${BIN:-"$CUR"/../../../build/bin}" 4 | TMP="$CUR" 5 | -------------------------------------------------------------------------------- /preprocess/text.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e -o pipefail 3 | BINDIR="$(dirname "$0")" 4 | #Argument 1 is language, argument 2 is lowercase (1) or not (0) 5 | l="$1" 6 | if [ ${#l} != 2 ]; then 7 | echo "Usage: \"$0 language lowercase\" where lowercase is 0 or 1." 1>&2 8 | exit 1 9 | fi 10 | if [ "$2" != 1 ] && [ "$2" != 0 ]; then 11 | echo "Second argument (lowercase) should be 0 or 1" 1>&2 12 | exit 1 13 | fi 14 | #If statement hack to only run process unicode if lowercasing. 
15 | "$BINDIR"/process_unicode --language $l --flatten --normalize |"$BINDIR"/../moses/tokenizer/tokenizer.perl -l $l | "$BINDIR"/heuristics.perl -l $l | if [ "$2" == 1 ]; then 16 | "$BINDIR"/../moses/tokenizer/normalize-punctuation.perl $l | "$BINDIR"/process_unicode --language $l --lower 17 | else 18 | "$BINDIR"/../moses/tokenizer/normalize-punctuation.perl $l 19 | fi 20 | -------------------------------------------------------------------------------- /preprocess/train_case_main.cc: -------------------------------------------------------------------------------- 1 | #include "util/file_stream.hh" 2 | #include "util/file_piece.hh" 3 | #include "util/murmur_hash.hh" 4 | #include "util/mutable_vocab.hh" 5 | #include "util/tokenize_piece.hh" 6 | #include "util/utf8.hh" 7 | #include "util/utf8_icu.hh" 8 | 9 | #include 10 | 11 | #include 12 | 13 | namespace { 14 | void SplitLine(util::FilePiece &from, std::vector &to) { 15 | to.clear(); 16 | for (util::TokenIter i(from.ReadLine(), ' '); i; ++i) { 17 | to.push_back(*i); 18 | } 19 | } 20 | 21 | class Recorder { 22 | public: 23 | void Add(util::StringPiece source, util::StringPiece target) { 24 | util::ToLower(target, lowered_); 25 | uint64_t key = util::MurmurHash64A(lowered_.data(), lowered_.size(), util::MurmurHash64A(source.data(), source.size())); 26 | ++map_[key][vocab_.FindOrInsert(target)]; 27 | } 28 | 29 | void Dump() { 30 | util::FileStream out(1); 31 | for (Map::const_iterator i = map_.begin(); i != map_.end(); ++i) { 32 | out << boost::lexical_cast(i->first); 33 | for (std::unordered_map::const_iterator j = i->second.begin(); j != i->second.end(); ++j) { 34 | out << '\t' << vocab_.String(j->first) << ' ' << j->second; 35 | } 36 | out << '\n'; 37 | } 38 | } 39 | 40 | private: 41 | util::MutableVocab vocab_; 42 | 43 | std::string lowered_; 44 | 45 | // map_[hash(lowered_target, hash(cased_source))][cased_target] = count(cased_source, cased_target) 46 | typedef std::unordered_map > Map; 47 | Map map_; 48 | }; 49 
} // namespace

// Usage: <program> alignment source target
// Walks the alignment file in step with the source and target text files,
// records every aligned (source word, target word) pair in a Recorder and
// finally dumps the table.  Sentence pairs whose token counts disagree with
// the lengths in the alignment header are skipped and counted as discarded.
int main(int argc, char *argv[]) {
  if (argc != 4) {
    std::cerr << "Usage: " << argv[0] << " alignment source target\n";
    return 1;
  }
  util::FilePiece align(argv[1], &std::cerr), source_file(argv[2]), target_file(argv[3]);
  std::vector source_words, target_words;
  Recorder recorder;
  std::size_t sentence = 0, discarded = 0;
  for (; ; ++sentence) {
    // End of the source text ends the loop; the other two files are expected
    // to have at least as many entries.
    try {
      SplitLine(source_file, source_words);
    } catch (const util::EndOfFileException &e) { break; }
    SplitLine(target_file, target_words);
    // parse comment line
    // "# sentence pair (0) source length"
    for (unsigned int i = 0; i < 6; ++i) {
      align.ReadDelimited();
    }
    unsigned long from_length = align.ReadULong();
    align.ReadDelimited(); align.ReadDelimited(); // target length
    unsigned long to_length = align.ReadULong();
    align.ReadLine(); // comment line ending

    align.ReadLine(); // uncased sentence
    util::StringPiece word(align.ReadDelimited());
    UTIL_THROW_IF2("NULL" != word, "Expected NULL at the beginning, not " << word);

    // Lengths in the alignment header must match the tokenized text; if not,
    // drop the rest of the alignment line and move on.
    if (from_length != source_words.size() || to_length != target_words.size()) {
      align.ReadLine(); // Complete line.
      ++discarded;
      continue;
    }

    // Skip the alignment set attached to the NULL word.
    while ("})" != align.ReadDelimited()) {}
    // Each iteration consumes "word ({ i j ... })"; the loop condition reads
    // the word and the first statement reads the opening "({".
    for (unsigned long from = 0; align.ReadWordSameLine(word); ++from) {
      align.ReadWordSameLine(word);
      UTIL_THROW_IF2(word != "({", "Expected ({ not " << word);
      UTIL_THROW_IF2(from >= source_words.size(), "Index " << from << " too high for source text at sentence " << sentence);
      for (align.SkipSpaces(); align.peek() != '}'; align.SkipSpaces()) {
        // An index of 0 wraps around here and is rejected by the range check
        // on the next line.
        unsigned long to = align.ReadULong() - 1 /* NULL word */;
        UTIL_THROW_IF2(to >= target_words.size(), "Index " << to << " too high for target text");
        // Throw out beginning of sentence.
        if (from != 0 && to != 0) {
          recorder.Add(source_words[from], target_words[to]);
        }
      }
      UTIL_THROW_IF2(align.ReadDelimited() != "})", "Expected })");
    }
    align.ReadLine(); // Complete line.
  }
  std::cerr << "Discarded " << discarded << "/" << sentence << std::endl;
  recorder.Dump();
}

--------------------------------------------------------------------------------
/preprocess/unescape_html.perl:
--------------------------------------------------------------------------------
#!/usr/bin/perl -w
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");

use HTML::Entities;
use utf8;

# Decode HTML entities in every input line and print the result.
# NOTE(review): the filehandle inside while() and the pattern of the
# substitution below are empty in this copy; text between angle brackets
# appears to have been stripped -- restore both from the upstream script.
while() {
  $str = decode_entities($_);
  $str =~ s// /g;
  print $str;
}

--------------------------------------------------------------------------------
/preprocess/vocab_main.cc:
--------------------------------------------------------------------------------
#include "util/file_piece.hh"
#include "util/file_stream.hh"
#include "util/murmur_hash.hh"
#include "util/probing_hash_table.hh"

#include

#include

#include

// Hash table entry that stores only the 64-bit hash of a word.
struct Entry {
  typedef uint64_t Key;
  uint64_t key;
  uint64_t GetKey() const { return key; }
  void SetKey(uint64_t to) { key = to; }
};


// Reads delimiter-separated words from a FilePiece opened on descriptor 0
// ("stdin") and writes each word, followed by a NUL byte, the first time its
// hash is inserted into `seen`.
int main() {
  // Bytes that separate words: NUL, tab, carriage return, newline and space.
  bool delimiters[256];
  memset(delimiters, 0, sizeof(delimiters));
  delimiters['\0'] = true;
  delimiters['\t'] = true;
  delimiters['\r'] = true;
  delimiters['\n'] = true;
  delimiters[' '] = true;

  // NOTE(review): the template arguments of util::AutoProbing are missing in
  // this copy (stripped angle brackets) -- restore from upstream.
  util::AutoProbing seen;

  util::FilePiece in(0, "stdin", &std::cerr);
  util::FileStream out(1);

  util::AutoProbing::MutableIterator it;
  Entry entry;

  // The loop ends when ReadDelimited throws at end of input.
  try { while (true) {
    util::StringPiece word = in.ReadDelimited(delimiters);
    entry.SetKey(util::MurmurHashNative(word.data(), word.size()));
    // A false return from FindOrInsert is treated as "not seen before".
    if (!seen.FindOrInsert(entry, it)) {
      out << word << '\0';
    }
  } }
catch (const util::EndOfFileException &e) {} 44 | } 45 | -------------------------------------------------------------------------------- /preprocess/warc.cc: -------------------------------------------------------------------------------- 1 | #include "preprocess/warc.hh" 2 | 3 | #include "util/exception.hh" 4 | #include "util/file.hh" 5 | #include "util/compress.hh" 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace preprocess { 13 | 14 | bool ReadMore(util::ReadCompressed &reader, std::string &out) { 15 | const std::size_t kRead = 4096; 16 | std::size_t had = out.size(); 17 | out.resize(out.size() + kRead); 18 | std::size_t got = reader.Read(&out[had], out.size() - had); 19 | if (!got) { 20 | // End of file 21 | UTIL_THROW_IF(had, util::EndOfFileException, "Unexpected end of file inside header"); 22 | return false; 23 | } 24 | out.resize(had + got); 25 | return true; 26 | } 27 | 28 | class HeaderReader { 29 | public: 30 | HeaderReader(util::ReadCompressed &reader, std::string &out) 31 | : reader_(reader), out_(out), consumed_(0) {} 32 | 33 | bool Line(util::StringPiece &line) { 34 | std::size_t newline_start = consumed_; 35 | std::size_t newline; 36 | while (std::string::npos == (newline = out_.find('\n', newline_start))) { 37 | newline_start = out_.size(); 38 | if (!ReadMore(reader_, out_)) return false; 39 | } 40 | // The line is [consumed, newline). A blank line indicates header end. 41 | line = util::StringPiece(out_.data() + consumed_, newline - consumed_); 42 | // Remove carriage return if present. 
43 | if (!line.empty() && line.data()[line.size() - 1] == '\r') { 44 | line = util::StringPiece(line.data(), line.size() - 1); 45 | } 46 | consumed_ = newline + 1; 47 | return true; 48 | } 49 | 50 | std::size_t Consumed() const { return consumed_; } 51 | 52 | private: 53 | util::ReadCompressed &reader_; 54 | std::string &out_; 55 | 56 | std::size_t consumed_; 57 | }; 58 | 59 | bool WARCReader::Read(std::string &out) { 60 | std::swap(overhang_, out); 61 | overhang_.clear(); 62 | out.reserve(32768); 63 | HeaderReader header(reader_, out); 64 | util::StringPiece line; 65 | if (!header.Line(line)) return false; 66 | UTIL_THROW_IF(line != "WARC/1.0", util::Exception, "Expected WARC/1.0 header but got `" << line << '\''); 67 | std::size_t length = 0; 68 | bool seen_content_length = false; 69 | const char kContentLength[] = "Content-Length:"; 70 | const std::size_t kContentLengthLength = sizeof(kContentLength) - 1; 71 | while (!line.empty()) { 72 | UTIL_THROW_IF(!header.Line(line), util::EndOfFileException, "WARC ended in header."); 73 | if (line.size() >= kContentLengthLength && !strncasecmp(line.data(), kContentLength, kContentLengthLength)) { 74 | UTIL_THROW_IF2(seen_content_length, "Two Content-Length headers?"); 75 | seen_content_length = true; 76 | char *end; 77 | length = std::strtoll(line.data() + kContentLengthLength, &end, 10); 78 | // TODO: tolerate whitespace? 79 | UTIL_THROW_IF2(end != line.data() + line.size(), "Content-Length parse error in `" << line << '\''); 80 | } 81 | } 82 | UTIL_THROW_IF2(!seen_content_length, "No Content-Length: header in " << out); 83 | std::size_t total_length = header.Consumed() + length + 4 /* CRLF CRLF after data as specified in the standard. 
*/; 84 | 85 | if (total_length < out.size()) { 86 | overhang_.assign(out.data() + total_length, out.size() - total_length); 87 | out.resize(total_length); 88 | } else { 89 | std::size_t start = out.size(); 90 | out.resize(total_length); 91 | while (start != out.size()) { 92 | std::size_t got = reader_.Read(&out[start], out.size() - start); 93 | UTIL_THROW_IF(!got, util::EndOfFileException, "Unexpected end of file while reading content of length " << length); 94 | start += got; 95 | } 96 | } 97 | // Check CRLF CRLF. 98 | UTIL_THROW_IF2(util::StringPiece(out.data() + out.size() - 4, 4) != util::StringPiece("\r\n\r\n", 4), "End of WARC record missing CRLF CRLF"); 99 | return true; 100 | } 101 | 102 | } // namespace preprocess 103 | -------------------------------------------------------------------------------- /preprocess/warc.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "util/compress.hh" 4 | 5 | #include 6 | 7 | namespace preprocess { 8 | 9 | class WARCReader { 10 | public: 11 | explicit WARCReader(int fd) : reader_(fd) {} 12 | 13 | bool Read(std::string &out); 14 | 15 | private: 16 | util::ReadCompressed reader_; 17 | 18 | std::string overhang_; 19 | }; 20 | 21 | } // namespace preprocess 22 | -------------------------------------------------------------------------------- /util/buffered_stream.hh: -------------------------------------------------------------------------------- 1 | /* A buffered output stream. 2 | * The Writer class has this interface. 
3 | * class Writer { 4 | * private: 5 | * void write(const void *data, size_t amount); 6 | * void flush(); 7 | * }; 8 | */ 9 | #ifndef UTIL_BUFFERED_STREAM_H 10 | #define UTIL_BUFFERED_STREAM_H 11 | 12 | #include "util/fake_ostream.hh" 13 | #include "util/file.hh" 14 | #include "util/scoped.hh" 15 | 16 | #include 17 | #include 18 | 19 | #include 20 | 21 | namespace util { 22 | 23 | template class BufferedStream : public FakeOStream > { 24 | public: 25 | const std::size_t kBufferSize = std::max(8192, kToStringMaxBytes); 26 | template explicit BufferedStream(Args&&... args) 27 | : buf_(kBufferSize), 28 | current_(static_cast(buf_.get())), 29 | end_(current_ + kBufferSize), 30 | writer_(std::forward(args)...) {} 31 | 32 | /* The source of the move is left in an unusable state that can only be destroyed. */ 33 | #if __cplusplus >= 201103L 34 | BufferedStream(BufferedStream &&from) noexcept : buf_(std::move(from.buf_)), current_(from.current_), end_(from.end_) { 35 | from.end_ = reinterpret_cast(from.buf_.get()); 36 | from.current_ = from.end_; 37 | } 38 | #endif 39 | 40 | ~BufferedStream() { 41 | flush(); 42 | } 43 | 44 | BufferedStream &flush() { 45 | SpillBuffer(); 46 | writer_.flush(); 47 | return *this; 48 | } 49 | 50 | // For writes of arbitrary size. 51 | BufferedStream &write(const void *data, std::size_t length) { 52 | if (UTIL_LIKELY(current_ + length <= end_)) { 53 | std::memcpy(current_, data, length); 54 | current_ += length; 55 | return *this; 56 | } 57 | SpillBuffer(); 58 | if (current_ + length <= end_) { 59 | std::memcpy(current_, data, length); 60 | current_ += length; 61 | } else { 62 | writer_.write(data, length); 63 | } 64 | return *this; 65 | } 66 | 67 | private: 68 | friend class FakeOStream >; 69 | // For writes directly to buffer guaranteed to have amount < buffer size. 
70 | char *Ensure(std::size_t amount) { 71 | if (UTIL_UNLIKELY(current_ + amount > end_)) { 72 | SpillBuffer(); 73 | assert(current_ + amount <= end_); 74 | } 75 | return current_; 76 | } 77 | 78 | void AdvanceTo(char *to) { 79 | current_ = to; 80 | assert(current_ <= end_); 81 | } 82 | 83 | void SpillBuffer() { 84 | if (current_ != buf_.get()) { 85 | writer_.write(buf_.get(), current_ - (char*)buf_.get()); 86 | current_ = static_cast(buf_.get()); 87 | } 88 | } 89 | 90 | util::scoped_malloc buf_; 91 | char *current_, *end_; 92 | Writer writer_; 93 | }; 94 | 95 | } // namespace util 96 | 97 | #endif 98 | -------------------------------------------------------------------------------- /util/cat_compressed_main.cc: -------------------------------------------------------------------------------- 1 | // Like cat but interprets compressed files. 2 | #include "util/file.hh" 3 | #include "util/read_compressed.hh" 4 | 5 | #include 6 | #include 7 | 8 | namespace { 9 | const std::size_t kBufSize = 16384; 10 | void Copy(util::ReadCompressed &from, int to) { 11 | util::scoped_malloc buffer(util::MallocOrThrow(kBufSize)); 12 | while (std::size_t amount = from.Read(buffer.get(), kBufSize)) { 13 | util::WriteOrThrow(to, buffer.get(), amount); 14 | } 15 | } 16 | } // namespace 17 | 18 | int main(int argc, char *argv[]) { 19 | // Lane Schwartz likes -h and --help 20 | for (int i = 1; i < argc; ++i) { 21 | char *arg = argv[i]; 22 | if (!strcmp(arg, "--")) break; 23 | if (!strcmp(arg, "-h") || !strcmp(arg, "--help")) { 24 | std::cerr << 25 | "A cat implementation that interprets compressed files.\n" 26 | "Usage: " << argv[0] << " [file1] [file2] ...\n" 27 | "If no file is provided, then stdin is read.\n"; 28 | return 1; 29 | } 30 | } 31 | 32 | try { 33 | if (argc == 1) { 34 | util::ReadCompressed in(0); 35 | Copy(in, 1); 36 | } else { 37 | for (int i = 1; i < argc; ++i) { 38 | util::ReadCompressed in(util::OpenReadOrThrow(argv[i])); 39 | Copy(in, 1); 40 | } 41 | } 42 | } catch (const 
std::exception &e) {
    std::cerr << e.what() << std::endl;
    return 2;
  }
  return 0;
}

--------------------------------------------------------------------------------
/util/compress.hh:
--------------------------------------------------------------------------------
#ifndef UTIL_COMPRESS_H
#define UTIL_COMPRESS_H

#include "util/exception.hh"
#include "util/file.hh"
#include "util/scoped.hh"

#include
#include
#include

namespace util {

// Base class of the format-specific exceptions declared below.
class CompressedException : public Exception {
  public:
    CompressedException() throw();
    virtual ~CompressedException() throw();
};

class GZException : public CompressedException {
  public:
    GZException() throw();
    ~GZException() throw();
};

class BZException : public CompressedException {
  public:
    BZException() throw();
    ~BZException() throw();
};

class XZException : public CompressedException {
  public:
    XZException() throw();
    ~XZException() throw();
};

class ReadCompressed;

// Interface of the reader implementations used by ReadCompressed.  The
// `thunk` argument is the owning ReadCompressed; the protected helpers give
// implementations access to its private state (ReadBase is a friend).
class ReadBase {
  public:
    virtual ~ReadBase() {}

    virtual std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) = 0;

  protected:
    static void ReplaceThis(ReadBase *with, ReadCompressed &thunk);

    ReadBase *Current(ReadCompressed &thunk);

    static uint64_t &ReadCount(ReadCompressed &thunk);
};

// Reader front end; the actual work is delegated to the ReadBase held in
// internal_.
class ReadCompressed {
  public:
    static const std::size_t kMagicSize = 6;
    // Must have at least kMagicSize bytes.
    static bool DetectCompressedMagic(const void *from);

    // Takes ownership of fd.
    explicit ReadCompressed(int fd);

    // Try to avoid using this. Use the fd instead.
    // There is no decompression support for istreams.
    explicit ReadCompressed(std::istream &in);

    // Must call Reset later.
    ReadCompressed();

    // Takes ownership of fd.
    void Reset(int fd);

    // Same advice as the constructor.
    void Reset(std::istream &in);

    // Returns the number of bytes stored in `to`.  Callers in this code base
    // (Copy in cat_compressed_main.cc, ReadMore in warc.cc) treat a return
    // value of 0 as end of input.
    std::size_t Read(void *to, std::size_t amount);

    // Repeatedly call read to fill a buffer unless EOF is hit.
    // Return number of bytes read.
    std::size_t ReadOrEOF(void *const to, std::size_t amount);

    // NOTE(review): presumably the count of raw (still compressed) bytes
    // consumed so far -- confirm in compress.cc.
    uint64_t RawAmount() const { return raw_amount_; }

  private:
    friend class ReadBase;

    // NOTE(review): the pointee type is missing in this copy (stripped
    // angle brackets).
    scoped_ptr internal_;

    uint64_t raw_amount_;
};

// Interface of the writer implementations used by WriteCompressed.
class WriteBase {
  public:
    virtual ~WriteBase();

    virtual void write(const void *data, std::size_t amount) = 0;

    virtual void flush() = 0;

  protected:
    WriteBase();
};

/* Currently xzip is missing */
class WriteCompressed {
  public:
    enum Compression { NONE, GZIP, BZIP, XZIP };
    // Takes ownership of fd.
    explicit WriteCompressed(int fd, Compression compression);

    ~WriteCompressed();

    void write(const void *data, std::size_t amount);

    void flush();

  private:
    // NOTE(review): the pointee type is missing in this copy (stripped
    // angle brackets).
    scoped_ptr backend_;
};

// Very basic gzip compression support. Normally this would involve streams
// but I needed the compression in the thread with fused output.
// `level` defaults to 9 (presumably the zlib compression level -- confirm in
// compress.cc).
void GZCompress(StringPiece from, std::string &to, int level = 9);

} // namespace util

#endif // UTIL_COMPRESS_H

--------------------------------------------------------------------------------
/util/double-conversion/CMakeLists.txt:
--------------------------------------------------------------------------------
# This CMake file was created by Lane Schwartz

# Explicitly list the source files for this subdirectory
#
# If you add any source files to this subdirectory
#    that should be included in the kenlm library,
#        (this excludes any unit test files)
#    you should add them to the following list:
#
# NOTE(review): the variable below carries a PREPROCESS_ prefix, so "kenlm
# library" above is presumably wording inherited from kenlm's build files --
# confirm which library the parent CMakeLists.txt actually builds from it.
#
# In order to allow CMake files in the parent directory
# to see this variable definition, we set PARENT_SCOPE.
#
# In order to set correct paths to these files
# when this variable is referenced by CMake files in the parent directory,
# we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}.
#
set(PREPROCESS_UTIL_DOUBLECONVERSION_SOURCE
    ${CMAKE_CURRENT_SOURCE_DIR}/bignum-dtoa.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/bignum.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/cached-powers.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/diy-fp.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/double-conversion.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/fast-dtoa.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/fixed-dtoa.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/strtod.cc
  PARENT_SCOPE)


--------------------------------------------------------------------------------
/util/double-conversion/Jamfile:
--------------------------------------------------------------------------------
fakelib double-conversion : [ glob *.cc ] : : : . ;

--------------------------------------------------------------------------------
/util/double-conversion/LICENSE:
--------------------------------------------------------------------------------
Copyright 2006-2011, the V8 project authors. All rights reserved.
2 | Redistribution and use in source and binary forms, with or without 3 | modification, are permitted provided that the following conditions are 4 | met: 5 | 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above 9 | copyright notice, this list of conditions and the following 10 | disclaimer in the documentation and/or other materials provided 11 | with the distribution. 12 | * Neither the name of Google Inc. nor the names of its 13 | contributors may be used to endorse or promote products derived 14 | from this software without specific prior written permission. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /util/double-conversion/bignum-dtoa.h: -------------------------------------------------------------------------------- 1 | // Copyright 2010 the V8 project authors. All rights reserved. 
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#ifndef DOUBLE_CONVERSION_BIGNUM_DTOA_H_
#define DOUBLE_CONVERSION_BIGNUM_DTOA_H_

#include "utils.h"

namespace double_conversion {

enum BignumDtoaMode {
  // Return the shortest correct representation.
  // For example the output of 0.299999999999999988897 is (the less accurate but
  // correct) 0.3.
  BIGNUM_DTOA_SHORTEST,
  // Same as BIGNUM_DTOA_SHORTEST but for single-precision floats.
  BIGNUM_DTOA_SHORTEST_SINGLE,
  // Return a fixed number of digits after the decimal point.
  // For instance fixed(0.1, 4) becomes 0.1000
  // If the input number is big, the output will be big.
  BIGNUM_DTOA_FIXED,
  // Return a fixed number of digits, no matter what the exponent is.
  BIGNUM_DTOA_PRECISION
};

// Converts the given double 'v' to ascii.
// The result should be interpreted as buffer * 10^(point-length).
// The buffer will be null-terminated.
//
// The input v must be > 0 and different from NaN, and Infinity.
//
// The output depends on the given mode:
//  - SHORTEST: produce the least amount of digits for which the internal
//   identity requirement is still satisfied. If the digits are printed
//   (together with the correct exponent) then reading this number will give
//   'v' again. The buffer will choose the representation that is closest to
//   'v'. If there are two at the same distance, then the number is rounded up.
//   In this mode the 'requested_digits' parameter is ignored.
//  - FIXED: produces digits necessary to print a given number with
//   'requested_digits' digits after the decimal point. The produced digits
//   might be too short in which case the caller has to fill the gaps with '0's.
//   Example: toFixed(0.001, 5) is allowed to return buffer="1", point=-2.
//   Halfway cases are rounded up. The call toFixed(0.15, 2) thus returns
//     buffer="2", point=0.
//   Note: the length of the returned buffer has no meaning wrt the significance
//   of its digits. That is, just because it contains '0's does not mean that
//   any other digit would not satisfy the internal identity requirement.
//  - PRECISION: produces 'requested_digits' where the first digit is not '0'.
//   Even though the length of produced digits usually equals
//   'requested_digits', the function is allowed to return fewer digits, in
//   which case the caller has to fill the missing digits with '0's.
//   Halfway cases are again rounded up.
// 'BignumDtoa' expects the given buffer to be big enough to hold all digits
// and a terminating null-character.
// NOTE(review): the element type of 'Vector' is missing in this copy (text
// between angle brackets appears to have been stripped) -- restore from
// upstream.
void BignumDtoa(double v, BignumDtoaMode mode, int requested_digits,
                Vector buffer, int* length, int* point);

}  // namespace double_conversion

#endif  // DOUBLE_CONVERSION_BIGNUM_DTOA_H_

--------------------------------------------------------------------------------
/util/double-conversion/cached-powers.h:
--------------------------------------------------------------------------------
// Copyright 2010 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#ifndef DOUBLE_CONVERSION_CACHED_POWERS_H_
#define DOUBLE_CONVERSION_CACHED_POWERS_H_

#include "diy-fp.h"

namespace double_conversion {

// Static lookup interface for a table of precomputed powers of ten, returned
// as DiyFp values through output parameters.
class PowersOfTenCache {
 public:

  // Not all powers of ten are cached. The decimal exponent of two neighboring
  // cached numbers will differ by kDecimalExponentDistance.
  static const int kDecimalExponentDistance;

  static const int kMinDecimalExponent;
  static const int kMaxDecimalExponent;

  // Returns a cached power-of-ten with a binary exponent in the range
  // [min_exponent; max_exponent] (boundaries included).
  // The result is written to *power and *decimal_exponent.
  static void GetCachedPowerForBinaryExponentRange(int min_exponent,
                                                   int max_exponent,
                                                   DiyFp* power,
                                                   int* decimal_exponent);

  // Returns a cached power of ten x ~= 10^k such that
  //   k <= decimal_exponent < k + kCachedPowersDecimalDistance.
  // The given decimal_exponent must satisfy
  //   kMinDecimalExponent <= requested_exponent, and
  //   requested_exponent < kMaxDecimalExponent + kDecimalExponentDistance.
  // NOTE(review): no kCachedPowersDecimalDistance is declared in this class;
  // presumably kDecimalExponentDistance is meant, and "decimal_exponent"
  // refers to the requested_exponent parameter -- confirm against
  // cached-powers.cc.
  static void GetCachedPowerForDecimalExponent(int requested_exponent,
                                               DiyFp* power,
                                               int* found_exponent);
};

}  // namespace double_conversion

#endif  // DOUBLE_CONVERSION_CACHED_POWERS_H_

--------------------------------------------------------------------------------
/util/double-conversion/diy-fp.cc:
--------------------------------------------------------------------------------
// Copyright 2010 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT 20 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | 29 | #include "diy-fp.h" 30 | #include "utils.h" 31 | 32 | namespace double_conversion { 33 | 34 | void DiyFp::Multiply(const DiyFp& other) { 35 | // Simply "emulates" a 128 bit multiplication. 36 | // However: the resulting number only contains 64 bits. The least 37 | // significant 64 bits are only used for rounding the most significant 64 38 | // bits. 39 | const uint64_t kM32 = 0xFFFFFFFFU; 40 | uint64_t a = f_ >> 32; 41 | uint64_t b = f_ & kM32; 42 | uint64_t c = other.f_ >> 32; 43 | uint64_t d = other.f_ & kM32; 44 | uint64_t ac = a * c; 45 | uint64_t bc = b * c; 46 | uint64_t ad = a * d; 47 | uint64_t bd = b * d; 48 | uint64_t tmp = (bd >> 32) + (ad & kM32) + (bc & kM32); 49 | // By adding 1U << 31 to tmp we round the final result. 50 | // Halfway cases will be round up. 51 | tmp += 1U << 31; 52 | uint64_t result_f = ac + (ad >> 32) + (bc >> 32) + (tmp >> 32); 53 | e_ += other.e_ + 64; 54 | f_ = result_f; 55 | } 56 | 57 | } // namespace double_conversion 58 | -------------------------------------------------------------------------------- /util/double-conversion/diy-fp.h: -------------------------------------------------------------------------------- 1 | // Copyright 2010 the V8 project authors. All rights reserved. 
2 | // Redistribution and use in source and binary forms, with or without 3 | // modification, are permitted provided that the following conditions are 4 | // met: 5 | // 6 | // * Redistributions of source code must retain the above copyright 7 | // notice, this list of conditions and the following disclaimer. 8 | // * Redistributions in binary form must reproduce the above 9 | // copyright notice, this list of conditions and the following 10 | // disclaimer in the documentation and/or other materials provided 11 | // with the distribution. 12 | // * Neither the name of Google Inc. nor the names of its 13 | // contributors may be used to endorse or promote products derived 14 | // from this software without specific prior written permission. 15 | // 16 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | #ifndef DOUBLE_CONVERSION_DIY_FP_H_ 29 | #define DOUBLE_CONVERSION_DIY_FP_H_ 30 | 31 | #include "utils.h" 32 | 33 | namespace double_conversion { 34 | 35 | // This "Do It Yourself Floating Point" class implements a floating-point number 36 | // with a uint64 significand and an int exponent. Normalized DiyFp numbers will 37 | // have the most significant bit of the significand set. 
38 | // Multiplication and Subtraction do not normalize their results. 39 | // DiyFp are not designed to contain special doubles (NaN and Infinity). 40 | class DiyFp { 41 | public: 42 | static const int kSignificandSize = 64; 43 | 44 | DiyFp() : f_(0), e_(0) {} 45 | DiyFp(uint64_t f, int e) : f_(f), e_(e) {} 46 | 47 | // this = this - other. 48 | // The exponents of both numbers must be the same and the significand of this 49 | // must be bigger than the significand of other. 50 | // The result will not be normalized. 51 | void Subtract(const DiyFp& other) { 52 | ASSERT(e_ == other.e_); 53 | ASSERT(f_ >= other.f_); 54 | f_ -= other.f_; 55 | } 56 | 57 | // Returns a - b. 58 | // The exponents of both numbers must be the same and this must be bigger 59 | // than other. The result will not be normalized. 60 | static DiyFp Minus(const DiyFp& a, const DiyFp& b) { 61 | DiyFp result = a; 62 | result.Subtract(b); 63 | return result; 64 | } 65 | 66 | 67 | // this = this * other. 68 | void Multiply(const DiyFp& other); 69 | 70 | // returns a * b; 71 | static DiyFp Times(const DiyFp& a, const DiyFp& b) { 72 | DiyFp result = a; 73 | result.Multiply(b); 74 | return result; 75 | } 76 | 77 | void Normalize() { 78 | ASSERT(f_ != 0); 79 | uint64_t f = f_; 80 | int e = e_; 81 | 82 | // This method is mainly called for normalizing boundaries. In general 83 | // boundaries need to be shifted by 10 bits. We thus optimize for this case. 
84 | const uint64_t k10MSBits = UINT64_2PART_C(0xFFC00000, 00000000); 85 | while ((f & k10MSBits) == 0) { 86 | f <<= 10; 87 | e -= 10; 88 | } 89 | while ((f & kUint64MSB) == 0) { 90 | f <<= 1; 91 | e--; 92 | } 93 | f_ = f; 94 | e_ = e; 95 | } 96 | 97 | static DiyFp Normalize(const DiyFp& a) { 98 | DiyFp result = a; 99 | result.Normalize(); 100 | return result; 101 | } 102 | 103 | uint64_t f() const { return f_; } 104 | int e() const { return e_; } 105 | 106 | void set_f(uint64_t new_value) { f_ = new_value; } 107 | void set_e(int new_value) { e_ = new_value; } 108 | 109 | private: 110 | static const uint64_t kUint64MSB = UINT64_2PART_C(0x80000000, 00000000); 111 | 112 | uint64_t f_; 113 | int e_; 114 | }; 115 | 116 | } // namespace double_conversion 117 | 118 | #endif // DOUBLE_CONVERSION_DIY_FP_H_ 119 | -------------------------------------------------------------------------------- /util/double-conversion/fast-dtoa.h: -------------------------------------------------------------------------------- 1 | // Copyright 2010 the V8 project authors. All rights reserved. 2 | // Redistribution and use in source and binary forms, with or without 3 | // modification, are permitted provided that the following conditions are 4 | // met: 5 | // 6 | // * Redistributions of source code must retain the above copyright 7 | // notice, this list of conditions and the following disclaimer. 8 | // * Redistributions in binary form must reproduce the above 9 | // copyright notice, this list of conditions and the following 10 | // disclaimer in the documentation and/or other materials provided 11 | // with the distribution. 12 | // * Neither the name of Google Inc. nor the names of its 13 | // contributors may be used to endorse or promote products derived 14 | // from this software without specific prior written permission. 
// Selects how FastDtoa chooses the digits it produces.
enum FastDtoaMode {
  // Shortest representation of the input. Among representations of that
  // length the most accurate one is returned; longer ones may be more
  // accurate still.
  FAST_DTOA_SHORTEST,
  // As FAST_DTOA_SHORTEST, for single-precision floats.
  FAST_DTOA_SHORTEST_SINGLE,
  // The caller supplies the precision (digit count), which is independent
  // of where the decimal point falls.
  FAST_DTOA_PRECISION
};

// Upper bound on the number of digits FastDtoa produces, not counting the
// terminating '\0' character.
static const int kFastDtoaMaximalLength = 17;
// The same bound for single-precision numbers.
static const int kFastDtoaMaximalSingleLength = 9;
55 | // 56 | // Precondition: 57 | // * v must be a strictly positive finite double. 58 | // 59 | // Returns true if it succeeds, otherwise the result can not be trusted. 60 | // There will be *length digits inside the buffer followed by a null terminator. 61 | // If the function returns true and mode equals 62 | // - FAST_DTOA_SHORTEST, then 63 | // the parameter requested_digits is ignored. 64 | // The result satisfies 65 | // v == (double) (buffer * 10^(point - length)). 66 | // The digits in the buffer are the shortest representation possible. E.g. 67 | // if 0.099999999999 and 0.1 represent the same double then "1" is returned 68 | // with point = 0. 69 | // The last digit will be closest to the actual v. That is, even if several 70 | // digits might correctly yield 'v' when read again, the buffer will contain 71 | // the one closest to v. 72 | // - FAST_DTOA_PRECISION, then 73 | // the buffer contains requested_digits digits. 74 | // the difference v - (buffer * 10^(point-length)) is closest to zero for 75 | // all possible representations of requested_digits digits. 76 | // If there are two values that are equally close, then FastDtoa returns 77 | // false. 78 | // For both modes the buffer must be large enough to hold the result. 79 | bool FastDtoa(double d, 80 | FastDtoaMode mode, 81 | int requested_digits, 82 | Vector buffer, 83 | int* length, 84 | int* decimal_point); 85 | 86 | } // namespace double_conversion 87 | 88 | #endif // DOUBLE_CONVERSION_FAST_DTOA_H_ 89 | -------------------------------------------------------------------------------- /util/double-conversion/fixed-dtoa.h: -------------------------------------------------------------------------------- 1 | // Copyright 2010 the V8 project authors. All rights reserved. 
2 | // Redistribution and use in source and binary forms, with or without 3 | // modification, are permitted provided that the following conditions are 4 | // met: 5 | // 6 | // * Redistributions of source code must retain the above copyright 7 | // notice, this list of conditions and the following disclaimer. 8 | // * Redistributions in binary form must reproduce the above 9 | // copyright notice, this list of conditions and the following 10 | // disclaimer in the documentation and/or other materials provided 11 | // with the distribution. 12 | // * Neither the name of Google Inc. nor the names of its 13 | // contributors may be used to endorse or promote products derived 14 | // from this software without specific prior written permission. 15 | // 16 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | #ifndef DOUBLE_CONVERSION_FIXED_DTOA_H_ 29 | #define DOUBLE_CONVERSION_FIXED_DTOA_H_ 30 | 31 | #include "utils.h" 32 | 33 | namespace double_conversion { 34 | 35 | // Produces digits necessary to print a given number with 36 | // 'fractional_count' digits after the decimal point. 37 | // The buffer must be big enough to hold the result plus one terminating null 38 | // character. 
39 | // 40 | // The produced digits might be too short in which case the caller has to fill 41 | // the gaps with '0's. 42 | // Example: FastFixedDtoa(0.001, 5, ...) is allowed to return buffer = "1", and 43 | // decimal_point = -2. 44 | // Halfway cases are rounded towards +/-Infinity (away from 0). The call 45 | // FastFixedDtoa(0.15, 2, ...) thus returns buffer = "2", decimal_point = 0. 46 | // The returned buffer may contain digits that would be truncated from the 47 | // shortest representation of the input. 48 | // 49 | // This method only works for some parameters. If it can't handle the input it 50 | // returns false. The output is null-terminated when the function succeeds. 51 | bool FastFixedDtoa(double v, int fractional_count, 52 | Vector buffer, int* length, int* decimal_point); 53 | 54 | } // namespace double_conversion 55 | 56 | #endif // DOUBLE_CONVERSION_FIXED_DTOA_H_ 57 | -------------------------------------------------------------------------------- /util/double-conversion/strtod.h: -------------------------------------------------------------------------------- 1 | // Copyright 2010 the V8 project authors. All rights reserved. 2 | // Redistribution and use in source and binary forms, with or without 3 | // modification, are permitted provided that the following conditions are 4 | // met: 5 | // 6 | // * Redistributions of source code must retain the above copyright 7 | // notice, this list of conditions and the following disclaimer. 8 | // * Redistributions in binary form must reproduce the above 9 | // copyright notice, this list of conditions and the following 10 | // disclaimer in the documentation and/or other materials provided 11 | // with the distribution. 12 | // * Neither the name of Google Inc. nor the names of its 13 | // contributors may be used to endorse or promote products derived 14 | // from this software without specific prior written permission. 
15 | // 16 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | #ifndef DOUBLE_CONVERSION_STRTOD_H_ 29 | #define DOUBLE_CONVERSION_STRTOD_H_ 30 | 31 | #include "utils.h" 32 | 33 | namespace double_conversion { 34 | 35 | // The buffer must only contain digits in the range [0-9]. It must not 36 | // contain a dot or a sign. It must not start with '0', and must not be empty. 37 | double Strtod(Vector buffer, int exponent); 38 | 39 | // The buffer must only contain digits in the range [0-9]. It must not 40 | // contain a dot or a sign. It must not start with '0', and must not be empty. 
41 | float Strtof(Vector buffer, int exponent); 42 | 43 | } // namespace double_conversion 44 | 45 | #endif // DOUBLE_CONVERSION_STRTOD_H_ 46 | -------------------------------------------------------------------------------- /util/ersatz_progress.cc: -------------------------------------------------------------------------------- 1 | #include "util/ersatz_progress.hh" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace util { 9 | 10 | namespace { const unsigned char kWidth = 100; } 11 | 12 | const char kProgressBanner[] = "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n"; 13 | 14 | ErsatzProgress::ErsatzProgress() : current_(0), next_(std::numeric_limits::max()), complete_(next_), out_(NULL) {} 15 | 16 | ErsatzProgress::~ErsatzProgress() { 17 | if (out_) Finished(); 18 | } 19 | 20 | ErsatzProgress::ErsatzProgress(uint64_t complete, std::ostream *to, const std::string &message) 21 | : current_(0), next_(complete / kWidth), complete_(complete), stones_written_(0), out_(to) { 22 | if (!out_) { 23 | next_ = std::numeric_limits::max(); 24 | return; 25 | } 26 | if (!message.empty()) *out_ << message << '\n'; 27 | *out_ << kProgressBanner; 28 | } 29 | 30 | void ErsatzProgress::Milestone() { 31 | if (!out_) { current_ = 0; return; } 32 | if (!complete_) return; 33 | unsigned char stone = std::min(static_cast(kWidth), (current_ * kWidth) / complete_); 34 | 35 | for (; stones_written_ < stone; ++stones_written_) { 36 | (*out_) << '*'; 37 | } 38 | if (stone == kWidth) { 39 | (*out_) << std::endl; 40 | next_ = std::numeric_limits::max(); 41 | out_ = NULL; 42 | } else { 43 | next_ = std::max(next_, ((stone + 1) * complete_ + kWidth - 1) / kWidth); 44 | } 45 | } 46 | 47 | } // namespace util 48 | -------------------------------------------------------------------------------- /util/ersatz_progress.hh: -------------------------------------------------------------------------------- 1 | #ifndef 
// Stand-in for boost::progress so that the core language model does not need
// boost. It can also be told to print nothing at all.

extern const char kProgressBanner[];

class ErsatzProgress {
  public:
    // Produces no output.
    ErsatzProgress();

    // Reports progress towards `complete` on *to. Passing NULL means no
    // output, which is handy when forwarding an optional ostream pointer
    // received from another caller.
    explicit ErsatzProgress(uint64_t complete, std::ostream *to = &std::cerr, const std::string &message = "");

#if __cplusplus >= 201103L
    // Takes over the state of `from`, then detaches `from` from its stream
    // and sets its next milestone to the largest uint64_t.
    ErsatzProgress(ErsatzProgress &&from) noexcept
      : current_(from.current_),
        next_(from.next_),
        complete_(from.complete_),
        stones_written_(from.stones_written_),
        out_(from.out_) {
      from.next_ = (uint64_t)-1;
      from.out_ = nullptr;
    }
#endif

    ~ErsatzProgress();

    // Advance by one unit.
    ErsatzProgress &operator++() {
      ++current_;
      if (current_ >= next_) Milestone();
      return *this;
    }

    // Advance by `amount` units.
    ErsatzProgress &operator+=(uint64_t amount) {
      current_ += amount;
      if (current_ >= next_) Milestone();
      return *this;
    }

    // Set the absolute position.
    void Set(uint64_t to) {
      current_ = to;
      if (current_ >= next_) Milestone();
    }

    // Set the position to the completion value.
    void Finished() {
      Set(complete_);
    }

  private:
    // Runs whenever current_ reaches next_. Defined out of line.
    void Milestone();

    uint64_t current_, next_, complete_;
    unsigned char stones_written_;
    std::ostream *out_;

    // noncopyable: declared but never defined.
    ErsatzProgress(const ErsatzProgress &other);
    ErsatzProgress &operator=(const ErsatzProgress &other);
};
8 | #include 9 | 10 | #if defined(_WIN32) || defined(_WIN64) 11 | #include 12 | #include 13 | #endif 14 | 15 | namespace util { 16 | 17 | Exception::Exception() throw() {} 18 | Exception::~Exception() throw() {} 19 | 20 | void Exception::SetLocation(const char *file, unsigned int line, const char *func, const char *child_name, const char *condition) { 21 | /* The child class might have set some text, but we want this to come first. 22 | * Another option would be passing this information to the constructor, but 23 | * then child classes would have to accept constructor arguments and pass 24 | * them down. 25 | */ 26 | std::string old_text; 27 | what_.swap(old_text); 28 | what_ << file << ':' << line; 29 | if (func) what_ << " in " << func << " threw "; 30 | if (child_name) { 31 | what_ << child_name; 32 | } else { 33 | #ifdef __GXX_RTTI 34 | what_ << typeid(this).name(); 35 | #else 36 | what_ << "an exception"; 37 | #endif 38 | } 39 | if (condition) { 40 | what_ << " because `" << condition << '\''; 41 | } 42 | what_ << ".\n"; 43 | what_ << old_text; 44 | } 45 | 46 | namespace { 47 | 48 | #ifdef __GNUC__ 49 | const char *HandleStrerror(int ret, const char *buf) __attribute__ ((unused)); 50 | const char *HandleStrerror(const char *ret, const char * /*buf*/) __attribute__ ((unused)); 51 | #endif 52 | // At least one of these functions will not be called. 53 | #ifdef __clang__ 54 | #pragma clang diagnostic push 55 | #pragma clang diagnostic ignored "-Wunused-function" 56 | #endif 57 | // The XOPEN version. 58 | const char *HandleStrerror(int ret, const char *buf) { 59 | if (!ret) return buf; 60 | return NULL; 61 | } 62 | 63 | // The GNU version. 
64 | const char *HandleStrerror(const char *ret, const char * /*buf*/) { 65 | return ret; 66 | } 67 | #ifdef __clang__ 68 | #pragma clang diagnostic pop 69 | #endif 70 | } // namespace 71 | 72 | ErrnoException::ErrnoException() throw() : errno_(errno) { 73 | char buf[200]; 74 | buf[0] = 0; 75 | #if defined(sun) || defined(_WIN32) || defined(_WIN64) 76 | const char *add = strerror(errno); 77 | #else 78 | const char *add = HandleStrerror(strerror_r(errno, buf, 200), buf); 79 | #endif 80 | 81 | if (add) { 82 | *this << add << ' '; 83 | } 84 | } 85 | 86 | ErrnoException::~ErrnoException() throw() {} 87 | 88 | OverflowException::OverflowException() throw() {} 89 | OverflowException::~OverflowException() throw() {} 90 | 91 | #if defined(_WIN32) || defined(_WIN64) 92 | WindowsException::WindowsException() throw() { 93 | unsigned int last_error = GetLastError(); 94 | char error_msg[256] = ""; 95 | if (!FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, last_error, LANG_NEUTRAL, error_msg, sizeof(error_msg), NULL)) { 96 | *this << "Windows error " << GetLastError() << " while formatting Windows error " << last_error << ". "; 97 | } else { 98 | *this << "Windows error " << last_error << ": " << error_msg; 99 | } 100 | } 101 | WindowsException::~WindowsException() throw() {} 102 | #endif 103 | 104 | } // namespace util 105 | -------------------------------------------------------------------------------- /util/fake_ostream.hh: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_FAKE_OSTREAM_H 2 | #define UTIL_FAKE_OSTREAM_H 3 | 4 | #include "util/float_to_string.hh" 5 | #include "util/integer_to_string.hh" 6 | #include "util/string_piece.hh" 7 | 8 | #include 9 | #include 10 | 11 | #include 12 | 13 | namespace util { 14 | 15 | /* Like std::ostream but without being incredibly slow. 16 | * Supports most of the built-in types except for long double. 
17 | * 18 | * The FakeOStream class is intended to be inherited from. The inherting class 19 | * should provide: 20 | * public: 21 | * Derived &flush(); 22 | * Derived &write(const void *data, std::size_t length); 23 | * 24 | * private: or protected: 25 | * friend class FakeOStream; 26 | * char *Ensure(std::size_t amount); 27 | * void AdvanceTo(char *to); 28 | * 29 | * The Ensure function makes enough space for an in-place write and returns 30 | * where to write. The AdvanceTo function happens after the write, saying how 31 | * much was actually written. 32 | * 33 | * Precondition: 34 | * amount <= kToStringMaxBytes for in-place writes. 35 | */ 36 | template class FakeOStream { 37 | public: 38 | FakeOStream() {} 39 | 40 | // This also covers std::string and char* 41 | Derived &operator<<(StringPiece str) { 42 | return C().write(str.data(), str.size()); 43 | } 44 | 45 | // Handle integers by size and signedness. 46 | private: 47 | template struct EnableIfKludge { 48 | typedef Derived type; 49 | }; 50 | template ::is_signed, bool IsInteger = std::numeric_limits::is_integer> struct Coerce {}; 51 | 52 | template struct Coerce { typedef uint16_t To; }; 53 | template struct Coerce { typedef uint32_t To; }; 54 | template struct Coerce { typedef uint64_t To; }; 55 | 56 | template struct Coerce { typedef int16_t To; }; 57 | template struct Coerce { typedef int32_t To; }; 58 | template struct Coerce { typedef int64_t To; }; 59 | public: 60 | template typename EnableIfKludge::To>::type &operator<<(const From value) { 61 | return CallToString(static_cast::To>(value)); 62 | } 63 | 64 | // Character types that get copied as bytes instead of displayed as integers. 
65 | Derived &operator<<(char val) { return put(val); } 66 | Derived &operator<<(signed char val) { return put(static_cast(val)); } 67 | Derived &operator<<(unsigned char val) { return put(static_cast(val)); } 68 | 69 | Derived &operator<<(bool val) { return put(val + '0'); } 70 | // enums will fall back to int but are not caught by the template. 71 | Derived &operator<<(int val) { return CallToString(static_cast::To>(val)); } 72 | 73 | Derived &operator<<(float val) { return CallToString(val); } 74 | Derived &operator<<(double val) { return CallToString(val); } 75 | 76 | // This is here to catch all the other pointer types. 77 | Derived &operator<<(const void *value) { return CallToString(value); } 78 | // This is here because the above line also catches const char*. 79 | Derived &operator<<(const char *value) { return *this << StringPiece(value); } 80 | Derived &operator<<(char *value) { return *this << StringPiece(value); } 81 | 82 | Derived &put(char val) { 83 | char *c = C().Ensure(1); 84 | *c = val; 85 | C().AdvanceTo(++c); 86 | return C(); 87 | } 88 | 89 | char widen(char val) const { return val; } 90 | 91 | private: 92 | // References to derived class for convenience. 93 | Derived &C() { 94 | return *static_cast(this); 95 | } 96 | 97 | const Derived &C() const { 98 | return *static_cast(this); 99 | } 100 | 101 | // This is separate to prevent an infinite loop if the compiler considers 102 | // types the same (i.e. gcc std::size_t and uint64_t or uint32_t). 103 | template Derived &CallToString(const T value) { 104 | C().AdvanceTo(ToString(value, C().Ensure(ToStringBuf::kBytes))); 105 | return C(); 106 | } 107 | }; 108 | 109 | } // namespace 110 | 111 | #endif // UTIL_FAKE_OSTREAM_H 112 | -------------------------------------------------------------------------------- /util/file_stream.hh: -------------------------------------------------------------------------------- 1 | /* Like std::ofstream but without being incredibly slow. 
Backed by a raw fd that it owns. 2 | * Supports most of the built-in types except for long double. 3 | */ 4 | #ifndef UTIL_FILE_STREAM_H 5 | #define UTIL_FILE_STREAM_H 6 | 7 | #include "util/buffered_stream.hh" 8 | #include "util/file.hh" 9 | 10 | #include 11 | 12 | namespace util { 13 | 14 | typedef BufferedStream FileStream; 15 | 16 | } // namespace 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- /util/float_to_string.cc: -------------------------------------------------------------------------------- 1 | #include "util/float_to_string.hh" 2 | 3 | #include "util/double-conversion/double-conversion.h" 4 | #include "util/double-conversion/utils.h" 5 | 6 | namespace util { 7 | namespace { 8 | const double_conversion::DoubleToStringConverter kConverter(double_conversion::DoubleToStringConverter::NO_FLAGS, "inf", "NaN", 'e', -6, 21, 6, 0); 9 | } // namespace 10 | 11 | char *ToString(double value, char *to) { 12 | double_conversion::StringBuilder builder(to, ToStringBuf::kBytes); 13 | kConverter.ToShortest(value, &builder); 14 | return &to[builder.position()]; 15 | } 16 | 17 | char *ToString(float value, char *to) { 18 | double_conversion::StringBuilder builder(to, ToStringBuf::kBytes); 19 | kConverter.ToShortestSingle(value, &builder); 20 | return &to[builder.position()]; 21 | } 22 | 23 | } // namespace util 24 | -------------------------------------------------------------------------------- /util/float_to_string.hh: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_FLOAT_TO_STRING_H 2 | #define UTIL_FLOAT_TO_STRING_H 3 | 4 | // Just for ToStringBuf 5 | #include "util/integer_to_string.hh" 6 | 7 | namespace util { 8 | 9 | template <> struct ToStringBuf { 10 | // DoubleToStringConverter::kBase10MaximalLength + 1 for null paranoia. 
11 | static const unsigned kBytes = 19; 12 | }; 13 | 14 | // Single wasn't documented in double conversion, so be conservative and 15 | // say the same as double. 16 | template <> struct ToStringBuf { 17 | static const unsigned kBytes = 19; 18 | }; 19 | 20 | char *ToString(double value, char *to); 21 | char *ToString(float value, char *to); 22 | 23 | } // namespace util 24 | 25 | #endif // UTIL_FLOAT_TO_STRING_H 26 | -------------------------------------------------------------------------------- /util/have.hh: -------------------------------------------------------------------------------- 1 | /* Optional packages. You might want to integrate this with your build system e.g. config.h from ./configure. */ 2 | #ifndef UTIL_HAVE 3 | #define UTIL_HAVE 4 | 5 | #ifndef HAVE_BOOST 6 | //#define HAVE_BOOST 7 | #endif 8 | 9 | #endif // UTIL_HAVE 10 | -------------------------------------------------------------------------------- /util/integer_to_string.hh: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_INTEGER_TO_STRING_H 2 | #define UTIL_INTEGER_TO_STRING_H 3 | #include 4 | #include 5 | 6 | namespace util { 7 | 8 | /* These functions convert integers to strings and return the end pointer. 9 | */ 10 | char *ToString(uint32_t value, char *to); 11 | char *ToString(uint64_t value, char *to); 12 | 13 | // Implemented as wrappers to above 14 | char *ToString(int32_t value, char *to); 15 | char *ToString(int64_t value, char *to); 16 | 17 | // Calls the 32-bit versions for now. 18 | char *ToString(uint16_t value, char *to); 19 | char *ToString(int16_t value, char *to); 20 | 21 | char *ToString(const void *value, char *to); 22 | 23 | inline char *ToString(bool value, char *to) { 24 | *to++ = '0' + value; 25 | return to; 26 | } 27 | 28 | // How many bytes to reserve in the buffer for these strings: 29 | // g++ 4.9.1 doesn't work with this: 30 | // static const std::size_t kBytes = 5; 31 | // So use enum. 
32 | template struct ToStringBuf; 33 | template <> struct ToStringBuf { 34 | enum { kBytes = 1 }; 35 | }; 36 | template <> struct ToStringBuf { 37 | enum { kBytes = 5 }; 38 | }; 39 | template <> struct ToStringBuf { 40 | enum { kBytes = 6 }; 41 | }; 42 | template <> struct ToStringBuf { 43 | enum { kBytes = 10 }; 44 | }; 45 | template <> struct ToStringBuf { 46 | enum { kBytes = 11 }; 47 | }; 48 | template <> struct ToStringBuf { 49 | enum { kBytes = 20 }; 50 | }; 51 | template <> struct ToStringBuf { 52 | // Not a typo. 2^63 has 19 digits. 53 | enum { kBytes = 20 }; 54 | }; 55 | 56 | template <> struct ToStringBuf { 57 | // Either 18 on 64-bit or 10 on 32-bit. 58 | enum { kBytes = sizeof(const void*) * 2 + 2 }; 59 | }; 60 | 61 | // Maximum over this and float. 62 | enum { kToStringMaxBytes = 20 }; 63 | 64 | } // namespace util 65 | 66 | #endif // UTIL_INTEGER_TO_STRING_H 67 | -------------------------------------------------------------------------------- /util/integer_to_string_test.cc: -------------------------------------------------------------------------------- 1 | #define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE 2 | #include "util/integer_to_string.hh" 3 | #include "util/string_piece.hh" 4 | 5 | #define BOOST_TEST_MODULE IntegerToStringTest 6 | #include 7 | #include 8 | 9 | #include 10 | 11 | namespace util { 12 | namespace { 13 | 14 | template void TestValue(const T value) { 15 | char buf[ToStringBuf::kBytes]; 16 | StringPiece result(buf, ToString(value, buf) - buf); 17 | BOOST_REQUIRE_GE(static_cast(ToStringBuf::kBytes), result.size()); 18 | if (value) { 19 | BOOST_CHECK_EQUAL(boost::lexical_cast(value), result); 20 | } else { 21 | // Platforms can do void * as 0x0 or 0. 
22 | BOOST_CHECK(result == "0x0" || result == "0"); 23 | } 24 | } 25 | 26 | template void TestCorners() { 27 | TestValue(std::numeric_limits::min()); 28 | TestValue(std::numeric_limits::max()); 29 | TestValue((T)0); 30 | TestValue((T)-1); 31 | TestValue((T)1); 32 | } 33 | 34 | BOOST_AUTO_TEST_CASE(Corners) { 35 | TestCorners(); 36 | TestCorners(); 37 | TestCorners(); 38 | TestCorners(); 39 | TestCorners(); 40 | TestCorners(); 41 | TestCorners(); 42 | } 43 | 44 | template void TestAll() { 45 | for (T i = std::numeric_limits::min(); i < std::numeric_limits::max(); ++i) { 46 | TestValue(i); 47 | } 48 | TestValue(std::numeric_limits::max()); 49 | } 50 | 51 | BOOST_AUTO_TEST_CASE(Short) { 52 | TestAll(); 53 | TestAll(); 54 | } 55 | 56 | template void Test10s() { 57 | for (T i = 1; i < std::numeric_limits::max() / 10; i *= 10) { 58 | TestValue(i); 59 | TestValue(i - 1); 60 | TestValue(i + 1); 61 | } 62 | } 63 | 64 | BOOST_AUTO_TEST_CASE(Tens) { 65 | Test10s(); 66 | Test10s(); 67 | Test10s(); 68 | Test10s(); 69 | } 70 | 71 | BOOST_AUTO_TEST_CASE(Pointers) { 72 | for (uintptr_t i = 1; i < std::numeric_limits::max() / 10; i *= 10) { 73 | TestValue((const void*)i); 74 | } 75 | for (uintptr_t i = 0; i < 256; ++i) { 76 | TestValue((const void*)i); 77 | TestValue((const void*)(i + 0xf00)); 78 | } 79 | } 80 | 81 | }} // namespaces 82 | -------------------------------------------------------------------------------- /util/murmur_hash.cc: -------------------------------------------------------------------------------- 1 | /* Downloaded from http://sites.google.com/site/murmurhash/ which says "All 2 | * code is released to the public domain. For business purposes, Murmurhash is 3 | * under the MIT license." 4 | * This is modified from the original: 5 | * ULL tag on 0xc6a4a7935bd1e995 so this will compile on 32-bit. 6 | * length changed to unsigned int. 
//-----------------------------------------------------------------------------
// MurmurHash2, 64-bit versions, by Austin Appleby

// The hash value depends on the byte order of the machine, so beware of
// endian-ness issues if values are shared across platforms.
//
// Input words are loaded with memcpy on every platform, so the key may have
// any alignment and no pointer of the wrong type is ever dereferenced. The
// resulting values are the same as with direct word loads.

// 64-bit hash for 64-bit platforms.
//   key  - start of the bytes to hash (may be unaligned).
//   len  - number of bytes.
//   seed - initial state; different seeds give different hash functions.
uint64_t MurmurHash64A ( const void * key, std::size_t len, uint64_t seed )
{
  const uint64_t m = 0xc6a4a7935bd1e995ULL;
  const int r = 47;

  uint64_t h = seed ^ (len * m);

  const unsigned char * data = static_cast<const unsigned char *>(key);
  // One past the last complete 8-byte word.
  const unsigned char * const end = data + (len / 8) * sizeof(uint64_t);

  for (; data != end; data += sizeof(uint64_t))
  {
    uint64_t k;
    std::memcpy(&k, data, sizeof(k));

    k *= m;
    k ^= k >> r;
    k *= m;

    h ^= k;
    h *= m;
  }

  // Mix in the 0-7 trailing bytes; each case deliberately continues into
  // the next one.
  switch(len & 7)
  {
  case 7: h ^= uint64_t(data[6]) << 48;  // fall through
  case 6: h ^= uint64_t(data[5]) << 40;  // fall through
  case 5: h ^= uint64_t(data[4]) << 32;  // fall through
  case 4: h ^= uint64_t(data[3]) << 24;  // fall through
  case 3: h ^= uint64_t(data[2]) << 16;  // fall through
  case 2: h ^= uint64_t(data[1]) << 8;   // fall through
  case 1: h ^= uint64_t(data[0]);
          h *= m;
  };

  // Final avalanche.
  h ^= h >> r;
  h *= m;
  h ^= h >> r;

  return h;
}


// 64-bit hash for 32-bit platforms: two interleaved 32-bit lanes whose
// results are concatenated. This is not the same function as MurmurHash64A.
// Parameters are as for MurmurHash64A.
uint64_t MurmurHash64B ( const void * key, std::size_t len, uint64_t seed )
{
  const unsigned int m = 0x5bd1e995;
  const int r = 24;

  unsigned int h1 = seed ^ len;
  unsigned int h2 = 0;

  const std::size_t ksize = sizeof(unsigned int);
  const unsigned char * data = static_cast<const unsigned char *>(key);

  unsigned int k1, k2;
  // Consume two words per iteration, one for each lane.
  while(len >= 8)
  {
    std::memcpy(&k1, data, ksize);
    data += ksize;
    std::memcpy(&k2, data, ksize);
    data += ksize;

    k1 *= m; k1 ^= k1 >> r; k1 *= m;
    h1 *= m; h1 ^= k1;
    len -= 4;

    k2 *= m; k2 ^= k2 >> r; k2 *= m;
    h2 *= m; h2 ^= k2;
    len -= 4;
  }

  // One remaining full word goes to the first lane.
  if(len >= 4)
  {
    std::memcpy(&k1, data, ksize);
    data += ksize;
    k1 *= m; k1 ^= k1 >> r; k1 *= m;
    h1 *= m; h1 ^= k1;
    len -= 4;
  }

  // Trailing 0-3 bytes go to the second lane; cases continue into the next.
  switch(len)
  {
  case 3: h2 ^= data[2] << 16;  // fall through
  case 2: h2 ^= data[1] << 8;   // fall through
  case 1: h2 ^= data[0];
          h2 *= m;
  };

  // Cross-mix the two lanes.
  h1 ^= h2 >> 18; h1 *= m;
  h2 ^= h1 >> 22; h2 *= m;
  h1 ^= h2 >> 17; h1 *= m;
  h2 ^= h1 >> 19; h2 *= m;

  uint64_t h = h1;

  h = (h << 32) | h2;

  return h;
}
155 | namespace { 156 | #ifdef __clang__ 157 | #pragma clang diagnostic push 158 | #pragma clang diagnostic ignored "-Wunused-function" 159 | #endif 160 | template inline uint64_t MurmurHashNativeBackend(const void * key, std::size_t len, uint64_t seed) { 161 | return MurmurHash64A(key, len, seed); 162 | } 163 | template <> inline uint64_t MurmurHashNativeBackend<4>(const void * key, std::size_t len, uint64_t seed) { 164 | return MurmurHash64B(key, len, seed); 165 | } 166 | #ifdef __clang__ 167 | #pragma clang diagnostic pop 168 | #endif 169 | } // namespace 170 | 171 | uint64_t MurmurHashNative(const void * key, std::size_t len, uint64_t seed) { 172 | return MurmurHashNativeBackend(key, len, seed); 173 | } 174 | 175 | } // namespace util 176 | -------------------------------------------------------------------------------- /util/murmur_hash.hh: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_MURMUR_HASH_H 2 | #define UTIL_MURMUR_HASH_H 3 | #include 4 | #include 5 | 6 | namespace util { 7 | 8 | // 64-bit machine version 9 | uint64_t MurmurHash64A(const void * key, std::size_t len, uint64_t seed = 0); 10 | // 32-bit machine version (not the same function as above) 11 | uint64_t MurmurHash64B(const void * key, std::size_t len, uint64_t seed = 0); 12 | // Use the version for this arch. Because the values differ across 13 | // architectures, really only use it for in-memory structures. 
14 | uint64_t MurmurHashNative(const void * key, std::size_t len, uint64_t seed = 0); 15 | 16 | } // namespace util 17 | 18 | #endif // UTIL_MURMUR_HASH_H 19 | -------------------------------------------------------------------------------- /util/mutable_vocab.cc: -------------------------------------------------------------------------------- 1 | #include "util/mutable_vocab.hh" 2 | 3 | #include "util/murmur_hash.hh" 4 | 5 | namespace util { 6 | 7 | MutableVocab::MutableVocab() { 8 | strings_.push_back(StringPiece("")); 9 | } 10 | 11 | MutableVocab::ID MutableVocab::Find(const StringPiece &str) const { 12 | Map::ConstIterator it; 13 | if (map_.Find(util::MurmurHashNative(str.data(), str.size()), it)) { 14 | return it->id; 15 | } else { 16 | return kUNK; 17 | } 18 | } 19 | 20 | uint32_t MutableVocab::FindOrInsert(const StringPiece &str) { 21 | MutableVocabInternal entry; 22 | entry.key = util::MurmurHashNative(str.data(), str.size()); 23 | Map::MutableIterator it; 24 | if (map_.FindOrInsert(entry, it)) { 25 | return it->id; 26 | } 27 | it->id = strings_.size(); 28 | 29 | char *copied = static_cast(piece_backing_.Allocate(str.size())); 30 | memcpy(copied, str.data(), str.size()); 31 | strings_.push_back(StringPiece(copied, str.size())); 32 | return it->id; 33 | } 34 | 35 | } // namespace util 36 | -------------------------------------------------------------------------------- /util/mutable_vocab.hh: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_MUTABLE_VOCAB__ 2 | #define UTIL_MUTABLE_VOCAB__ 3 | 4 | /* A vocabulary mapping class that's mutable at runtime. The kenlm code has 5 | * a specialized immutable vocabulary. 
6 | */ 7 | 8 | #include "util/pool.hh" 9 | #include "util/probing_hash_table.hh" 10 | #include "util/string_piece.hh" 11 | 12 | #include 13 | 14 | namespace util { 15 | 16 | #pragma pack(push) 17 | #pragma pack(4) 18 | struct MutableVocabInternal { 19 | typedef uint64_t Key; 20 | uint64_t GetKey() const { return key; } 21 | void SetKey(uint64_t to) { key = to; } 22 | 23 | uint64_t key; 24 | uint32_t id; 25 | }; 26 | #pragma pack(pop) 27 | 28 | class MutableVocab { 29 | public: 30 | typedef uint32_t ID; 31 | 32 | static const ID kUNK = 0; 33 | 34 | MutableVocab(); 35 | 36 | uint32_t Find(const StringPiece &str) const; 37 | 38 | ID FindOrInsert(const StringPiece &str); 39 | 40 | StringPiece String(ID id) const { 41 | return strings_[id]; 42 | } 43 | 44 | // Includes kUNK. 45 | std::size_t Size() const { return strings_.size(); } 46 | 47 | private: 48 | util::Pool piece_backing_; 49 | 50 | typedef util::AutoProbing Map; 51 | Map map_; 52 | 53 | std::vector strings_; 54 | }; 55 | 56 | } // namespace util 57 | #endif // UTIL_MUTABLE_VOCAB__ 58 | -------------------------------------------------------------------------------- /util/mutable_vocab_test.cc: -------------------------------------------------------------------------------- 1 | #include "util/mutable_vocab.hh" 2 | 3 | #define BOOST_TEST_MODULE MutableVocabTest 4 | #include 5 | 6 | namespace util { 7 | namespace { 8 | 9 | BOOST_AUTO_TEST_CASE(small) { 10 | MutableVocab vocab; 11 | BOOST_CHECK_EQUAL(1, vocab.FindOrInsert("Foo")); 12 | BOOST_CHECK_EQUAL(2, vocab.Size()); 13 | BOOST_CHECK_EQUAL(1, vocab.Find("Foo")); 14 | BOOST_CHECK_EQUAL("Foo", vocab.String(1)); 15 | } 16 | 17 | } // namespace 18 | } // namespace util 19 | -------------------------------------------------------------------------------- /util/object_pool.hh: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_OBJECT_POOL_H 2 | #define UTIL_OBJECT_POOL_H 3 | 4 | #include "util/fixed_array.hh" 5 | 6 | 
#include 7 | 8 | #include 9 | 10 | namespace util { 11 | 12 | template class ObjectPool { 13 | public: 14 | ObjectPool() {} 15 | 16 | template T *Allocate(Construct... construct) { 17 | if (free_list_.empty() || 18 | (free_list_.back().begin() + Capacity(free_list_.size()) == free_list_.back().end())) { 19 | free_list_.emplace_back(Capacity(free_list_.size() + 1)); 20 | } 21 | free_list_.back().push_back(construct...); 22 | return &free_list_.back().back(); 23 | } 24 | 25 | void FreeAll() { 26 | free_list_.clear(); 27 | } 28 | 29 | private: 30 | static std::size_t Capacity(std::size_t index) { 31 | return 1ULL << index; 32 | } 33 | 34 | std::vector > free_list_; 35 | }; 36 | 37 | } // namespace util 38 | 39 | #endif // UTIL_OBJECT_POOL_H 40 | -------------------------------------------------------------------------------- /util/pcqueue_test.cc: -------------------------------------------------------------------------------- 1 | #include "util/pcqueue.hh" 2 | 3 | #define BOOST_TEST_MODULE PCQueueTest 4 | #include 5 | 6 | #include 7 | 8 | namespace util { 9 | namespace { 10 | 11 | BOOST_AUTO_TEST_CASE(SingleThread) { 12 | PCQueue queue(10); 13 | for (int i = 0; i < 10; ++i) { 14 | queue.Produce(i); 15 | } 16 | for (int i = 0; i < 10; ++i) { 17 | BOOST_CHECK_EQUAL(i, queue.Consume()); 18 | } 19 | } 20 | 21 | BOOST_AUTO_TEST_CASE(SingleInSingleOut) { 22 | PCQueue queue(15); 23 | std::thread writer([&queue]() { 24 | for (int i = 0; i < 100; ++i) { 25 | queue.Produce(i); 26 | } 27 | }); 28 | for (int i = 0; i < 100; ++i) { 29 | BOOST_CHECK_EQUAL(i, queue.Consume()); 30 | } 31 | writer.join(); 32 | } 33 | 34 | void MultipleWriters() { 35 | const unsigned kCount = 2000; 36 | const unsigned kNumThreads = 4; 37 | PCQueue queue(13); 38 | auto writer = [&queue, kCount]() { 39 | for (unsigned i = 0; i < kCount; ++i) { 40 | queue.Produce(i); 41 | } 42 | }; 43 | std::vector threads; 44 | for (unsigned i = 0; i < kNumThreads; ++i) { 45 | threads.emplace_back(writer); 46 | } 47 | 
unsigned seen[kCount] = {0}; 48 | for (unsigned i = 0; i < kCount * kNumThreads; ++i) { 49 | unsigned got = queue.Consume(); 50 | BOOST_CHECK_LT(got, kCount); 51 | seen[got]++; 52 | // Since each thread generates in order, counts should be monotonically non-increasing. 53 | BOOST_CHECK(!got || seen[got] <= seen[got - 1]); 54 | } 55 | for (unsigned i = 0; i < kCount; ++i) { 56 | BOOST_CHECK_EQUAL(seen[i], kNumThreads); 57 | } 58 | for (std::thread &t : threads) { 59 | t.join(); 60 | } 61 | } 62 | 63 | } 64 | } // namespace util 65 | -------------------------------------------------------------------------------- /util/pool.cc: -------------------------------------------------------------------------------- 1 | #include "util/pool.hh" 2 | 3 | #include "util/scoped.hh" 4 | 5 | #include 6 | 7 | #include 8 | 9 | namespace util { 10 | 11 | Pool::Pool() { 12 | current_ = NULL; 13 | current_end_ = NULL; 14 | } 15 | 16 | Pool::~Pool() { 17 | FreeAll(); 18 | } 19 | 20 | void Pool::FreeAll() { 21 | for (std::vector::const_iterator i(free_list_.begin()); i != free_list_.end(); ++i) { 22 | free(*i); 23 | } 24 | free_list_.clear(); 25 | current_ = NULL; 26 | current_end_ = NULL; 27 | } 28 | 29 | void *Pool::More(std::size_t size) { 30 | std::size_t amount = std::max(static_cast(32) << free_list_.size(), size); 31 | uint8_t *ret = static_cast(MallocOrThrow(amount)); 32 | free_list_.push_back(ret); 33 | current_ = ret + size; 34 | current_end_ = ret + amount; 35 | return ret; 36 | } 37 | 38 | } // namespace util 39 | -------------------------------------------------------------------------------- /util/pool.hh: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_POOL_H 2 | #define UTIL_POOL_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | namespace util { 11 | 12 | /* Very simple pool. It can only allocate memory. And all of the memory it 13 | * allocates must be freed at the same time. 
14 | */ 15 | class Pool { 16 | public: 17 | Pool(); 18 | 19 | ~Pool(); 20 | 21 | void *Allocate(std::size_t size) { 22 | void *ret = current_; 23 | current_ += size; 24 | if (current_ > current_end_) { 25 | ret = More(size); 26 | } 27 | #ifdef DEBUG 28 | base_check_ = ret; 29 | #endif 30 | return ret; 31 | } 32 | 33 | /** Extend (or contract) the most recent allocation. 34 | * @param base The base pointer of the allocation. This must must have been 35 | * returned by the MOST RECENT call to Allocate or Continue. 36 | * @param additional Change in the size. 37 | * 38 | * In most cases, more memory from the same page is used, in which case 39 | * base is unchanged and the function returns false. 40 | * If the page runs out, a new page is created and the memory (from base) 41 | * is copied. The function returns true. 42 | * 43 | * @return Whether the base had to be changed due to allocating a page. 44 | */ 45 | bool Continue(void *&base, std::ptrdiff_t additional) { 46 | #ifdef DEBUG 47 | assert(base == base_check_); 48 | #endif 49 | current_ += additional; 50 | if (current_ > current_end_) { 51 | std::size_t new_total = current_ - static_cast(base); 52 | void *new_base = More(new_total); 53 | std::memcpy(new_base, base, new_total - additional); 54 | base = new_base; 55 | #ifdef DEBUG 56 | base_check_ = base; 57 | #endif 58 | return true; 59 | } 60 | return false; 61 | } 62 | 63 | void FreeAll(); 64 | 65 | private: 66 | void *More(std::size_t size); 67 | 68 | std::vector free_list_; 69 | 70 | uint8_t *current_, *current_end_; 71 | 72 | #ifdef DEBUG 73 | // For debugging, check that Continue came from the most recent call. 74 | void *base_check_; 75 | #endif // DEBUG 76 | 77 | // no copying 78 | Pool(const Pool &); 79 | Pool &operator=(const Pool &); 80 | }; 81 | 82 | /** 83 | * Pool designed to allow limited freeing. 84 | * Keeps a linked list of free elements in the free spaces. 85 | * Will not reduce in size until FreeAll is called. 
86 | */ 87 | class FreePool { 88 | public: 89 | explicit FreePool(std::size_t element_size) 90 | : free_list_(NULL), element_size_(element_size) {} 91 | 92 | void *Allocate() { 93 | if (free_list_) { 94 | void *ret = free_list_; 95 | free_list_ = *reinterpret_cast(free_list_); 96 | return ret; 97 | } else { 98 | return backing_.Allocate(element_size_); 99 | } 100 | } 101 | 102 | void Free(void *ptr) { 103 | *reinterpret_cast(ptr) = free_list_; 104 | free_list_ = ptr; 105 | } 106 | 107 | std::size_t ElementSize() const { return element_size_; } 108 | 109 | private: 110 | void *free_list_; 111 | 112 | Pool backing_; 113 | 114 | const std::size_t element_size_; 115 | }; 116 | 117 | } // namespace util 118 | 119 | #endif // UTIL_POOL_H 120 | -------------------------------------------------------------------------------- /util/probing_hash_table_test.cc: -------------------------------------------------------------------------------- 1 | #include "util/probing_hash_table.hh" 2 | 3 | #include "util/murmur_hash.hh" 4 | #include "util/scoped.hh" 5 | 6 | #define BOOST_TEST_MODULE ProbingHashTableTest 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | namespace util { 16 | namespace { 17 | 18 | struct Entry { 19 | unsigned char key; 20 | typedef unsigned char Key; 21 | 22 | unsigned char GetKey() const { 23 | return key; 24 | } 25 | 26 | void SetKey(unsigned char to) { 27 | key = to; 28 | } 29 | 30 | uint64_t GetValue() const { 31 | return value; 32 | } 33 | 34 | uint64_t value; 35 | }; 36 | 37 | typedef ProbingHashTable > Table; 38 | 39 | BOOST_AUTO_TEST_CASE(simple) { 40 | size_t size = Table::Size(10, 1.2); 41 | boost::scoped_array mem(new char[size]); 42 | memset(mem.get(), 0, size); 43 | 44 | Table table(mem.get(), size); 45 | const Entry *i = NULL; 46 | BOOST_CHECK(!table.Find(2, i)); 47 | Entry to_ins; 48 | to_ins.key = 3; 49 | to_ins.value = 328920; 50 | table.Insert(to_ins); 51 | BOOST_REQUIRE(table.Find(3, i)); 
52 | BOOST_CHECK_EQUAL(3, i->GetKey()); 53 | BOOST_CHECK_EQUAL(static_cast(328920), i->GetValue()); 54 | BOOST_CHECK(!table.Find(2, i)); 55 | } 56 | 57 | struct Entry64 { 58 | uint64_t key; 59 | typedef uint64_t Key; 60 | 61 | Entry64() {} 62 | 63 | explicit Entry64(uint64_t key_in) { 64 | key = key_in; 65 | } 66 | 67 | Key GetKey() const { return key; } 68 | void SetKey(uint64_t to) { key = to; } 69 | }; 70 | 71 | struct MurmurHashEntry64 { 72 | std::size_t operator()(uint64_t value) const { 73 | return util::MurmurHash64A(&value, 8); 74 | } 75 | }; 76 | 77 | typedef ProbingHashTable Table64; 78 | 79 | BOOST_AUTO_TEST_CASE(Double) { 80 | for (std::size_t initial = 19; initial < 30; ++initial) { 81 | size_t size = Table64::Size(initial, 1.2); 82 | scoped_malloc mem(MallocOrThrow(size)); 83 | Table64 table(mem.get(), size, std::numeric_limits::max()); 84 | table.Clear(); 85 | for (uint64_t i = 0; i < 19; ++i) { 86 | table.Insert(Entry64(i)); 87 | } 88 | table.CheckConsistency(); 89 | mem.call_realloc(table.DoubleTo()); 90 | table.Double(mem.get()); 91 | table.CheckConsistency(); 92 | for (uint64_t i = 20; i < 40 ; ++i) { 93 | table.Insert(Entry64(i)); 94 | } 95 | mem.call_realloc(table.DoubleTo()); 96 | table.Double(mem.get()); 97 | table.CheckConsistency(); 98 | } 99 | } 100 | 101 | } // namespace 102 | } // namespace util 103 | -------------------------------------------------------------------------------- /util/scoped.cc: -------------------------------------------------------------------------------- 1 | #include "util/scoped.hh" 2 | 3 | #include 4 | #if !defined(_WIN32) && !defined(_WIN64) 5 | #include 6 | #endif 7 | 8 | namespace util { 9 | 10 | // TODO: if we're really under memory pressure, don't allocate memory to 11 | // display the error. 
12 | MallocException::MallocException(std::size_t requested) throw() { 13 | *this << "for " << requested << " bytes "; 14 | } 15 | 16 | MallocException::~MallocException() throw() {} 17 | 18 | namespace { 19 | void *InspectAddr(void *addr, std::size_t requested, const char *func_name) { 20 | UTIL_THROW_IF_ARG(!addr && requested, MallocException, (requested), "in " << func_name); 21 | return addr; 22 | } 23 | } // namespace 24 | 25 | void *MallocOrThrow(std::size_t requested) { 26 | return InspectAddr(std::malloc(requested), requested, "malloc"); 27 | } 28 | 29 | void *CallocOrThrow(std::size_t requested) { 30 | return InspectAddr(std::calloc(requested, 1), requested, "calloc"); 31 | } 32 | 33 | void scoped_malloc::call_realloc(std::size_t requested) { 34 | p_ = InspectAddr(std::realloc(p_, requested), requested, "realloc"); 35 | } 36 | 37 | void AdviseHugePages(const void *addr, std::size_t size) { 38 | #if MADV_HUGEPAGE 39 | madvise((void*)addr, size, MADV_HUGEPAGE); 40 | #endif 41 | } 42 | 43 | } // namespace util 44 | -------------------------------------------------------------------------------- /util/scoped.hh: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_SCOPED_H 2 | #define UTIL_SCOPED_H 3 | /* Other scoped objects in the style of scoped_ptr. */ 4 | 5 | #include "util/exception.hh" 6 | #include 7 | #include 8 | 9 | namespace util { 10 | 11 | class MallocException : public ErrnoException { 12 | public: 13 | explicit MallocException(std::size_t requested) throw(); 14 | ~MallocException() throw(); 15 | }; 16 | 17 | void *MallocOrThrow(std::size_t requested); 18 | void *CallocOrThrow(std::size_t requested); 19 | 20 | /* Unfortunately, defining the operator* for void * makes the compiler complain. 21 | * So scoped is specialized to void. This includes the functionality common to 22 | * both, namely everything except reference. 
23 | */ 24 | template class scoped_base { 25 | public: 26 | explicit scoped_base(T *p = NULL) : p_(p) {} 27 | 28 | ~scoped_base() { Closer::Close(p_); } 29 | 30 | #if __cplusplus >= 201103L 31 | scoped_base(scoped_base &&from) noexcept : p_(from.p_) { 32 | from.p_ = nullptr; 33 | } 34 | 35 | scoped_base &operator=(scoped_base &&from) noexcept { 36 | if (this != &from) { 37 | Closer::Close(p_); 38 | p_ = from.p_; 39 | from.p_ = nullptr; 40 | } 41 | return *this; 42 | } 43 | #endif 44 | 45 | void reset(T *p = NULL) { 46 | scoped_base other(p_); 47 | p_ = p; 48 | } 49 | 50 | T *get() { return p_; } 51 | const T *get() const { return p_; } 52 | 53 | T *operator->() { return p_; } 54 | const T *operator->() const { return p_; } 55 | 56 | T *release() { 57 | T *ret = p_; 58 | p_ = NULL; 59 | return ret; 60 | } 61 | 62 | protected: 63 | T *p_; 64 | 65 | #if __cplusplus >= 201103L 66 | public: 67 | scoped_base(const scoped_base &) = delete; 68 | scoped_base &operator=(const scoped_base &) = delete; 69 | #else 70 | private: 71 | scoped_base(const scoped_base &); 72 | scoped_base &operator=(const scoped_base &); 73 | #endif 74 | }; 75 | 76 | template class scoped : public scoped_base { 77 | public: 78 | explicit scoped(T *p = NULL) : scoped_base(p) {} 79 | 80 | T &operator*() { return *scoped_base::p_; } 81 | const T&operator*() const { return *scoped_base::p_; } 82 | }; 83 | 84 | template class scoped : public scoped_base { 85 | public: 86 | explicit scoped(void *p = NULL) : scoped_base(p) {} 87 | }; 88 | 89 | /* Closer for c functions like std::free and cmph cleanup functions */ 90 | template struct scoped_c_forward { 91 | static void Close(T *p) { clean(p); } 92 | }; 93 | // Call a C function to delete stuff 94 | template class scoped_c : public scoped > { 95 | public: 96 | explicit scoped_c(T *p = NULL) : scoped >(p) {} 97 | }; 98 | 99 | class scoped_malloc : public scoped_c { 100 | public: 101 | explicit scoped_malloc(void *p = NULL) : scoped_c(p) {} 102 | 103 | 
explicit scoped_malloc(std::size_t size) : scoped_c(MallocOrThrow(size)) {} 104 | 105 | void call_realloc(std::size_t to); 106 | }; 107 | 108 | /* scoped_array using delete[] */ 109 | struct scoped_delete_array_forward { 110 | template static void Close(T *p) { delete [] p; } 111 | }; 112 | // Hat tip to boost. 113 | template class scoped_array : public scoped { 114 | public: 115 | explicit scoped_array(T *p = NULL) : scoped(p) {} 116 | 117 | T &operator[](std::size_t idx) { return scoped::p_[idx]; } 118 | const T &operator[](std::size_t idx) const { return scoped::p_[idx]; } 119 | }; 120 | 121 | /* scoped_ptr using delete. If only there were a template typedef. */ 122 | struct scoped_delete_forward { 123 | template static void Close(T *p) { delete p; } 124 | }; 125 | template class scoped_ptr : public scoped { 126 | public: 127 | explicit scoped_ptr(T *p = NULL) : scoped(p) {} 128 | }; 129 | 130 | void AdviseHugePages(const void *addr, std::size_t size); 131 | 132 | } // namespace util 133 | 134 | #endif // UTIL_SCOPED_H 135 | -------------------------------------------------------------------------------- /util/spaces.cc: -------------------------------------------------------------------------------- 1 | #include "util/spaces.hh" 2 | 3 | namespace util { 4 | 5 | // Sigh this is the only way I could come up with to do a _const_ bool. It has ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale). 
6 | const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; 7 | 8 | } // namespace util 9 | -------------------------------------------------------------------------------- /util/spaces.hh: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_SPACES_H 2 | #define UTIL_SPACES_H 3 | 4 | // bool array of spaces. 5 | 6 | namespace util { 7 | 8 | extern const bool kSpaces[256]; 9 | 10 | } // namespace util 11 | 12 | #endif // UTIL_SPACES_H 13 | -------------------------------------------------------------------------------- /util/string_stream.hh: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_STRING_STREAM_H 2 | #define UTIL_STRING_STREAM_H 3 | 4 | #include "util/fake_ostream.hh" 5 | 6 | #include 7 | #include 8 | 9 | namespace util { 10 | 11 | class StringStream : public FakeOStream { 12 | public: 13 | StringStream() {} 14 | 15 | StringStream &flush() { return *this; } 16 | 17 | StringStream &write(const void *data, std::size_t length) { 18 | out_.append(static_cast(data), length); 19 | return *this; 20 | } 21 | 22 | const std::string &str() const { return out_; } 23 | 24 | void str(const std::string &val) { out_ = val; } 25 | 26 | void swap(std::string &str) { std::swap(out_, str); } 27 | 28 | protected: 29 | friend class FakeOStream; 30 | char *Ensure(std::size_t amount) { 31 | std::size_t current = out_.size(); 32 | out_.resize(out_.size() + amount); 33 | return &out_[current]; 34 | } 35 | 36 | 
void AdvanceTo(char *to) { 37 | assert(to <= &*out_.end()); 38 | assert(to >= &*out_.begin()); 39 | out_.resize(to - &*out_.begin()); 40 | } 41 | 42 | private: 43 | std::string out_; 44 | }; 45 | 46 | } // namespace 47 | 48 | #endif // UTIL_STRING_STREAM_H 49 | -------------------------------------------------------------------------------- /util/string_stream_test.cc: -------------------------------------------------------------------------------- 1 | #define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE 2 | #define BOOST_TEST_MODULE FakeOStreamTest 3 | 4 | #include "util/string_stream.hh" 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | namespace util { namespace { 12 | 13 | template void TestEqual(const T value) { 14 | StringStream strme; 15 | strme << value; 16 | BOOST_CHECK_EQUAL(boost::lexical_cast(value), strme.str()); 17 | } 18 | 19 | template void TestCorners() { 20 | TestEqual(std::numeric_limits::max()); 21 | TestEqual(std::numeric_limits::min()); 22 | TestEqual(static_cast(0)); 23 | TestEqual(static_cast(-1)); 24 | TestEqual(static_cast(1)); 25 | } 26 | 27 | BOOST_AUTO_TEST_CASE(Integer) { 28 | TestCorners(); 29 | TestCorners(); 30 | TestCorners(); 31 | 32 | TestCorners(); 33 | TestCorners(); 34 | TestCorners(); 35 | 36 | TestCorners(); 37 | TestCorners(); 38 | TestCorners(); 39 | 40 | TestCorners(); 41 | TestCorners(); 42 | TestCorners(); 43 | 44 | TestCorners(); 45 | TestCorners(); 46 | TestCorners(); 47 | 48 | TestCorners(); 49 | } 50 | 51 | enum TinyEnum { EnumValue }; 52 | 53 | BOOST_AUTO_TEST_CASE(EnumCase) { 54 | TestEqual(EnumValue); 55 | } 56 | 57 | BOOST_AUTO_TEST_CASE(Strings) { 58 | TestEqual("foo"); 59 | const char *a = "bar"; 60 | TestEqual(a); 61 | StringPiece piece("abcdef"); 62 | TestEqual(piece); 63 | TestEqual(StringPiece()); 64 | 65 | char non_const[3]; 66 | non_const[0] = 'b'; 67 | non_const[1] = 'c'; 68 | non_const[2] = 0; 69 | 70 | StringStream out; 71 | out << "a" << non_const << 'c'; 72 | BOOST_CHECK_EQUAL("abcc", 
out.str()); 73 | 74 | // Now test as a separate object. 75 | StringStream stream; 76 | stream << "a" << non_const << 'c' << piece; 77 | BOOST_CHECK_EQUAL("abccabcdef", stream.str()); 78 | } 79 | 80 | }} // namespaces 81 | -------------------------------------------------------------------------------- /util/tokenize_piece_test.cc: -------------------------------------------------------------------------------- 1 | #include "util/tokenize_piece.hh" 2 | #include "util/string_piece.hh" 3 | 4 | #define BOOST_TEST_MODULE TokenIteratorTest 5 | #include 6 | 7 | #include 8 | 9 | namespace util { 10 | namespace { 11 | 12 | BOOST_AUTO_TEST_CASE(pipe_pipe_none) { 13 | const char str[] = "nodelimit at all"; 14 | TokenIter it(str, MultiCharacter("|||")); 15 | BOOST_REQUIRE(it); 16 | BOOST_CHECK_EQUAL(StringPiece(str), *it); 17 | ++it; 18 | BOOST_CHECK(!it); 19 | } 20 | BOOST_AUTO_TEST_CASE(pipe_pipe_two) { 21 | const char str[] = "|||"; 22 | TokenIter it(str, MultiCharacter("|||")); 23 | BOOST_REQUIRE(it); 24 | BOOST_CHECK_EQUAL(StringPiece(), *it); 25 | ++it; 26 | BOOST_REQUIRE(it); 27 | BOOST_CHECK_EQUAL(StringPiece(), *it); 28 | ++it; 29 | BOOST_CHECK(!it); 30 | } 31 | 32 | BOOST_AUTO_TEST_CASE(remove_empty) { 33 | const char str[] = "|||"; 34 | TokenIter it(str, MultiCharacter("|||")); 35 | BOOST_CHECK(!it); 36 | } 37 | 38 | BOOST_AUTO_TEST_CASE(remove_empty_keep) { 39 | const char str[] = " |||"; 40 | TokenIter it(str, MultiCharacter("|||")); 41 | BOOST_REQUIRE(it); 42 | BOOST_CHECK_EQUAL(StringPiece(" "), *it); 43 | ++it; 44 | BOOST_CHECK(!it); 45 | } 46 | 47 | } // namespace 48 | } // namespace util 49 | -------------------------------------------------------------------------------- /util/utf8.cc: -------------------------------------------------------------------------------- 1 | #include "util/utf8.hh" 2 | 3 | #include "util/string_piece.hh" 4 | 5 | namespace util { 6 | 7 | NotUTF8Exception::NotUTF8Exception(const StringPiece &) throw() {} 8 | 9 | 
NotUTF8Exception::~NotUTF8Exception() throw() {} 10 | 11 | bool IsUTF8(const StringPiece &str) { 12 | try { 13 | for (char32_t character : DecodeUTF8Range(str)) { 14 | (void)character; /*unused variable */ 15 | } 16 | return true; 17 | } catch (const NotUTF8Exception &) { 18 | return false; 19 | } 20 | } 21 | 22 | } // namespace util 23 | -------------------------------------------------------------------------------- /util/utf8_icu.hh: -------------------------------------------------------------------------------- 1 | /* Utilities for UTF-8 that require ICU. */ 2 | 3 | #ifndef UTIL_UTF8_ICU 4 | #define UTIL_UTF8_ICU 5 | 6 | #include "util/string_piece.hh" 7 | 8 | #include 9 | #include 10 | 11 | #include 12 | 13 | U_NAMESPACE_BEGIN 14 | class UnicodeString; 15 | U_NAMESPACE_END 16 | 17 | namespace util { 18 | 19 | class NormalizeException : public std::exception { 20 | public: 21 | NormalizeException(const StringPiece &original, UErrorCode code) throw(); 22 | ~NormalizeException() throw() {} 23 | 24 | const char *what() const throw() { return what_.c_str(); } 25 | 26 | private: 27 | std::string original_; 28 | 29 | std::string what_; 30 | }; 31 | 32 | 33 | class ICUStupidlyUses32BitIntegersException : public std::exception { 34 | public: 35 | ~ICUStupidlyUses32BitIntegersException(); 36 | const char *what() const throw(); 37 | }; 38 | 39 | // TODO: Implement these in a way that doesn't botch Turkish. 
40 | void ToLower(const StringPiece &in, std::string &out); 41 | 42 | void Normalize(const U_ICU_NAMESPACE::UnicodeString &in, U_ICU_NAMESPACE::UnicodeString &out); 43 | void Normalize(const StringPiece &in, std::string &out); 44 | 45 | class UnsupportedLanguageException : public std::exception { 46 | public: 47 | explicit UnsupportedLanguageException(const StringPiece &language) throw(); 48 | ~UnsupportedLanguageException() throw() {} 49 | 50 | const char *what() const throw() { return what_.c_str(); } 51 | 52 | const std::string &Language() const { return language_; } 53 | 54 | private: 55 | std::string language_; 56 | std::string what_; 57 | }; 58 | 59 | /* Technically Flatten could be done without ICU but then it's only used in process_unicode that wants UnicodeString */ 60 | class FlattenData; 61 | 62 | class Flatten { 63 | public: 64 | explicit Flatten(const StringPiece &language); 65 | 66 | void Apply(const StringPiece &in, std::string &out) const; 67 | void Apply(const U_ICU_NAMESPACE::UnicodeString &in, U_ICU_NAMESPACE::UnicodeString &out) const; 68 | 69 | private: 70 | const FlattenData &data_; 71 | }; 72 | 73 | } // namespace util 74 | 75 | #endif // UTIL_UTF8_ICU 76 | -------------------------------------------------------------------------------- /util/utf8_test.cc: -------------------------------------------------------------------------------- 1 | #include "util/utf8.hh" 2 | #include "util/utf8_icu.hh" 3 | 4 | #define BOOST_TEST_MODULE UTF8Test 5 | #include 6 | 7 | #define CHECK_LOWER(ref, from) { \ 8 | std::string out; \ 9 | ToLower(from, out); \ 10 | BOOST_CHECK_EQUAL(ref, out); \ 11 | } 12 | 13 | #define CHECK_NORMALIZE(ref, from) { \ 14 | std::string out; \ 15 | Normalize(from, out); \ 16 | BOOST_CHECK_EQUAL(ref, out); \ 17 | } 18 | 19 | #define CHECK_FLATTEN(ref, from, language) { \ 20 | Flatten flat(language); \ 21 | std::string out; \ 22 | flat.Apply(from, out); \ 23 | BOOST_CHECK_EQUAL(ref, out); \ 24 | } 25 | 26 | namespace util { 27 | 
namespace { 28 | /* ToLower cases: ASCII, accented Latin letters, and thorn. */ 29 | BOOST_AUTO_TEST_CASE(ASCII) { 30 | CHECK_LOWER("foo", "FOO"); 31 | CHECK_LOWER("foobaz", "fooBAz"); 32 | } 33 | 34 | BOOST_AUTO_TEST_CASE(Accents) { 35 | CHECK_LOWER("ôæðø", "ôÆÐØ"); 36 | } 37 | 38 | BOOST_AUTO_TEST_CASE(Thorn) { 39 | CHECK_LOWER("þ", "Þ"); 40 | } 41 | /* Normalize cases: plain ASCII and the letter ae must come back unchanged; superscript five is expected to become the ASCII digit 5. */ 42 | BOOST_AUTO_TEST_CASE(NormalizeASCII) { 43 | CHECK_NORMALIZE("foo", "foo"); 44 | } 45 | 46 | // This is a valid letter in some languages 47 | BOOST_AUTO_TEST_CASE(NormalizeAE) { 48 | CHECK_NORMALIZE("æ", "æ"); 49 | } 50 | 51 | BOOST_AUTO_TEST_CASE(NormalizeFI) { 52 | CHECK_NORMALIZE("fi", "fi"); 53 | } 54 | 55 | BOOST_AUTO_TEST_CASE(NormalizeFive) { 56 | CHECK_NORMALIZE("5", "⁵"); 57 | } 58 | /* Flatten cases: the expected output depends on the language argument ("en" turns guillemets into ASCII double quotes, "fr" turns `` and '' into guillemets). */ 59 | BOOST_AUTO_TEST_CASE(FlattenEnglish) { 60 | CHECK_FLATTEN("\"foo bar\" '", "«foo bar» '", "en"); 61 | } 62 | 63 | BOOST_AUTO_TEST_CASE(FlattenFrench) { 64 | CHECK_FLATTEN("«foo bar»", "``foo bar''", "fr"); 65 | } 66 | 67 | BOOST_AUTO_TEST_CASE(FlattenBunch) { 68 | CHECK_FLATTEN("...oeAe\"'s ", "…œÆ''' s ", "en"); 69 | } 70 | /* English possessive: "' s" is expected to collapse to "'s" when the s is followed by end of input or a space, and to stay untouched when more letters follow ("' sfoo"). */ 71 | BOOST_AUTO_TEST_CASE(FlattenPossessive) { 72 | CHECK_FLATTEN("'s", "' s", "en"); 73 | CHECK_FLATTEN("'s ", "' s ", "en"); 74 | CHECK_FLATTEN("a's", "a' s", "en"); 75 | CHECK_FLATTEN("a's ", "a' s ", "en"); 76 | CHECK_FLATTEN("' sfoo", "' sfoo", "en"); 77 | CHECK_FLATTEN("' sfoo ", "' sfoo ", "en"); 78 | } 79 | /* Builds a StringPiece from a null pointer (0) with length 1ULL << 32 and expects ToLower to throw ICUStupidlyUses32BitIntegersException. NOTE(review): this relies on ToLower rejecting the length before it touches the data - confirm in the implementation. */ 80 | BOOST_AUTO_TEST_CASE(FailLarge) { 81 | StringPiece large(0, 1ULL << 32); 82 | std::string out; 83 | BOOST_CHECK_THROW(ToLower(large, out), ICUStupidlyUses32BitIntegersException); 84 | } 85 | /* The second string is expected to be rejected; it differs from valid text by an embedded \xaa byte. */ 86 | BOOST_AUTO_TEST_CASE(IsUTF8Test) { 87 | BOOST_CHECK(IsUTF8("…œÆ5ôÆÐØôæðø")); 88 | BOOST_CHECK(!IsUTF8("…œ\xaaÆ5œÆ5ôÆÐØôæðø")); 89 | } 90 | /* Steps a DecodeUTF8Iterator through the literal and expects, in order, 0xfeff, 0xFB01, 0xAB, 0x1F926 and 'a'. Also checks that range.end() converts to false and that the iterator is false and equal to range.end() once exhausted. NOTE(review): the expected 0xFB01 implies the character after \ufeff is the U+FB01 ligature; verify this copy has not replaced it with plain f, i. */ 91 | BOOST_AUTO_TEST_CASE(Iterator) { 92 | DecodeUTF8Range range("\ufefffi«🤦a"); 93 | DecodeUTF8Iterator i = range.begin(); 94 | BOOST_CHECK(i != range.end()); 95 | BOOST_CHECK(!range.end()); 96 | BOOST_CHECK_EQUAL(0xfeff, *i++); 97 | BOOST_CHECK_EQUAL(0xFB01, *i++); 98 | BOOST_CHECK_EQUAL(0xAB, *i++); 99 |
BOOST_CHECK_EQUAL(0x1F926, *i++); 100 | BOOST_CHECK_EQUAL('a', *i++); 101 | BOOST_CHECK(!i); 102 | BOOST_CHECK(i == range.end()); 103 | } 104 | 105 | /* This has been tested but it uses > 2 GB virtual memory so isn't enabled by default. */ 106 | /* BOOST_AUTO_TEST_CASE(LargeIsUTF8) { 107 | const size_t kBufferSize = (1ULL << 32) + 30ULL; 108 | std::vector buffer(kBufferSize); 109 | StringPiece big(&*buffer.begin(), kBufferSize); 110 | BOOST_CHECK(IsUTF8(big)); 111 | buffer[0] = 129; 112 | BOOST_CHECK(!IsUTF8(big)); 113 | buffer[0] = 0; 114 | buffer[1ULL << 32] = 129; 115 | BOOST_CHECK(!IsUTF8(big)); 116 | }*/ 117 | 118 | 119 | } // namespace 120 | } // namespace util 121 | --------------------------------------------------------------------------------