├── .github └── workflows │ └── ubuntu.yml ├── .gitignore ├── CMakeLists.txt ├── COPYING ├── COPYING.LESSER ├── FindICU.cmake ├── LICENSE ├── README.md ├── moses ├── CMakeLists.txt ├── ems │ └── support │ │ └── split-sentences.perl ├── share │ └── nonbreaking_prefixes │ │ ├── README.txt │ │ ├── nonbreaking_prefix.as │ │ ├── nonbreaking_prefix.bn │ │ ├── nonbreaking_prefix.ca │ │ ├── nonbreaking_prefix.cs │ │ ├── nonbreaking_prefix.de │ │ ├── nonbreaking_prefix.el │ │ ├── nonbreaking_prefix.en │ │ ├── nonbreaking_prefix.es │ │ ├── nonbreaking_prefix.et │ │ ├── nonbreaking_prefix.fi │ │ ├── nonbreaking_prefix.fr │ │ ├── nonbreaking_prefix.ga │ │ ├── nonbreaking_prefix.gu │ │ ├── nonbreaking_prefix.hi │ │ ├── nonbreaking_prefix.hu │ │ ├── nonbreaking_prefix.is │ │ ├── nonbreaking_prefix.it │ │ ├── nonbreaking_prefix.kn │ │ ├── nonbreaking_prefix.lt │ │ ├── nonbreaking_prefix.lv │ │ ├── nonbreaking_prefix.ml │ │ ├── nonbreaking_prefix.mni │ │ ├── nonbreaking_prefix.mr │ │ ├── nonbreaking_prefix.nl │ │ ├── nonbreaking_prefix.or │ │ ├── nonbreaking_prefix.pa │ │ ├── nonbreaking_prefix.pl │ │ ├── nonbreaking_prefix.pt │ │ ├── nonbreaking_prefix.ro │ │ ├── nonbreaking_prefix.ru │ │ ├── nonbreaking_prefix.sk │ │ ├── nonbreaking_prefix.sl │ │ ├── nonbreaking_prefix.sv │ │ ├── nonbreaking_prefix.ta │ │ ├── nonbreaking_prefix.te │ │ ├── nonbreaking_prefix.yue │ │ └── nonbreaking_prefix.zh └── tokenizer │ ├── deescape-special-chars.perl │ ├── detokenizer.perl │ ├── escape-special-chars.perl │ ├── lowercase.perl │ ├── normalize-punctuation.perl │ └── tokenizer.perl ├── preprocess ├── CMakeLists.txt ├── apply_case_main.cc ├── b64filter_main.cc ├── base64.cc ├── base64.hh ├── base64_number_main.cc ├── cache_main.cc ├── captive_child.cc ├── captive_child.hh ├── commoncrawl_dedupe_main.cc ├── dedupe_main.cc ├── docenc_main.cc ├── fields.cc ├── fields.hh ├── foldfilter_main.cc ├── gigaword_extract.sh ├── gigaword_unwrap_main.cc ├── heuristics.perl ├── idf_main.cc ├── mmhsum_main.cc ├── order_independent_hash_main.cc ├── parallel.hh ├── process_unicode_main.cc ├── remove_invalid_utf8_base64_main.cc ├── remove_invalid_utf8_main.cc ├── remove_long_lines_main.cc ├── resplit.sh ├── shard_main.cc ├── simple_cleaning_main.cc ├── substitute_main.cc ├── subtract_lines_main.cc ├── tests │ ├── cache │ │ ├── input │ │ ├── run.sh │ │ ├── space_expected │ │ └── space_ref.py │ ├── dedupe │ │ ├── columns │ │ ├── columns.out │ │ ├── expected │ │ ├── input │ │ ├── ref.py │ │ └── run.sh │ ├── foldfilter │ │ ├── fold10.expected │ │ ├── input │ │ └── run.sh │ ├── run.sh │ ├── shard │ │ ├── input │ │ └── run.sh │ └── vars ├── text.sh ├── train_case_main.cc ├── truecase_main.cc ├── unescape_html.perl ├── vocab_main.cc ├── warc.cc ├── warc.hh └── warc_parallel_main.cc └── util ├── CMakeLists.txt ├── buffered_stream.hh ├── cat_compressed_main.cc ├── compress.cc ├── compress.hh ├── compress_test.cc ├── double-conversion ├── CMakeLists.txt ├── Jamfile ├── LICENSE ├── bignum-dtoa.cc ├── bignum-dtoa.h ├── bignum.cc ├── bignum.h ├── cached-powers.cc ├── cached-powers.h ├── diy-fp.cc ├── diy-fp.h ├── double-conversion.cc ├── double-conversion.h ├── fast-dtoa.cc ├── fast-dtoa.h ├── fixed-dtoa.cc ├── fixed-dtoa.h ├── ieee.h ├── strtod.cc ├── strtod.h └── utils.h ├── ersatz_progress.cc ├── ersatz_progress.hh ├── exception.cc ├── exception.hh ├── fake_ostream.hh ├── file.cc ├── file.hh ├── file_piece.cc ├── file_piece.hh ├── file_piece_test.cc ├── file_stream.hh ├── fixed_array.hh ├── float_to_string.cc ├── float_to_string.hh ├── have.hh ├── integer_to_string.cc ├── integer_to_string.hh ├── integer_to_string_test.cc ├── mmap.cc ├── mmap.hh ├── murmur_hash.cc ├── murmur_hash.hh ├── mutable_vocab.cc ├── mutable_vocab.hh ├── mutable_vocab_test.cc ├── object_pool.hh ├── pcqueue.hh ├── pcqueue_test.cc ├── pool.cc ├── pool.hh ├── probing_hash_table.hh ├── probing_hash_table_test.cc ├── scoped.cc ├── scoped.hh ├── spaces.cc ├── spaces.hh ├── string_piece.cc ├── string_piece.hh ├── string_stream.hh ├── string_stream_test.cc ├── threaded_buffered_stream.hh ├── tokenize_piece.hh ├── tokenize_piece_test.cc ├── utf8.cc ├── utf8.hh ├── utf8_icu.cc ├── utf8_icu.hh └── utf8_test.cc /.github/workflows/ubuntu.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/.github/workflows/ubuntu.yml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | util/file_piece.cc.gz 3 | *.swp 4 | *.o 5 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/CMakeLists.txt -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/COPYING -------------------------------------------------------------------------------- /COPYING.LESSER: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/COPYING.LESSER -------------------------------------------------------------------------------- /FindICU.cmake: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/FindICU.cmake -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/README.md -------------------------------------------------------------------------------- /moses/CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/CMakeLists.txt -------------------------------------------------------------------------------- /moses/ems/support/split-sentences.perl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/ems/support/split-sentences.perl -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/README.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/README.txt -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.as: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.as -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.bn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.bn -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.ca: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.ca -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.cs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.cs -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.de: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.de -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.el: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.el -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.en: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.en -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.es: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.es -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.et: -------------------------------------------------------------------------------- 1 | nonbreaking_prefix.fi -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.fi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.fi -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.fr: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.fr -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.ga: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.ga -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.gu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.gu -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.hi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.hi -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.hu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.hu -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.is: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.is -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.it: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.it -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.kn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.kn -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.lt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.lt -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.lv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.lv -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.ml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.ml -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.mni: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.mni -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.mr: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.mr -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.nl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.nl -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.or: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.or -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.pa: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.pa -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.pl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.pl -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.pt -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.ro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.ro -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.ru: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.ru -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.sk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.sk -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.sl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.sl -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.sv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.sv -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.ta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.ta -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.te: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.te -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.yue: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.yue -------------------------------------------------------------------------------- /moses/share/nonbreaking_prefixes/nonbreaking_prefix.zh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/share/nonbreaking_prefixes/nonbreaking_prefix.zh -------------------------------------------------------------------------------- /moses/tokenizer/deescape-special-chars.perl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/tokenizer/deescape-special-chars.perl -------------------------------------------------------------------------------- /moses/tokenizer/detokenizer.perl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/tokenizer/detokenizer.perl -------------------------------------------------------------------------------- /moses/tokenizer/escape-special-chars.perl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/tokenizer/escape-special-chars.perl -------------------------------------------------------------------------------- /moses/tokenizer/lowercase.perl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/tokenizer/lowercase.perl -------------------------------------------------------------------------------- /moses/tokenizer/normalize-punctuation.perl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/tokenizer/normalize-punctuation.perl -------------------------------------------------------------------------------- /moses/tokenizer/tokenizer.perl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/moses/tokenizer/tokenizer.perl -------------------------------------------------------------------------------- /preprocess/CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/CMakeLists.txt -------------------------------------------------------------------------------- /preprocess/apply_case_main.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/apply_case_main.cc -------------------------------------------------------------------------------- /preprocess/b64filter_main.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/b64filter_main.cc -------------------------------------------------------------------------------- /preprocess/base64.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/base64.cc -------------------------------------------------------------------------------- /preprocess/base64.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/base64.hh -------------------------------------------------------------------------------- /preprocess/base64_number_main.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/base64_number_main.cc -------------------------------------------------------------------------------- /preprocess/cache_main.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/cache_main.cc -------------------------------------------------------------------------------- /preprocess/captive_child.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/captive_child.cc -------------------------------------------------------------------------------- /preprocess/captive_child.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/captive_child.hh -------------------------------------------------------------------------------- /preprocess/commoncrawl_dedupe_main.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/commoncrawl_dedupe_main.cc -------------------------------------------------------------------------------- /preprocess/dedupe_main.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/dedupe_main.cc -------------------------------------------------------------------------------- /preprocess/docenc_main.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/docenc_main.cc -------------------------------------------------------------------------------- /preprocess/fields.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/fields.cc -------------------------------------------------------------------------------- /preprocess/fields.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/fields.hh -------------------------------------------------------------------------------- /preprocess/foldfilter_main.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/foldfilter_main.cc -------------------------------------------------------------------------------- /preprocess/gigaword_extract.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/gigaword_extract.sh -------------------------------------------------------------------------------- /preprocess/gigaword_unwrap_main.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/gigaword_unwrap_main.cc -------------------------------------------------------------------------------- /preprocess/heuristics.perl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/heuristics.perl -------------------------------------------------------------------------------- /preprocess/idf_main.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/idf_main.cc -------------------------------------------------------------------------------- /preprocess/mmhsum_main.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/mmhsum_main.cc -------------------------------------------------------------------------------- /preprocess/order_independent_hash_main.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/order_independent_hash_main.cc -------------------------------------------------------------------------------- /preprocess/parallel.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/parallel.hh -------------------------------------------------------------------------------- /preprocess/process_unicode_main.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/process_unicode_main.cc -------------------------------------------------------------------------------- /preprocess/remove_invalid_utf8_base64_main.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/remove_invalid_utf8_base64_main.cc -------------------------------------------------------------------------------- /preprocess/remove_invalid_utf8_main.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/remove_invalid_utf8_main.cc -------------------------------------------------------------------------------- /preprocess/remove_long_lines_main.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/remove_long_lines_main.cc -------------------------------------------------------------------------------- /preprocess/resplit.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/resplit.sh -------------------------------------------------------------------------------- /preprocess/shard_main.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/shard_main.cc -------------------------------------------------------------------------------- /preprocess/simple_cleaning_main.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/simple_cleaning_main.cc -------------------------------------------------------------------------------- /preprocess/substitute_main.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/substitute_main.cc -------------------------------------------------------------------------------- /preprocess/subtract_lines_main.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/subtract_lines_main.cc -------------------------------------------------------------------------------- /preprocess/tests/cache/input: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/tests/cache/input -------------------------------------------------------------------------------- /preprocess/tests/cache/run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/tests/cache/run.sh -------------------------------------------------------------------------------- /preprocess/tests/cache/space_expected: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/tests/cache/space_expected -------------------------------------------------------------------------------- /preprocess/tests/cache/space_ref.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/tests/cache/space_ref.py -------------------------------------------------------------------------------- /preprocess/tests/dedupe/columns: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/tests/dedupe/columns -------------------------------------------------------------------------------- /preprocess/tests/dedupe/columns.out: -------------------------------------------------------------------------------- 1 | 1 a 2 | 3 b 3 | -------------------------------------------------------------------------------- /preprocess/tests/dedupe/expected: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/tests/dedupe/expected -------------------------------------------------------------------------------- /preprocess/tests/dedupe/input: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/tests/dedupe/input -------------------------------------------------------------------------------- /preprocess/tests/dedupe/ref.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/tests/dedupe/ref.py -------------------------------------------------------------------------------- /preprocess/tests/dedupe/run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/tests/dedupe/run.sh -------------------------------------------------------------------------------- /preprocess/tests/foldfilter/fold10.expected: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/tests/foldfilter/fold10.expected -------------------------------------------------------------------------------- /preprocess/tests/foldfilter/input: -------------------------------------------------------------------------------- 1 | ../../../COPYING -------------------------------------------------------------------------------- /preprocess/tests/foldfilter/run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/tests/foldfilter/run.sh -------------------------------------------------------------------------------- /preprocess/tests/run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/tests/run.sh -------------------------------------------------------------------------------- /preprocess/tests/shard/input: -------------------------------------------------------------------------------- 1 | ../../../README.md -------------------------------------------------------------------------------- /preprocess/tests/shard/run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/tests/shard/run.sh -------------------------------------------------------------------------------- /preprocess/tests/vars: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/tests/vars -------------------------------------------------------------------------------- /preprocess/text.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/text.sh -------------------------------------------------------------------------------- /preprocess/train_case_main.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/train_case_main.cc -------------------------------------------------------------------------------- /preprocess/truecase_main.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/truecase_main.cc -------------------------------------------------------------------------------- /preprocess/unescape_html.perl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/unescape_html.perl -------------------------------------------------------------------------------- /preprocess/vocab_main.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/vocab_main.cc -------------------------------------------------------------------------------- /preprocess/warc.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/warc.cc -------------------------------------------------------------------------------- /preprocess/warc.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/warc.hh -------------------------------------------------------------------------------- /preprocess/warc_parallel_main.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/preprocess/warc_parallel_main.cc -------------------------------------------------------------------------------- /util/CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/CMakeLists.txt -------------------------------------------------------------------------------- /util/buffered_stream.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/buffered_stream.hh -------------------------------------------------------------------------------- /util/cat_compressed_main.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/cat_compressed_main.cc -------------------------------------------------------------------------------- /util/compress.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/compress.cc -------------------------------------------------------------------------------- /util/compress.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/compress.hh -------------------------------------------------------------------------------- /util/compress_test.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/compress_test.cc -------------------------------------------------------------------------------- /util/double-conversion/CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/double-conversion/CMakeLists.txt -------------------------------------------------------------------------------- /util/double-conversion/Jamfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/double-conversion/Jamfile -------------------------------------------------------------------------------- /util/double-conversion/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/double-conversion/LICENSE -------------------------------------------------------------------------------- /util/double-conversion/bignum-dtoa.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/double-conversion/bignum-dtoa.cc -------------------------------------------------------------------------------- /util/double-conversion/bignum-dtoa.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/double-conversion/bignum-dtoa.h -------------------------------------------------------------------------------- /util/double-conversion/bignum.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/double-conversion/bignum.cc -------------------------------------------------------------------------------- /util/double-conversion/bignum.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/double-conversion/bignum.h -------------------------------------------------------------------------------- /util/double-conversion/cached-powers.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/double-conversion/cached-powers.cc -------------------------------------------------------------------------------- /util/double-conversion/cached-powers.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/double-conversion/cached-powers.h -------------------------------------------------------------------------------- /util/double-conversion/diy-fp.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/double-conversion/diy-fp.cc -------------------------------------------------------------------------------- /util/double-conversion/diy-fp.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/double-conversion/diy-fp.h -------------------------------------------------------------------------------- /util/double-conversion/double-conversion.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/double-conversion/double-conversion.cc -------------------------------------------------------------------------------- /util/double-conversion/double-conversion.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/double-conversion/double-conversion.h -------------------------------------------------------------------------------- /util/double-conversion/fast-dtoa.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/double-conversion/fast-dtoa.cc -------------------------------------------------------------------------------- /util/double-conversion/fast-dtoa.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/double-conversion/fast-dtoa.h -------------------------------------------------------------------------------- /util/double-conversion/fixed-dtoa.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/double-conversion/fixed-dtoa.cc -------------------------------------------------------------------------------- /util/double-conversion/fixed-dtoa.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/double-conversion/fixed-dtoa.h -------------------------------------------------------------------------------- /util/double-conversion/ieee.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/double-conversion/ieee.h -------------------------------------------------------------------------------- /util/double-conversion/strtod.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/double-conversion/strtod.cc -------------------------------------------------------------------------------- /util/double-conversion/strtod.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/double-conversion/strtod.h -------------------------------------------------------------------------------- /util/double-conversion/utils.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/double-conversion/utils.h -------------------------------------------------------------------------------- /util/ersatz_progress.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/ersatz_progress.cc -------------------------------------------------------------------------------- /util/ersatz_progress.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/ersatz_progress.hh -------------------------------------------------------------------------------- /util/exception.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/exception.cc -------------------------------------------------------------------------------- /util/exception.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/exception.hh -------------------------------------------------------------------------------- /util/fake_ostream.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/fake_ostream.hh -------------------------------------------------------------------------------- /util/file.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/file.cc -------------------------------------------------------------------------------- /util/file.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/file.hh -------------------------------------------------------------------------------- /util/file_piece.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/file_piece.cc -------------------------------------------------------------------------------- /util/file_piece.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/file_piece.hh -------------------------------------------------------------------------------- /util/file_piece_test.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/file_piece_test.cc -------------------------------------------------------------------------------- /util/file_stream.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/file_stream.hh -------------------------------------------------------------------------------- /util/fixed_array.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/fixed_array.hh -------------------------------------------------------------------------------- /util/float_to_string.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/float_to_string.cc -------------------------------------------------------------------------------- /util/float_to_string.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/float_to_string.hh -------------------------------------------------------------------------------- /util/have.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/have.hh -------------------------------------------------------------------------------- /util/integer_to_string.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/integer_to_string.cc -------------------------------------------------------------------------------- /util/integer_to_string.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/integer_to_string.hh -------------------------------------------------------------------------------- /util/integer_to_string_test.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/integer_to_string_test.cc -------------------------------------------------------------------------------- /util/mmap.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/mmap.cc -------------------------------------------------------------------------------- /util/mmap.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/mmap.hh -------------------------------------------------------------------------------- /util/murmur_hash.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/murmur_hash.cc -------------------------------------------------------------------------------- /util/murmur_hash.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/murmur_hash.hh -------------------------------------------------------------------------------- /util/mutable_vocab.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/mutable_vocab.cc -------------------------------------------------------------------------------- /util/mutable_vocab.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/mutable_vocab.hh -------------------------------------------------------------------------------- /util/mutable_vocab_test.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/mutable_vocab_test.cc -------------------------------------------------------------------------------- /util/object_pool.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/object_pool.hh -------------------------------------------------------------------------------- /util/pcqueue.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/pcqueue.hh -------------------------------------------------------------------------------- /util/pcqueue_test.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/pcqueue_test.cc -------------------------------------------------------------------------------- /util/pool.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/pool.cc -------------------------------------------------------------------------------- /util/pool.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/pool.hh -------------------------------------------------------------------------------- /util/probing_hash_table.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/probing_hash_table.hh -------------------------------------------------------------------------------- /util/probing_hash_table_test.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/probing_hash_table_test.cc -------------------------------------------------------------------------------- /util/scoped.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/scoped.cc -------------------------------------------------------------------------------- /util/scoped.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/scoped.hh -------------------------------------------------------------------------------- /util/spaces.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/spaces.cc -------------------------------------------------------------------------------- /util/spaces.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/spaces.hh -------------------------------------------------------------------------------- /util/string_piece.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/string_piece.cc -------------------------------------------------------------------------------- /util/string_piece.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/string_piece.hh -------------------------------------------------------------------------------- /util/string_stream.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/string_stream.hh -------------------------------------------------------------------------------- /util/string_stream_test.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/string_stream_test.cc -------------------------------------------------------------------------------- /util/threaded_buffered_stream.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/threaded_buffered_stream.hh -------------------------------------------------------------------------------- /util/tokenize_piece.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/tokenize_piece.hh -------------------------------------------------------------------------------- /util/tokenize_piece_test.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/tokenize_piece_test.cc -------------------------------------------------------------------------------- /util/utf8.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/utf8.cc -------------------------------------------------------------------------------- /util/utf8.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/utf8.hh -------------------------------------------------------------------------------- /util/utf8_icu.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/utf8_icu.cc -------------------------------------------------------------------------------- /util/utf8_icu.hh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/utf8_icu.hh -------------------------------------------------------------------------------- /util/utf8_test.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kpu/preprocess/HEAD/util/utf8_test.cc --------------------------------------------------------------------------------