├── .gitignore ├── INSTALL.md ├── LICENSE.md ├── README.md ├── Results ├── .DS_Store ├── results-ev12.txt ├── results-ev2.txt ├── results-ev7.txt ├── results_11.txt ├── results_ev1.txt ├── results_ev4.xml ├── results_ev6.txt ├── results_ev8.txt ├── results_ev9.txt └── results_ev_10.txt ├── __init__.py ├── baseline ├── __init__.py ├── add_warc_locations.py ├── add_warc_locations.sh ├── baseline.md ├── bitextor_util │ ├── bitextorutil.py │ ├── lett2ridx_combine.py │ ├── lett2ridx_map.py │ ├── show_bitextor_docs.py │ └── wordcounts.py ├── candidates2bitextor.py ├── candidates2corpus.py ├── ccdownloader.py ├── check_lett_lang.py ├── collect_domains.py ├── corpus2corpus.py ├── corpus_by_domain.py ├── dedupe.sh ├── dictionary.md ├── download_and_align.sh ├── download_candidates.py ├── download_domain.py ├── dumptar.py ├── eval_sent.py ├── external_processor.py ├── extract_foreign_text.py ├── filter_emty_text_from_lett.py ├── filter_hunalign_bitext.py ├── filter_sent.py ├── filter_tmx.sh ├── find_pairs.py ├── html2text.py ├── langstat2candidates.py ├── languagestripper.py ├── lett2corpus.sh ├── lett2corpus_lowmem.sh ├── lett2ridx.py ├── lett_viewer.py ├── locate_candidates.py ├── locate_candidates_cc_index_api.py ├── match_url_pairs.py ├── ngrams.py ├── score_ngrams.py ├── strip_headers.py ├── strip_language_from_uri.py ├── tar2bitextor.py ├── tar2ett.py ├── text2langstats.py ├── textsanitzer.py ├── url_matching.py └── util │ ├── __init__.py │ └── encoding.py ├── candidates ├── computeMappings.py ├── extractCandidates-Christian.py ├── extractCandidates.py ├── extractInfo.py └── statistics.py ├── common_crawl_process.png ├── crawlertest ├── bitextor │ ├── extract_urls.py │ └── map_urls.py ├── bitextor_notes.txt ├── filename2url.py ├── httrack.sh └── httrack_pdf.sh ├── dicts ├── README.md ├── de-en.dic ├── dict_convert.py ├── en-de.dic ├── en-es.dic ├── en-fr.dic ├── en-it.dic ├── en-nl.dic ├── en-pt.dic ├── en-ru.dic ├── es-en.dic ├── filter_giza.py ├── fix_encoding.py ├── fr-en.dic ├── it-en.dic ├── nl-en.dic ├── pt-en.dic └── ru-en.dic ├── docalign_task └── eval_langid.py ├── docaligner ├── alignlett.py ├── counts2idf.py ├── eval_bitextor.py ├── eval_matrix.py ├── eval_model.py ├── extract.sh ├── extract_dev_feats.sh ├── extract_features.py ├── feature_matrix.py ├── hash_lines.py ├── htmlprocessor.py ├── lett.py ├── map_translations.py ├── matching.py ├── minmaxstd.py ├── nn.py ├── nnbloom.py ├── numpy_text2npz.py ├── numpy_text2npz.sh ├── page.py ├── ratio.py ├── scorer.py ├── split_long_short.py ├── table4paper.py ├── tokenizer.py └── train_classifier.py ├── html_convert ├── Makefile ├── anything_to_utf8.py ├── example │ ├── example.html │ └── example.html~ ├── header.h ├── html2text.cpp ├── langsplit.cpp └── string_util.h ├── merge └── metadata │ ├── __init__.py │ ├── add_lang_stats.py │ ├── drop_links_from_json.py │ ├── lang_stats │ ├── __init__.py │ └── percent_to_bytes.py │ └── read_wet.py ├── metadata ├── __init__.py ├── add_lang_stats.py ├── count_uniq_urls.py ├── drop_links_from_json.py ├── dump_keys.py ├── extract_links.sh ├── extract_location.sh ├── extract_monolingual.sh ├── extract_pdflinks.sh ├── insert_kv.py ├── lang_stats │ ├── __init__.py │ ├── accumulate_langstats.py │ ├── accumulate_stats.py │ ├── cld2helper.py │ ├── cld_lang_codes.txt │ ├── join_stats.py │ ├── old2new_stats.py │ └── percent_to_bytes.py ├── langstats2kv.py ├── leveldb │ ├── Makefile │ ├── Readme │ ├── insertkv.cc │ └── updatekv.cc ├── links_from_wat.py ├── md_server.py ├── meta_data_kv.sh ├── metadata.md ├── metadatabase.py ├── query_md.py ├── read_wet.py ├── rocksdb │ ├── Makefile │ ├── insertkv.cc │ ├── rdb_options.h │ └── updatekv.cc └── url_classifier │ ├── filter_features.py │ ├── filter_languages.py │ └── split_url.py ├── monolingual ├── README.md └── collect_lang.py ├── parseXML.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/.gitignore -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/INSTALL.md -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/LICENSE.md -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/README.md -------------------------------------------------------------------------------- /Results/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/Results/.DS_Store -------------------------------------------------------------------------------- /Results/results-ev12.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/Results/results-ev12.txt -------------------------------------------------------------------------------- /Results/results-ev2.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/Results/results-ev2.txt -------------------------------------------------------------------------------- /Results/results-ev7.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/Results/results-ev7.txt -------------------------------------------------------------------------------- /Results/results_11.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/Results/results_11.txt -------------------------------------------------------------------------------- /Results/results_ev1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/Results/results_ev1.txt -------------------------------------------------------------------------------- /Results/results_ev4.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/Results/results_ev4.xml -------------------------------------------------------------------------------- /Results/results_ev6.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/Results/results_ev6.txt -------------------------------------------------------------------------------- /Results/results_ev8.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/Results/results_ev8.txt -------------------------------------------------------------------------------- /Results/results_ev9.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/Results/results_ev9.txt -------------------------------------------------------------------------------- /Results/results_ev_10.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/Results/results_ev_10.txt -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baseline/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baseline/add_warc_locations.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/add_warc_locations.py -------------------------------------------------------------------------------- /baseline/add_warc_locations.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/add_warc_locations.sh -------------------------------------------------------------------------------- /baseline/baseline.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/baseline.md -------------------------------------------------------------------------------- /baseline/bitextor_util/bitextorutil.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/bitextor_util/bitextorutil.py -------------------------------------------------------------------------------- /baseline/bitextor_util/lett2ridx_combine.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/bitextor_util/lett2ridx_combine.py -------------------------------------------------------------------------------- /baseline/bitextor_util/lett2ridx_map.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/bitextor_util/lett2ridx_map.py -------------------------------------------------------------------------------- /baseline/bitextor_util/show_bitextor_docs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/bitextor_util/show_bitextor_docs.py -------------------------------------------------------------------------------- /baseline/bitextor_util/wordcounts.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/bitextor_util/wordcounts.py -------------------------------------------------------------------------------- /baseline/candidates2bitextor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/candidates2bitextor.py -------------------------------------------------------------------------------- /baseline/candidates2corpus.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/candidates2corpus.py -------------------------------------------------------------------------------- /baseline/ccdownloader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/ccdownloader.py -------------------------------------------------------------------------------- /baseline/check_lett_lang.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/check_lett_lang.py -------------------------------------------------------------------------------- /baseline/collect_domains.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/collect_domains.py -------------------------------------------------------------------------------- /baseline/corpus2corpus.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/corpus2corpus.py -------------------------------------------------------------------------------- /baseline/corpus_by_domain.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/corpus_by_domain.py -------------------------------------------------------------------------------- /baseline/dedupe.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/dedupe.sh -------------------------------------------------------------------------------- /baseline/dictionary.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/dictionary.md -------------------------------------------------------------------------------- /baseline/download_and_align.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/download_and_align.sh -------------------------------------------------------------------------------- /baseline/download_candidates.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/download_candidates.py -------------------------------------------------------------------------------- /baseline/download_domain.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/download_domain.py -------------------------------------------------------------------------------- /baseline/dumptar.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/dumptar.py -------------------------------------------------------------------------------- /baseline/eval_sent.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/eval_sent.py -------------------------------------------------------------------------------- /baseline/external_processor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/external_processor.py -------------------------------------------------------------------------------- /baseline/extract_foreign_text.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/extract_foreign_text.py -------------------------------------------------------------------------------- /baseline/filter_emty_text_from_lett.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/filter_emty_text_from_lett.py -------------------------------------------------------------------------------- /baseline/filter_hunalign_bitext.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/filter_hunalign_bitext.py -------------------------------------------------------------------------------- /baseline/filter_sent.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/filter_sent.py -------------------------------------------------------------------------------- /baseline/filter_tmx.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/filter_tmx.sh -------------------------------------------------------------------------------- /baseline/find_pairs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/find_pairs.py -------------------------------------------------------------------------------- /baseline/html2text.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/html2text.py -------------------------------------------------------------------------------- /baseline/langstat2candidates.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/langstat2candidates.py -------------------------------------------------------------------------------- /baseline/languagestripper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/languagestripper.py -------------------------------------------------------------------------------- /baseline/lett2corpus.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/lett2corpus.sh -------------------------------------------------------------------------------- /baseline/lett2corpus_lowmem.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/lett2corpus_lowmem.sh -------------------------------------------------------------------------------- /baseline/lett2ridx.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/lett2ridx.py -------------------------------------------------------------------------------- /baseline/lett_viewer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/lett_viewer.py -------------------------------------------------------------------------------- /baseline/locate_candidates.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/locate_candidates.py -------------------------------------------------------------------------------- /baseline/locate_candidates_cc_index_api.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/locate_candidates_cc_index_api.py -------------------------------------------------------------------------------- /baseline/match_url_pairs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/match_url_pairs.py -------------------------------------------------------------------------------- /baseline/ngrams.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/ngrams.py -------------------------------------------------------------------------------- /baseline/score_ngrams.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/score_ngrams.py -------------------------------------------------------------------------------- /baseline/strip_headers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/strip_headers.py -------------------------------------------------------------------------------- /baseline/strip_language_from_uri.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/strip_language_from_uri.py -------------------------------------------------------------------------------- /baseline/tar2bitextor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/tar2bitextor.py -------------------------------------------------------------------------------- /baseline/tar2ett.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/tar2ett.py -------------------------------------------------------------------------------- /baseline/text2langstats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/text2langstats.py -------------------------------------------------------------------------------- /baseline/textsanitzer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/textsanitzer.py -------------------------------------------------------------------------------- /baseline/url_matching.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/url_matching.py -------------------------------------------------------------------------------- /baseline/util/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baseline/util/encoding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/baseline/util/encoding.py -------------------------------------------------------------------------------- /candidates/computeMappings.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/candidates/computeMappings.py -------------------------------------------------------------------------------- /candidates/extractCandidates-Christian.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/candidates/extractCandidates-Christian.py -------------------------------------------------------------------------------- /candidates/extractCandidates.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/candidates/extractCandidates.py -------------------------------------------------------------------------------- /candidates/extractInfo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/candidates/extractInfo.py -------------------------------------------------------------------------------- /candidates/statistics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/candidates/statistics.py -------------------------------------------------------------------------------- /common_crawl_process.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/common_crawl_process.png -------------------------------------------------------------------------------- /crawlertest/bitextor/extract_urls.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/crawlertest/bitextor/extract_urls.py -------------------------------------------------------------------------------- /crawlertest/bitextor/map_urls.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/crawlertest/bitextor/map_urls.py -------------------------------------------------------------------------------- /crawlertest/bitextor_notes.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/crawlertest/bitextor_notes.txt -------------------------------------------------------------------------------- /crawlertest/filename2url.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/crawlertest/filename2url.py -------------------------------------------------------------------------------- /crawlertest/httrack.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/crawlertest/httrack.sh -------------------------------------------------------------------------------- /crawlertest/httrack_pdf.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/crawlertest/httrack_pdf.sh -------------------------------------------------------------------------------- /dicts/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/dicts/README.md -------------------------------------------------------------------------------- /dicts/de-en.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/dicts/de-en.dic -------------------------------------------------------------------------------- /dicts/dict_convert.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/dicts/dict_convert.py -------------------------------------------------------------------------------- /dicts/en-de.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/dicts/en-de.dic -------------------------------------------------------------------------------- /dicts/en-es.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/dicts/en-es.dic -------------------------------------------------------------------------------- /dicts/en-fr.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/dicts/en-fr.dic -------------------------------------------------------------------------------- /dicts/en-it.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/dicts/en-it.dic -------------------------------------------------------------------------------- /dicts/en-nl.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/dicts/en-nl.dic -------------------------------------------------------------------------------- /dicts/en-pt.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/dicts/en-pt.dic -------------------------------------------------------------------------------- /dicts/en-ru.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/dicts/en-ru.dic -------------------------------------------------------------------------------- /dicts/es-en.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/dicts/es-en.dic -------------------------------------------------------------------------------- /dicts/filter_giza.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/dicts/filter_giza.py -------------------------------------------------------------------------------- /dicts/fix_encoding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/dicts/fix_encoding.py -------------------------------------------------------------------------------- /dicts/fr-en.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/dicts/fr-en.dic -------------------------------------------------------------------------------- /dicts/it-en.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/dicts/it-en.dic -------------------------------------------------------------------------------- /dicts/nl-en.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/dicts/nl-en.dic -------------------------------------------------------------------------------- /dicts/pt-en.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/dicts/pt-en.dic -------------------------------------------------------------------------------- /dicts/ru-en.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/dicts/ru-en.dic -------------------------------------------------------------------------------- /docalign_task/eval_langid.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/docalign_task/eval_langid.py -------------------------------------------------------------------------------- /docaligner/alignlett.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/docaligner/alignlett.py -------------------------------------------------------------------------------- /docaligner/counts2idf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/docaligner/counts2idf.py -------------------------------------------------------------------------------- /docaligner/eval_bitextor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/docaligner/eval_bitextor.py -------------------------------------------------------------------------------- /docaligner/eval_matrix.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/docaligner/eval_matrix.py -------------------------------------------------------------------------------- /docaligner/eval_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/docaligner/eval_model.py -------------------------------------------------------------------------------- /docaligner/extract.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/docaligner/extract.sh -------------------------------------------------------------------------------- /docaligner/extract_dev_feats.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/docaligner/extract_dev_feats.sh -------------------------------------------------------------------------------- /docaligner/extract_features.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/docaligner/extract_features.py -------------------------------------------------------------------------------- /docaligner/feature_matrix.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/docaligner/feature_matrix.py -------------------------------------------------------------------------------- /docaligner/hash_lines.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/docaligner/hash_lines.py -------------------------------------------------------------------------------- /docaligner/htmlprocessor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/docaligner/htmlprocessor.py -------------------------------------------------------------------------------- /docaligner/lett.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/docaligner/lett.py -------------------------------------------------------------------------------- /docaligner/map_translations.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/docaligner/map_translations.py -------------------------------------------------------------------------------- /docaligner/matching.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/docaligner/matching.py -------------------------------------------------------------------------------- /docaligner/minmaxstd.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/docaligner/minmaxstd.py -------------------------------------------------------------------------------- /docaligner/nn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/docaligner/nn.py -------------------------------------------------------------------------------- /docaligner/nnbloom.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/docaligner/nnbloom.py -------------------------------------------------------------------------------- /docaligner/numpy_text2npz.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/docaligner/numpy_text2npz.py -------------------------------------------------------------------------------- /docaligner/numpy_text2npz.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/docaligner/numpy_text2npz.sh -------------------------------------------------------------------------------- /docaligner/page.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/docaligner/page.py -------------------------------------------------------------------------------- /docaligner/ratio.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/docaligner/ratio.py -------------------------------------------------------------------------------- /docaligner/scorer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/docaligner/scorer.py -------------------------------------------------------------------------------- /docaligner/split_long_short.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/docaligner/split_long_short.py -------------------------------------------------------------------------------- /docaligner/table4paper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/docaligner/table4paper.py -------------------------------------------------------------------------------- /docaligner/tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/docaligner/tokenizer.py -------------------------------------------------------------------------------- /docaligner/train_classifier.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/docaligner/train_classifier.py -------------------------------------------------------------------------------- /html_convert/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/html_convert/Makefile -------------------------------------------------------------------------------- /html_convert/anything_to_utf8.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/html_convert/anything_to_utf8.py -------------------------------------------------------------------------------- /html_convert/example/example.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/html_convert/example/example.html -------------------------------------------------------------------------------- /html_convert/example/example.html~: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/html_convert/example/example.html~ -------------------------------------------------------------------------------- /html_convert/header.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/html_convert/header.h -------------------------------------------------------------------------------- /html_convert/html2text.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/html_convert/html2text.cpp -------------------------------------------------------------------------------- /html_convert/langsplit.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/html_convert/langsplit.cpp -------------------------------------------------------------------------------- /html_convert/string_util.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/html_convert/string_util.h -------------------------------------------------------------------------------- /merge/metadata/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /merge/metadata/add_lang_stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/merge/metadata/add_lang_stats.py -------------------------------------------------------------------------------- /merge/metadata/drop_links_from_json.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/merge/metadata/drop_links_from_json.py -------------------------------------------------------------------------------- /merge/metadata/lang_stats/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /merge/metadata/lang_stats/percent_to_bytes.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/merge/metadata/lang_stats/percent_to_bytes.py -------------------------------------------------------------------------------- /merge/metadata/read_wet.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/merge/metadata/read_wet.py -------------------------------------------------------------------------------- /metadata/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /metadata/add_lang_stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/add_lang_stats.py -------------------------------------------------------------------------------- /metadata/count_uniq_urls.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/count_uniq_urls.py -------------------------------------------------------------------------------- /metadata/drop_links_from_json.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/drop_links_from_json.py -------------------------------------------------------------------------------- /metadata/dump_keys.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/dump_keys.py -------------------------------------------------------------------------------- /metadata/extract_links.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/extract_links.sh -------------------------------------------------------------------------------- /metadata/extract_location.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/extract_location.sh -------------------------------------------------------------------------------- /metadata/extract_monolingual.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/extract_monolingual.sh -------------------------------------------------------------------------------- /metadata/extract_pdflinks.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/extract_pdflinks.sh -------------------------------------------------------------------------------- /metadata/insert_kv.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/insert_kv.py -------------------------------------------------------------------------------- /metadata/lang_stats/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /metadata/lang_stats/accumulate_langstats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/lang_stats/accumulate_langstats.py -------------------------------------------------------------------------------- /metadata/lang_stats/accumulate_stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/lang_stats/accumulate_stats.py -------------------------------------------------------------------------------- /metadata/lang_stats/cld2helper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/lang_stats/cld2helper.py -------------------------------------------------------------------------------- /metadata/lang_stats/cld_lang_codes.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/lang_stats/cld_lang_codes.txt -------------------------------------------------------------------------------- /metadata/lang_stats/join_stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/lang_stats/join_stats.py -------------------------------------------------------------------------------- /metadata/lang_stats/old2new_stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/lang_stats/old2new_stats.py -------------------------------------------------------------------------------- /metadata/lang_stats/percent_to_bytes.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/lang_stats/percent_to_bytes.py -------------------------------------------------------------------------------- /metadata/langstats2kv.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/langstats2kv.py -------------------------------------------------------------------------------- /metadata/leveldb/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/leveldb/Makefile -------------------------------------------------------------------------------- /metadata/leveldb/Readme: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/leveldb/Readme -------------------------------------------------------------------------------- /metadata/leveldb/insertkv.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/leveldb/insertkv.cc -------------------------------------------------------------------------------- /metadata/leveldb/updatekv.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/leveldb/updatekv.cc -------------------------------------------------------------------------------- /metadata/links_from_wat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/links_from_wat.py -------------------------------------------------------------------------------- /metadata/md_server.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/md_server.py -------------------------------------------------------------------------------- /metadata/meta_data_kv.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/meta_data_kv.sh -------------------------------------------------------------------------------- /metadata/metadata.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/metadata.md -------------------------------------------------------------------------------- /metadata/metadatabase.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/metadatabase.py -------------------------------------------------------------------------------- /metadata/query_md.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/query_md.py -------------------------------------------------------------------------------- /metadata/read_wet.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/read_wet.py -------------------------------------------------------------------------------- /metadata/rocksdb/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/rocksdb/Makefile -------------------------------------------------------------------------------- /metadata/rocksdb/insertkv.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/rocksdb/insertkv.cc -------------------------------------------------------------------------------- /metadata/rocksdb/rdb_options.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/rocksdb/rdb_options.h -------------------------------------------------------------------------------- /metadata/rocksdb/updatekv.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/rocksdb/updatekv.cc -------------------------------------------------------------------------------- /metadata/url_classifier/filter_features.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/url_classifier/filter_features.py -------------------------------------------------------------------------------- /metadata/url_classifier/filter_languages.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/url_classifier/filter_languages.py -------------------------------------------------------------------------------- /metadata/url_classifier/split_url.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/metadata/url_classifier/split_url.py -------------------------------------------------------------------------------- /monolingual/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/monolingual/README.md -------------------------------------------------------------------------------- /monolingual/collect_lang.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/monolingual/collect_lang.py -------------------------------------------------------------------------------- /parseXML.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/parseXML.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/modernmt/DataCollection/HEAD/requirements.txt --------------------------------------------------------------------------------