├── .dockerignore ├── .github ├── screenshots │ ├── add-datasets.png │ ├── diff-filter-output.png │ ├── filter-datasets-lines.png │ ├── filter-datasets.png │ └── list-datasets.png └── workflows │ └── release.yaml ├── .gitignore ├── .vscode └── extensions.json ├── Dockerfile ├── README.md ├── dump-parameter-schema.py ├── frontend ├── README.md ├── index.html ├── package-lock.json ├── package.json ├── src │ ├── App.vue │ ├── assets │ │ ├── data-cuate.svg │ │ ├── datailor-logo.svg │ │ ├── empty-pana.svg │ │ ├── eu.png │ │ ├── horizon-europe.png │ │ └── main.css │ ├── components │ │ ├── CategoryPicker.vue │ │ ├── Checkbox.vue │ │ ├── DownloadPopup.vue │ │ ├── FilterEditor.vue │ │ ├── FilterOutputTable.vue │ │ ├── FilterStep.vue │ │ ├── InlineDiff.vue │ │ ├── LoadingIndicator.vue │ │ ├── Modal.vue │ │ ├── SegmentedControl.vue │ │ ├── TagsEditor.vue │ │ └── parameters │ │ │ ├── BoolParameter.vue │ │ │ ├── FloatParameter.vue │ │ │ ├── IntParameter.vue │ │ │ ├── ListParameter.vue │ │ │ ├── StringParameter.vue │ │ │ └── TupleParameter.vue │ ├── css │ │ ├── navbar.css │ │ └── property-list.css │ ├── diff.js │ ├── format.js │ ├── hacks.js │ ├── hash.js │ ├── interval.js │ ├── main.js │ ├── router │ │ └── index.js │ ├── store │ │ ├── categories.js │ │ ├── datasets.js │ │ ├── downloads.js │ │ ├── fetch.js │ │ ├── filters.js │ │ └── filtersteps.js │ ├── stream.js │ └── views │ │ ├── AddDatasetView.vue │ │ ├── EditFiltersView.vue │ │ ├── EditFiltersYamlView.vue │ │ └── ListDatasetsView.vue └── vite.config.js ├── opuscleaner ├── __about__.py ├── __init__.py ├── _util.py ├── categories.py ├── clean.py ├── col.py ├── config.py ├── datasets.py ├── download.py ├── filtering.py ├── filters │ ├── __init__.py │ ├── alpha_ratio.json │ ├── alpha_ratio.py │ ├── bicleaner_hardrules.json │ ├── bifixer.json │ ├── bifixer_dedupe.py │ ├── clean_common.py │ ├── currency_mismatch.json │ ├── currency_mismatch.py │ ├── deescape-special-chars.json │ ├── deescape-special-chars.perl │ ├── deescape_tsv.json │ ├── deescape_tsv.py │ ├── detokenizer.json │ ├── detokenizer.perl │ ├── fasttext_filter.json │ ├── fasttext_filter.py │ ├── fix_elitr_eca.json │ ├── fix_elitr_eca.py │ ├── fix_quotes.json │ ├── fix_quotes.py │ ├── fix_sent_final_punct.json │ ├── fix_sent_final_punct.py │ ├── fix_un_chinese.json │ ├── fix_un_chinese.py │ ├── fix_wiki.json │ ├── fix_wiki.py │ ├── langid.json │ ├── langid.py │ ├── langid_heliport.json │ ├── langid_heliport.py │ ├── laser_similarity.json │ ├── laser_similarity.py │ ├── max_length.json │ ├── max_length.py │ ├── max_word_length.json │ ├── max_word_length.py │ ├── normalize_whitespace.json │ ├── normalize_whitespace.py │ ├── num_mismatch.json │ ├── num_mismatch.py │ ├── opusfilter │ │ ├── AlphabetRatioFilter.json │ │ ├── AverageWordLengthFilter.json │ │ ├── CharacterScoreFilter.json │ │ ├── Detokenizer.json │ │ ├── HtmlTagFilter.json │ │ ├── LengthFilter.json │ │ ├── LengthRatioFilter.json │ │ ├── LongWordFilter.json │ │ ├── RegExpFilter.json │ │ ├── RegExpSub.json │ │ ├── Tokenizer.json │ │ ├── WhitespaceNormalizer.json │ │ └── opusfilter-ersatz.py │ ├── regexp.json │ ├── remove_empty_lines.json │ ├── remove_empty_lines.py │ ├── remove_frequent_patterns.json │ ├── remove_frequent_patterns.py │ ├── remove_frequent_patterns.txt │ ├── sed.json │ ├── segment_chinese.json │ ├── segment_chinese.py │ ├── segment_japanese.json │ ├── segment_japanese.py │ ├── simplify_chinese.json │ ├── split_sentences.json │ ├── split_sentences.py │ ├── src_trg_ratio.json │ ├── src_trg_ratio.py │ ├── strip_suffix.json │ ├── strip_suffix.py │ ├── test_currency_mismatch.py │ ├── test_num_mismatch.py │ ├── test_url_mismatch.py │ ├── traditionalise_chinese.json │ ├── url_mismatch.json │ └── url_mismatch.py ├── logging.py ├── opusfilter_compat.py ├── sample.py ├── scripts │ └── build_chars.py ├── server.py └── threshold.py ├── placeholders ├── README.md ├── placeholders.py ├── static │ ├── config.yml │ ├── test_encode_input │ ├── test_encode_input_strict │ └── vocab.fren.spm └── test.sh ├── pyproject.toml ├── requirements-all.txt ├── requirements.txt ├── test ├── deeper │ ├── data │ │ └── train-parts │ │ │ ├── bible-uedin-v1.de-en.de.gz │ │ │ └── bible-uedin-v1.de-en.en.gz │ └── filters │ │ └── fail.json ├── test_clean.py └── test_col.py └── utils ├── dedup ├── README.md ├── dedup.sh ├── hash-seg.py └── superdedup.py └── frontend_build_hook.py /.dockerignore: -------------------------------------------------------------------------------- 1 | /data 2 | /test 3 | /frontend/dist 4 | __pycache__ 5 | node_modules 6 | -------------------------------------------------------------------------------- /.github/screenshots/add-datasets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/.github/screenshots/add-datasets.png -------------------------------------------------------------------------------- /.github/screenshots/diff-filter-output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/.github/screenshots/diff-filter-output.png -------------------------------------------------------------------------------- /.github/screenshots/filter-datasets-lines.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/.github/screenshots/filter-datasets-lines.png -------------------------------------------------------------------------------- /.github/screenshots/filter-datasets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/.github/screenshots/filter-datasets.png -------------------------------------------------------------------------------- /.github/screenshots/list-datasets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/.github/screenshots/list-datasets.png -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/.github/workflows/release.yaml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/.gitignore -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/.vscode/extensions.json -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/Dockerfile -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/README.md -------------------------------------------------------------------------------- /dump-parameter-schema.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/dump-parameter-schema.py -------------------------------------------------------------------------------- /frontend/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/README.md -------------------------------------------------------------------------------- /frontend/index.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/index.html -------------------------------------------------------------------------------- /frontend/package-lock.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/package-lock.json -------------------------------------------------------------------------------- /frontend/package.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/package.json -------------------------------------------------------------------------------- /frontend/src/App.vue: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/App.vue -------------------------------------------------------------------------------- /frontend/src/assets/data-cuate.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/assets/data-cuate.svg -------------------------------------------------------------------------------- /frontend/src/assets/datailor-logo.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/assets/datailor-logo.svg -------------------------------------------------------------------------------- /frontend/src/assets/empty-pana.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/assets/empty-pana.svg -------------------------------------------------------------------------------- /frontend/src/assets/eu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/assets/eu.png -------------------------------------------------------------------------------- /frontend/src/assets/horizon-europe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/assets/horizon-europe.png -------------------------------------------------------------------------------- /frontend/src/assets/main.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/assets/main.css -------------------------------------------------------------------------------- /frontend/src/components/CategoryPicker.vue: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/components/CategoryPicker.vue -------------------------------------------------------------------------------- /frontend/src/components/Checkbox.vue: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/components/Checkbox.vue -------------------------------------------------------------------------------- /frontend/src/components/DownloadPopup.vue: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/components/DownloadPopup.vue -------------------------------------------------------------------------------- /frontend/src/components/FilterEditor.vue: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/components/FilterEditor.vue -------------------------------------------------------------------------------- /frontend/src/components/FilterOutputTable.vue: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/components/FilterOutputTable.vue -------------------------------------------------------------------------------- /frontend/src/components/FilterStep.vue: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/components/FilterStep.vue -------------------------------------------------------------------------------- /frontend/src/components/InlineDiff.vue: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/components/InlineDiff.vue -------------------------------------------------------------------------------- /frontend/src/components/LoadingIndicator.vue: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/components/LoadingIndicator.vue -------------------------------------------------------------------------------- /frontend/src/components/Modal.vue: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/components/Modal.vue -------------------------------------------------------------------------------- /frontend/src/components/SegmentedControl.vue: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/components/SegmentedControl.vue -------------------------------------------------------------------------------- /frontend/src/components/TagsEditor.vue: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/components/TagsEditor.vue -------------------------------------------------------------------------------- /frontend/src/components/parameters/BoolParameter.vue: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/components/parameters/BoolParameter.vue -------------------------------------------------------------------------------- /frontend/src/components/parameters/FloatParameter.vue: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/components/parameters/FloatParameter.vue -------------------------------------------------------------------------------- /frontend/src/components/parameters/IntParameter.vue: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/components/parameters/IntParameter.vue -------------------------------------------------------------------------------- /frontend/src/components/parameters/ListParameter.vue: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/components/parameters/ListParameter.vue -------------------------------------------------------------------------------- /frontend/src/components/parameters/StringParameter.vue: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/components/parameters/StringParameter.vue -------------------------------------------------------------------------------- /frontend/src/components/parameters/TupleParameter.vue: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/components/parameters/TupleParameter.vue -------------------------------------------------------------------------------- /frontend/src/css/navbar.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/css/navbar.css -------------------------------------------------------------------------------- /frontend/src/css/property-list.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/css/property-list.css -------------------------------------------------------------------------------- /frontend/src/diff.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/diff.js -------------------------------------------------------------------------------- /frontend/src/format.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/format.js -------------------------------------------------------------------------------- /frontend/src/hacks.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/hacks.js -------------------------------------------------------------------------------- /frontend/src/hash.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/hash.js -------------------------------------------------------------------------------- /frontend/src/interval.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/interval.js -------------------------------------------------------------------------------- /frontend/src/main.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/main.js -------------------------------------------------------------------------------- /frontend/src/router/index.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/router/index.js -------------------------------------------------------------------------------- /frontend/src/store/categories.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/store/categories.js -------------------------------------------------------------------------------- /frontend/src/store/datasets.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/store/datasets.js -------------------------------------------------------------------------------- /frontend/src/store/downloads.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/store/downloads.js -------------------------------------------------------------------------------- /frontend/src/store/fetch.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/store/fetch.js -------------------------------------------------------------------------------- /frontend/src/store/filters.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/store/filters.js -------------------------------------------------------------------------------- /frontend/src/store/filtersteps.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/store/filtersteps.js -------------------------------------------------------------------------------- /frontend/src/stream.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/stream.js -------------------------------------------------------------------------------- /frontend/src/views/AddDatasetView.vue: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/views/AddDatasetView.vue -------------------------------------------------------------------------------- /frontend/src/views/EditFiltersView.vue: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/views/EditFiltersView.vue -------------------------------------------------------------------------------- /frontend/src/views/EditFiltersYamlView.vue: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/views/EditFiltersYamlView.vue -------------------------------------------------------------------------------- /frontend/src/views/ListDatasetsView.vue: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/src/views/ListDatasetsView.vue -------------------------------------------------------------------------------- /frontend/vite.config.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/frontend/vite.config.js -------------------------------------------------------------------------------- /opuscleaner/__about__.py: -------------------------------------------------------------------------------- 1 | VERSION = "0.0.0" 2 | -------------------------------------------------------------------------------- /opuscleaner/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /opuscleaner/_util.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/_util.py -------------------------------------------------------------------------------- /opuscleaner/categories.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/categories.py -------------------------------------------------------------------------------- /opuscleaner/clean.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/clean.py -------------------------------------------------------------------------------- /opuscleaner/col.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/col.py -------------------------------------------------------------------------------- /opuscleaner/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/config.py -------------------------------------------------------------------------------- /opuscleaner/datasets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/datasets.py -------------------------------------------------------------------------------- /opuscleaner/download.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/download.py -------------------------------------------------------------------------------- /opuscleaner/filtering.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filtering.py -------------------------------------------------------------------------------- /opuscleaner/filters/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /opuscleaner/filters/alpha_ratio.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/alpha_ratio.json -------------------------------------------------------------------------------- /opuscleaner/filters/alpha_ratio.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/alpha_ratio.py -------------------------------------------------------------------------------- /opuscleaner/filters/bicleaner_hardrules.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/bicleaner_hardrules.json -------------------------------------------------------------------------------- /opuscleaner/filters/bifixer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/bifixer.json -------------------------------------------------------------------------------- /opuscleaner/filters/bifixer_dedupe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/bifixer_dedupe.py -------------------------------------------------------------------------------- /opuscleaner/filters/clean_common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/clean_common.py -------------------------------------------------------------------------------- /opuscleaner/filters/currency_mismatch.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/currency_mismatch.json -------------------------------------------------------------------------------- /opuscleaner/filters/currency_mismatch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/currency_mismatch.py -------------------------------------------------------------------------------- /opuscleaner/filters/deescape-special-chars.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/deescape-special-chars.json -------------------------------------------------------------------------------- /opuscleaner/filters/deescape-special-chars.perl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/deescape-special-chars.perl -------------------------------------------------------------------------------- /opuscleaner/filters/deescape_tsv.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/deescape_tsv.json -------------------------------------------------------------------------------- /opuscleaner/filters/deescape_tsv.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/deescape_tsv.py -------------------------------------------------------------------------------- /opuscleaner/filters/detokenizer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/detokenizer.json -------------------------------------------------------------------------------- /opuscleaner/filters/detokenizer.perl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/detokenizer.perl -------------------------------------------------------------------------------- /opuscleaner/filters/fasttext_filter.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/fasttext_filter.json -------------------------------------------------------------------------------- /opuscleaner/filters/fasttext_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/fasttext_filter.py -------------------------------------------------------------------------------- /opuscleaner/filters/fix_elitr_eca.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/fix_elitr_eca.json -------------------------------------------------------------------------------- /opuscleaner/filters/fix_elitr_eca.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/fix_elitr_eca.py -------------------------------------------------------------------------------- /opuscleaner/filters/fix_quotes.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/fix_quotes.json -------------------------------------------------------------------------------- /opuscleaner/filters/fix_quotes.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/fix_quotes.py -------------------------------------------------------------------------------- /opuscleaner/filters/fix_sent_final_punct.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/fix_sent_final_punct.json -------------------------------------------------------------------------------- /opuscleaner/filters/fix_sent_final_punct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/fix_sent_final_punct.py -------------------------------------------------------------------------------- /opuscleaner/filters/fix_un_chinese.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/fix_un_chinese.json -------------------------------------------------------------------------------- /opuscleaner/filters/fix_un_chinese.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/fix_un_chinese.py -------------------------------------------------------------------------------- /opuscleaner/filters/fix_wiki.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/fix_wiki.json -------------------------------------------------------------------------------- /opuscleaner/filters/fix_wiki.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/fix_wiki.py -------------------------------------------------------------------------------- /opuscleaner/filters/langid.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/langid.json -------------------------------------------------------------------------------- /opuscleaner/filters/langid.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/langid.py -------------------------------------------------------------------------------- /opuscleaner/filters/langid_heliport.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/langid_heliport.json -------------------------------------------------------------------------------- /opuscleaner/filters/langid_heliport.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/langid_heliport.py -------------------------------------------------------------------------------- /opuscleaner/filters/laser_similarity.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/laser_similarity.json -------------------------------------------------------------------------------- /opuscleaner/filters/laser_similarity.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/laser_similarity.py -------------------------------------------------------------------------------- /opuscleaner/filters/max_length.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/max_length.json -------------------------------------------------------------------------------- /opuscleaner/filters/max_length.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/max_length.py -------------------------------------------------------------------------------- /opuscleaner/filters/max_word_length.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/max_word_length.json -------------------------------------------------------------------------------- /opuscleaner/filters/max_word_length.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/max_word_length.py -------------------------------------------------------------------------------- /opuscleaner/filters/normalize_whitespace.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/normalize_whitespace.json -------------------------------------------------------------------------------- /opuscleaner/filters/normalize_whitespace.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/normalize_whitespace.py -------------------------------------------------------------------------------- /opuscleaner/filters/num_mismatch.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/num_mismatch.json -------------------------------------------------------------------------------- /opuscleaner/filters/num_mismatch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/num_mismatch.py -------------------------------------------------------------------------------- /opuscleaner/filters/opusfilter/AlphabetRatioFilter.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/opusfilter/AlphabetRatioFilter.json -------------------------------------------------------------------------------- /opuscleaner/filters/opusfilter/AverageWordLengthFilter.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/opusfilter/AverageWordLengthFilter.json -------------------------------------------------------------------------------- /opuscleaner/filters/opusfilter/CharacterScoreFilter.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/opusfilter/CharacterScoreFilter.json -------------------------------------------------------------------------------- /opuscleaner/filters/opusfilter/Detokenizer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/opusfilter/Detokenizer.json -------------------------------------------------------------------------------- /opuscleaner/filters/opusfilter/HtmlTagFilter.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/opusfilter/HtmlTagFilter.json -------------------------------------------------------------------------------- /opuscleaner/filters/opusfilter/LengthFilter.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/opusfilter/LengthFilter.json -------------------------------------------------------------------------------- /opuscleaner/filters/opusfilter/LengthRatioFilter.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/opusfilter/LengthRatioFilter.json -------------------------------------------------------------------------------- /opuscleaner/filters/opusfilter/LongWordFilter.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/opusfilter/LongWordFilter.json -------------------------------------------------------------------------------- /opuscleaner/filters/opusfilter/RegExpFilter.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/opusfilter/RegExpFilter.json -------------------------------------------------------------------------------- /opuscleaner/filters/opusfilter/RegExpSub.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/opusfilter/RegExpSub.json -------------------------------------------------------------------------------- /opuscleaner/filters/opusfilter/Tokenizer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/opusfilter/Tokenizer.json -------------------------------------------------------------------------------- /opuscleaner/filters/opusfilter/WhitespaceNormalizer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/opusfilter/WhitespaceNormalizer.json -------------------------------------------------------------------------------- /opuscleaner/filters/opusfilter/opusfilter-ersatz.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/opusfilter/opusfilter-ersatz.py -------------------------------------------------------------------------------- /opuscleaner/filters/regexp.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/regexp.json -------------------------------------------------------------------------------- /opuscleaner/filters/remove_empty_lines.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/remove_empty_lines.json -------------------------------------------------------------------------------- /opuscleaner/filters/remove_empty_lines.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/remove_empty_lines.py -------------------------------------------------------------------------------- /opuscleaner/filters/remove_frequent_patterns.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/remove_frequent_patterns.json -------------------------------------------------------------------------------- /opuscleaner/filters/remove_frequent_patterns.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/remove_frequent_patterns.py -------------------------------------------------------------------------------- /opuscleaner/filters/remove_frequent_patterns.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/remove_frequent_patterns.txt -------------------------------------------------------------------------------- /opuscleaner/filters/sed.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/sed.json -------------------------------------------------------------------------------- /opuscleaner/filters/segment_chinese.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/segment_chinese.json -------------------------------------------------------------------------------- /opuscleaner/filters/segment_chinese.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/segment_chinese.py -------------------------------------------------------------------------------- /opuscleaner/filters/segment_japanese.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/segment_japanese.json -------------------------------------------------------------------------------- /opuscleaner/filters/segment_japanese.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/segment_japanese.py -------------------------------------------------------------------------------- /opuscleaner/filters/simplify_chinese.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/simplify_chinese.json -------------------------------------------------------------------------------- /opuscleaner/filters/split_sentences.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/split_sentences.json -------------------------------------------------------------------------------- /opuscleaner/filters/split_sentences.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/split_sentences.py -------------------------------------------------------------------------------- /opuscleaner/filters/src_trg_ratio.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/src_trg_ratio.json -------------------------------------------------------------------------------- /opuscleaner/filters/src_trg_ratio.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/src_trg_ratio.py -------------------------------------------------------------------------------- /opuscleaner/filters/strip_suffix.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/strip_suffix.json -------------------------------------------------------------------------------- /opuscleaner/filters/strip_suffix.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/strip_suffix.py -------------------------------------------------------------------------------- /opuscleaner/filters/test_currency_mismatch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/test_currency_mismatch.py -------------------------------------------------------------------------------- /opuscleaner/filters/test_num_mismatch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/test_num_mismatch.py -------------------------------------------------------------------------------- /opuscleaner/filters/test_url_mismatch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/test_url_mismatch.py -------------------------------------------------------------------------------- /opuscleaner/filters/traditionalise_chinese.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/traditionalise_chinese.json -------------------------------------------------------------------------------- /opuscleaner/filters/url_mismatch.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/url_mismatch.json -------------------------------------------------------------------------------- /opuscleaner/filters/url_mismatch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/filters/url_mismatch.py -------------------------------------------------------------------------------- /opuscleaner/logging.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/logging.py -------------------------------------------------------------------------------- /opuscleaner/opusfilter_compat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/opusfilter_compat.py -------------------------------------------------------------------------------- /opuscleaner/sample.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/sample.py -------------------------------------------------------------------------------- /opuscleaner/scripts/build_chars.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/scripts/build_chars.py -------------------------------------------------------------------------------- /opuscleaner/server.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/server.py -------------------------------------------------------------------------------- /opuscleaner/threshold.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/opuscleaner/threshold.py -------------------------------------------------------------------------------- /placeholders/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/placeholders/README.md -------------------------------------------------------------------------------- /placeholders/placeholders.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/placeholders/placeholders.py -------------------------------------------------------------------------------- /placeholders/static/config.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/placeholders/static/config.yml -------------------------------------------------------------------------------- /placeholders/static/test_encode_input: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/placeholders/static/test_encode_input -------------------------------------------------------------------------------- /placeholders/static/test_encode_input_strict: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/placeholders/static/test_encode_input_strict -------------------------------------------------------------------------------- /placeholders/static/vocab.fren.spm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/placeholders/static/vocab.fren.spm -------------------------------------------------------------------------------- /placeholders/test.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/placeholders/test.sh -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/pyproject.toml -------------------------------------------------------------------------------- /requirements-all.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/requirements-all.txt -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/requirements.txt -------------------------------------------------------------------------------- /test/deeper/data/train-parts/bible-uedin-v1.de-en.de.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/test/deeper/data/train-parts/bible-uedin-v1.de-en.de.gz -------------------------------------------------------------------------------- /test/deeper/data/train-parts/bible-uedin-v1.de-en.en.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/test/deeper/data/train-parts/bible-uedin-v1.de-en.en.gz -------------------------------------------------------------------------------- /test/deeper/filters/fail.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/test/deeper/filters/fail.json -------------------------------------------------------------------------------- /test/test_clean.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/test/test_clean.py -------------------------------------------------------------------------------- /test/test_col.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/test/test_col.py -------------------------------------------------------------------------------- /utils/dedup/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/utils/dedup/README.md -------------------------------------------------------------------------------- /utils/dedup/dedup.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/utils/dedup/dedup.sh -------------------------------------------------------------------------------- /utils/dedup/hash-seg.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/utils/dedup/hash-seg.py -------------------------------------------------------------------------------- /utils/dedup/superdedup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/utils/dedup/superdedup.py -------------------------------------------------------------------------------- /utils/frontend_build_hook.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hplt-project/OpusCleaner/HEAD/utils/frontend_build_hook.py --------------------------------------------------------------------------------