├── .github └── workflows │ ├── add-issue-to-project.yml │ ├── label-with-contact-neede.yml │ ├── label-with-help-wanted.yml │ ├── pii-manager.yml │ └── self-assign.yaml ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── LICENSE ├── Makefile ├── README.md ├── __init__.py ├── ac_dc ├── README.md ├── anonymization.py ├── deduplicate │ ├── README.md │ ├── conf │ │ ├── self_deduplicate_ar.yaml │ │ ├── self_deduplicate_bn.yaml │ │ ├── self_deduplicate_ca.yaml │ │ ├── self_deduplicate_en.yaml │ │ ├── self_deduplicate_es.yaml │ │ ├── self_deduplicate_eu.yaml │ │ ├── self_deduplicate_fr.yaml │ │ ├── self_deduplicate_gl.yaml │ │ ├── self_deduplicate_hi.yaml │ │ ├── self_deduplicate_id.yaml │ │ ├── self_deduplicate_pt.yaml │ │ ├── self_deduplicate_ur.yaml │ │ ├── self_deduplicate_vi.yaml │ │ └── self_deduplicate_zh.yaml │ ├── deduplicate │ │ ├── __init__.py │ │ └── util.py │ ├── self_deduplicate.py │ └── visualize.ipynb ├── download_sentencepiece_kenlm_models.py ├── explanation_filtering_pipeline.pdf ├── filtering.py ├── flagged_words.py ├── languages_id.py ├── main_filtering.py ├── normalization.py ├── parameters_filtering.py ├── person_and_id_anonymization.py ├── stopwords.py ├── test_anonymization.py └── visualization │ ├── README.md │ ├── get_data_for_visualization.py │ └── visualization.py ├── bertin ├── README.md ├── config.json ├── config.py ├── configs │ ├── base │ │ ├── config.json │ │ └── tokenizer.json │ └── large │ │ ├── config.json │ │ └── tokenizer.json ├── convert.py ├── evaluation │ ├── paws.yaml │ ├── run_glue.py │ ├── run_ner.ipynb │ ├── run_ner.py │ ├── token.yaml │ └── xnli.yaml ├── events.out.tfevents.1625704081.t1v-n-a4d97d44-w-0.212075.3.v2 ├── events.out.tfevents.1625704245.t1v-n-a4d97d44-w-0.216676.3.v2 ├── events.out.tfevents.1625705283.t1v-n-a4d97d44-w-0.234462.3.v2 ├── get_embeddings_and_perplexity.py ├── images │ ├── bertin-tilt.png │ ├── bertin.png │ ├── ccnet.png │ ├── datasets-perp-20-120.png │ ├── datasets-perp.png │ ├── datasets-random-comparison.png │ ├── datasets-wsize.png │ ├── perp-p95.png │ ├── perp-resample-gaussian.png │ ├── perp-resample-stepwise.png │ ├── perplexity_colored_embeddings.html │ └── random_512.jpg ├── mc4 │ ├── README.md │ ├── dummy │ │ └── af │ │ │ └── 0.0.0 │ │ │ └── dummy_data.zip │ └── mc4.py ├── merges.txt ├── perplexity.py ├── run.sh ├── run_mlm_flax.py ├── run_mlm_flax_stream.py ├── run_stream.sh ├── special_tokens_map.json ├── tokenizer.json ├── tokenizer_config.json ├── tokens.py ├── tokens.py.orig ├── tsne_plot.py ├── utils │ ├── dataset_perplexity.py │ ├── download_mc4es_sampled.py │ └── generate_datasets.py └── vocab.json ├── cc_pseudo_crawl ├── get_stats.py ├── language_annotation │ ├── python_scripts │ │ ├── annotate_langid_crawl.py │ │ ├── check_wrong_files.py │ │ ├── compute_stats_langid.py │ │ └── detect_html_lang_attrib.py │ └── slurm_scripts │ │ ├── 02_detect_html_lang_attrib.slurm │ │ └── job_annotate_langid_crawl.sh ├── processing_notebooks │ ├── NigerCongoDS.ipynb │ └── pseudocrawl_nigercongo.ipynb ├── python_scripts │ ├── cc_lookup_next.py │ ├── cc_lookup_seed.py │ ├── check_erros_in_dataset.py │ ├── deeper.py │ ├── divide_in_shards.py │ ├── download_warc.py │ ├── exact_deduplicates.py │ ├── extract_text │ │ ├── extract_text_and_html_metadata.py │ │ └── requirements.txt │ ├── finalise.py │ ├── load_all_seed_ids.py │ ├── merge_seed_shards.py │ ├── preprocess_dataset.py │ ├── process_for_concatenation.py │ ├── pseudo_crawl_seed_to_lm_dset.py │ ├── pseudo_crawl_seed_to_lm_dset_v2.py │ ├── redownload_warc.py │ ├── requirements.txt │ ├── shard_and_compress.py │ └── shard_by_seed_id.py ├── seeds_batch_1 │ ├── .gitignore │ ├── DEPTH.md │ ├── README.md │ ├── slurm_scripts │ │ ├── check_errors_in_dataset.slurm │ │ ├── divide_in_subshards.slurm │ │ ├── divide_in_subshards_1000.slurm │ │ ├── download_warc.slurm │ │ ├── download_warc_too_big.slurm │ │ ├── download_warc_trial_4.slurm │ │ ├── download_warc_trial_5.slurm │ │ ├── extract_text_and_html_metadata.slurm │ │ ├── merge_seed_shards.slurm │ │ ├── preprocess_warc.slurm │ │ ├── redownload_warc.slurm │ │ ├── shard_and_compress.slurm │ │ └── shard_by_seed_id.slurm │ └── sourcing_sheet_seeds │ │ ├── README.md │ │ ├── candidate_websites_for_crawling.csv │ │ ├── cc-metrics.csv │ │ ├── cc-metrics.ipynb │ │ ├── cleanup-seeds.ipynb │ │ ├── filtered_catalogue.json │ │ ├── preprocess_dataset.ipynb │ │ ├── seeds.csv │ │ └── test_preprcessing_via_pyarrow_pandas.ipynb ├── seeds_batch_1_2 │ ├── 00_clean_dataset.slurm │ └── 01_exact_deduplicates.slurm └── seeds_batch_2 │ ├── .gitignore │ ├── README.md │ ├── slurm_scripts │ ├── 01_download_warc.slurm │ ├── 02_redownload_warc.slurm │ ├── 02b_redownload_warc.slurm │ ├── 03_check_errors_in_dataset.slurm │ ├── 04_divide_in_subshards.slurm │ ├── 05_preprocess_warc.slurm │ ├── 06_extract_text_and_html_metadata.slurm │ ├── 07_shard_by_seed_id.slurm │ ├── 08_merge_seed_shards.slurm │ ├── 09_shard_and_compress.slurm │ └── 10_push_to_hub.slurm │ └── sourcing_sheet_seeds │ ├── cleanup-seeds.ipynb │ ├── seeds.csv │ ├── seeds_batch_2.csv │ └── seeds_batch_2.json ├── index_search ├── README.md ├── datasets_ES_builder.py ├── datasets_ES_index.py ├── datasets_ES_search.py ├── datasets_remote_ES_IBMcloud.py ├── docker-compose.yml └── requirements.txt ├── kenlm_training ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── cc_net │ ├── __init__.py │ ├── __main__.py │ ├── data │ │ ├── cutoff.csv │ │ └── test_stats.json │ ├── dedup.py │ ├── execution.py │ ├── flat_hash_set.py │ ├── get_hf_dataset.py │ ├── get_wiki_cirrus.py │ ├── jsonql.py │ ├── mine.py │ ├── minify.py │ ├── perplexity.py │ ├── process_wet_file.py │ ├── regroup.py │ ├── split_by_lang.py │ ├── text_normalizer.py │ ├── tokenizer.py │ └── tools │ │ ├── __init__.py │ │ ├── dl_cc_100.py │ │ ├── expand_corpus.py │ │ └── make_dmoz_corpus.py ├── config │ ├── lid_exp.json │ ├── mine_segment.json │ ├── test_reproduce.json │ └── test_segment.json ├── pyproject.toml ├── setup.py ├── tests │ ├── __init__.py │ ├── conftest.py │ ├── data │ │ └── sample.warc.txt │ ├── test_dedup.py │ ├── test_flat_hash_set.py │ ├── test_jsonql.py │ ├── test_minify.py │ ├── test_normalizer.py │ ├── test_parse_wet_file.py │ ├── test_regroup.py │ └── test_transformer.py └── train_all.sh ├── perplexity_lenses ├── README.md ├── app.py ├── cli.py ├── perplexity_lenses │ ├── __init__.py │ ├── data.py │ ├── engine.py │ ├── perplexity.py │ └── visualization.py ├── poetry.lock ├── pyproject.toml ├── requirements.txt └── tests │ ├── __init__.py │ └── test_data.py ├── pii-manager ├── .gitignore ├── CHANGES.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── doc │ ├── contributing.md │ ├── external.md │ ├── tasks.md │ └── usage.md ├── requirements.txt ├── setup.py ├── src │ └── pii_manager │ │ ├── __init__.py │ │ ├── api │ │ ├── __init__.py │ │ ├── file.py │ │ └── manager.py │ │ ├── app │ │ ├── __init__.py │ │ ├── manage.py │ │ └── task_info.py │ │ ├── helper │ │ ├── __init__.py │ │ ├── base.py │ │ ├── context.py │ │ ├── exception.py │ │ ├── json.py │ │ ├── normalizer.py │ │ ├── taskdict.py │ │ └── types.py │ │ ├── lang │ │ ├── __init__.py │ │ ├── any │ │ │ ├── __init__.py │ │ │ ├── bitcoin_address.py │ │ │ ├── credit_card.py │ │ │ ├── email.py │ │ │ └── ip_address.py │ │ ├── en │ │ │ ├── __init__.py │ │ │ ├── any │ │ │ │ ├── __init__.py │ │ │ │ └── international_phone_number.py │ │ │ ├── au │ │ │ │ ├── __init__.py │ │ │ │ ├── abn.py │ │ │ │ └── tfn.py │ │ │ ├── ca │ │ │ │ ├── __init__.py │ │ │ │ └── social_insurance_number.py │ │ │ ├── in_ │ │ │ │ ├── __init__.py │ │ │ │ └── aadhaar.py │ │ │ └── us │ │ │ │ ├── __init__.py │ │ │ │ └── social_security_number.py │ │ ├── es │ │ │ ├── __init__.py │ │ │ ├── any │ │ │ │ ├── __init__.py │ │ │ │ └── international_phone_number.py │ │ │ ├── es │ │ │ │ ├── __init__.py │ │ │ │ ├── bank_account.py │ │ │ │ └── govid.py │ │ │ └── mx │ │ │ │ ├── __init__.py │ │ │ │ └── curp.py │ │ ├── fr │ │ │ ├── __init__.py │ │ │ └── ca │ │ │ │ ├── __init__.py │ │ │ │ └── social_insurance_number.py │ │ ├── pt │ │ │ ├── __init__.py │ │ │ ├── br │ │ │ │ ├── __init__.py │ │ │ │ └── cpf.py │ │ │ └── pt │ │ │ │ ├── __init__.py │ │ │ │ └── govid.py │ │ └── zh │ │ │ ├── __init__.py │ │ │ └── cn │ │ │ ├── __init__.py │ │ │ ├── gov_id.py │ │ │ └── misc.py │ │ ├── piientity.py │ │ └── piienum.py └── test │ ├── data │ ├── extract-block.ndjson │ ├── extract-line.ndjson │ ├── extract-sentence.ndjson │ ├── full-block.ndjson │ ├── full-line.ndjson │ ├── full-sentence.ndjson │ ├── orig.txt │ ├── replace.txt │ ├── tag.txt │ ├── taskfile-error.json │ └── taskfile.json │ └── unit │ ├── api │ ├── test_file.py │ ├── test_file_taskfile.py │ ├── test_manager.py │ ├── test_manager_add.py │ └── test_manager_ctx.py │ ├── helper │ ├── test_base.py │ ├── test_context.py │ ├── test_norm.py │ └── test_taskdict.py │ └── lang │ ├── any │ ├── test_bitcoin_address.py │ ├── test_credit_card.py │ ├── test_email.py │ └── test_ip_address.py │ ├── en │ ├── any │ │ └── test_ipn_en.py │ ├── au │ │ ├── test_abn.py │ │ └── test_tfn.py │ ├── ca │ │ └── test_sin.py │ ├── in_ │ │ └── test_aadhaar.py │ └── us │ │ └── test_ssn.py │ ├── es │ ├── any │ │ └── test_ipn_es.py │ ├── es │ │ ├── test_bank_account.py │ │ └── test_govid_es_es.py │ └── mx │ │ └── test_govid_es_mx.py │ ├── pt │ ├── br │ │ └── test_govid_pt_br.py │ └── pt │ │ └── test_govid_pt_pt.py │ └── zh │ └── cn │ ├── test_govid_zh_cn.py │ └── test_misc.py ├── poetry.lock ├── pyproject.toml ├── requirements.txt └── tokenizer ├── python_script ├── dedup_exact_article.py ├── dedup_lines.py ├── ram_dedup_lines.py └── requirements.txt └── scripts ├── 01_remove_deplicated_lines.sh ├── 02_remove_duplicated_lines_dataset_with_dataset_source.sh ├── 03_remove_duplicated_lines_alpha.sh ├── 04_remove_duplicated_lines_alpha _memory.sh ├── 05_remove_duplicated_lines_alpha __v2_memory.sh └── 06_dedup_exact_examples.sh /.github/workflows/add-issue-to-project.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/.github/workflows/add-issue-to-project.yml -------------------------------------------------------------------------------- /.github/workflows/label-with-contact-neede.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/.github/workflows/label-with-contact-neede.yml -------------------------------------------------------------------------------- /.github/workflows/label-with-help-wanted.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/.github/workflows/label-with-help-wanted.yml -------------------------------------------------------------------------------- /.github/workflows/pii-manager.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/.github/workflows/pii-manager.yml -------------------------------------------------------------------------------- /.github/workflows/self-assign.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/.github/workflows/self-assign.yaml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/.gitignore -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/.gitmodules -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/.pre-commit-config.yaml -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/LICENSE -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/Makefile -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/README.md -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ac_dc/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/README.md -------------------------------------------------------------------------------- /ac_dc/anonymization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/anonymization.py -------------------------------------------------------------------------------- /ac_dc/deduplicate/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/deduplicate/README.md -------------------------------------------------------------------------------- /ac_dc/deduplicate/conf/self_deduplicate_ar.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/deduplicate/conf/self_deduplicate_ar.yaml -------------------------------------------------------------------------------- /ac_dc/deduplicate/conf/self_deduplicate_bn.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/deduplicate/conf/self_deduplicate_bn.yaml -------------------------------------------------------------------------------- /ac_dc/deduplicate/conf/self_deduplicate_ca.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/deduplicate/conf/self_deduplicate_ca.yaml -------------------------------------------------------------------------------- /ac_dc/deduplicate/conf/self_deduplicate_en.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/deduplicate/conf/self_deduplicate_en.yaml -------------------------------------------------------------------------------- /ac_dc/deduplicate/conf/self_deduplicate_es.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/deduplicate/conf/self_deduplicate_es.yaml -------------------------------------------------------------------------------- /ac_dc/deduplicate/conf/self_deduplicate_eu.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/deduplicate/conf/self_deduplicate_eu.yaml -------------------------------------------------------------------------------- /ac_dc/deduplicate/conf/self_deduplicate_fr.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/deduplicate/conf/self_deduplicate_fr.yaml -------------------------------------------------------------------------------- /ac_dc/deduplicate/conf/self_deduplicate_gl.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/deduplicate/conf/self_deduplicate_gl.yaml -------------------------------------------------------------------------------- /ac_dc/deduplicate/conf/self_deduplicate_hi.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/deduplicate/conf/self_deduplicate_hi.yaml -------------------------------------------------------------------------------- /ac_dc/deduplicate/conf/self_deduplicate_id.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/deduplicate/conf/self_deduplicate_id.yaml -------------------------------------------------------------------------------- /ac_dc/deduplicate/conf/self_deduplicate_pt.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/deduplicate/conf/self_deduplicate_pt.yaml -------------------------------------------------------------------------------- /ac_dc/deduplicate/conf/self_deduplicate_ur.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/deduplicate/conf/self_deduplicate_ur.yaml -------------------------------------------------------------------------------- /ac_dc/deduplicate/conf/self_deduplicate_vi.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/deduplicate/conf/self_deduplicate_vi.yaml -------------------------------------------------------------------------------- /ac_dc/deduplicate/conf/self_deduplicate_zh.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/deduplicate/conf/self_deduplicate_zh.yaml -------------------------------------------------------------------------------- /ac_dc/deduplicate/deduplicate/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/deduplicate/deduplicate/__init__.py -------------------------------------------------------------------------------- /ac_dc/deduplicate/deduplicate/util.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/deduplicate/deduplicate/util.py -------------------------------------------------------------------------------- /ac_dc/deduplicate/self_deduplicate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/deduplicate/self_deduplicate.py -------------------------------------------------------------------------------- /ac_dc/deduplicate/visualize.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/deduplicate/visualize.ipynb -------------------------------------------------------------------------------- /ac_dc/download_sentencepiece_kenlm_models.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/download_sentencepiece_kenlm_models.py -------------------------------------------------------------------------------- /ac_dc/explanation_filtering_pipeline.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/explanation_filtering_pipeline.pdf -------------------------------------------------------------------------------- /ac_dc/filtering.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/filtering.py -------------------------------------------------------------------------------- /ac_dc/flagged_words.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/flagged_words.py -------------------------------------------------------------------------------- /ac_dc/languages_id.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/languages_id.py -------------------------------------------------------------------------------- /ac_dc/main_filtering.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/main_filtering.py -------------------------------------------------------------------------------- /ac_dc/normalization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/normalization.py -------------------------------------------------------------------------------- /ac_dc/parameters_filtering.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/parameters_filtering.py -------------------------------------------------------------------------------- /ac_dc/person_and_id_anonymization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/person_and_id_anonymization.py -------------------------------------------------------------------------------- /ac_dc/stopwords.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/stopwords.py -------------------------------------------------------------------------------- /ac_dc/test_anonymization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/test_anonymization.py -------------------------------------------------------------------------------- /ac_dc/visualization/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/visualization/README.md -------------------------------------------------------------------------------- /ac_dc/visualization/get_data_for_visualization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/visualization/get_data_for_visualization.py -------------------------------------------------------------------------------- /ac_dc/visualization/visualization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/ac_dc/visualization/visualization.py -------------------------------------------------------------------------------- /bertin/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/README.md -------------------------------------------------------------------------------- /bertin/config.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/config.json -------------------------------------------------------------------------------- /bertin/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/config.py -------------------------------------------------------------------------------- /bertin/configs/base/config.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/configs/base/config.json -------------------------------------------------------------------------------- /bertin/configs/base/tokenizer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/configs/base/tokenizer.json -------------------------------------------------------------------------------- /bertin/configs/large/config.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/configs/large/config.json -------------------------------------------------------------------------------- /bertin/configs/large/tokenizer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/configs/large/tokenizer.json -------------------------------------------------------------------------------- /bertin/convert.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/convert.py -------------------------------------------------------------------------------- /bertin/evaluation/paws.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/evaluation/paws.yaml -------------------------------------------------------------------------------- /bertin/evaluation/run_glue.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/evaluation/run_glue.py -------------------------------------------------------------------------------- /bertin/evaluation/run_ner.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/evaluation/run_ner.ipynb -------------------------------------------------------------------------------- /bertin/evaluation/run_ner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/evaluation/run_ner.py -------------------------------------------------------------------------------- /bertin/evaluation/token.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/evaluation/token.yaml -------------------------------------------------------------------------------- /bertin/evaluation/xnli.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/evaluation/xnli.yaml -------------------------------------------------------------------------------- /bertin/events.out.tfevents.1625704081.t1v-n-a4d97d44-w-0.212075.3.v2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/events.out.tfevents.1625704081.t1v-n-a4d97d44-w-0.212075.3.v2 -------------------------------------------------------------------------------- /bertin/events.out.tfevents.1625704245.t1v-n-a4d97d44-w-0.216676.3.v2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/events.out.tfevents.1625704245.t1v-n-a4d97d44-w-0.216676.3.v2 -------------------------------------------------------------------------------- /bertin/events.out.tfevents.1625705283.t1v-n-a4d97d44-w-0.234462.3.v2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/events.out.tfevents.1625705283.t1v-n-a4d97d44-w-0.234462.3.v2 -------------------------------------------------------------------------------- /bertin/get_embeddings_and_perplexity.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/get_embeddings_and_perplexity.py -------------------------------------------------------------------------------- /bertin/images/bertin-tilt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/images/bertin-tilt.png -------------------------------------------------------------------------------- /bertin/images/bertin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/images/bertin.png -------------------------------------------------------------------------------- /bertin/images/ccnet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/images/ccnet.png -------------------------------------------------------------------------------- /bertin/images/datasets-perp-20-120.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/images/datasets-perp-20-120.png -------------------------------------------------------------------------------- /bertin/images/datasets-perp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/images/datasets-perp.png -------------------------------------------------------------------------------- /bertin/images/datasets-random-comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/images/datasets-random-comparison.png -------------------------------------------------------------------------------- /bertin/images/datasets-wsize.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/images/datasets-wsize.png -------------------------------------------------------------------------------- /bertin/images/perp-p95.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/images/perp-p95.png -------------------------------------------------------------------------------- /bertin/images/perp-resample-gaussian.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/images/perp-resample-gaussian.png -------------------------------------------------------------------------------- /bertin/images/perp-resample-stepwise.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/images/perp-resample-stepwise.png -------------------------------------------------------------------------------- /bertin/images/perplexity_colored_embeddings.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/images/perplexity_colored_embeddings.html -------------------------------------------------------------------------------- /bertin/images/random_512.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/images/random_512.jpg -------------------------------------------------------------------------------- /bertin/mc4/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/mc4/README.md -------------------------------------------------------------------------------- /bertin/mc4/dummy/af/0.0.0/dummy_data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/mc4/dummy/af/0.0.0/dummy_data.zip -------------------------------------------------------------------------------- /bertin/mc4/mc4.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/mc4/mc4.py -------------------------------------------------------------------------------- /bertin/merges.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/merges.txt -------------------------------------------------------------------------------- /bertin/perplexity.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/perplexity.py -------------------------------------------------------------------------------- /bertin/run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/run.sh -------------------------------------------------------------------------------- /bertin/run_mlm_flax.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/run_mlm_flax.py -------------------------------------------------------------------------------- /bertin/run_mlm_flax_stream.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/run_mlm_flax_stream.py -------------------------------------------------------------------------------- /bertin/run_stream.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/run_stream.sh -------------------------------------------------------------------------------- /bertin/special_tokens_map.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/special_tokens_map.json -------------------------------------------------------------------------------- /bertin/tokenizer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/tokenizer.json -------------------------------------------------------------------------------- /bertin/tokenizer_config.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/tokenizer_config.json -------------------------------------------------------------------------------- /bertin/tokens.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/tokens.py -------------------------------------------------------------------------------- /bertin/tokens.py.orig: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/tokens.py.orig -------------------------------------------------------------------------------- /bertin/tsne_plot.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/tsne_plot.py -------------------------------------------------------------------------------- /bertin/utils/dataset_perplexity.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/utils/dataset_perplexity.py -------------------------------------------------------------------------------- /bertin/utils/download_mc4es_sampled.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/utils/download_mc4es_sampled.py -------------------------------------------------------------------------------- /bertin/utils/generate_datasets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/utils/generate_datasets.py -------------------------------------------------------------------------------- /bertin/vocab.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/bertin/vocab.json -------------------------------------------------------------------------------- /cc_pseudo_crawl/get_stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/get_stats.py -------------------------------------------------------------------------------- /cc_pseudo_crawl/language_annotation/python_scripts/annotate_langid_crawl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/language_annotation/python_scripts/annotate_langid_crawl.py -------------------------------------------------------------------------------- /cc_pseudo_crawl/language_annotation/python_scripts/check_wrong_files.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/language_annotation/python_scripts/check_wrong_files.py -------------------------------------------------------------------------------- /cc_pseudo_crawl/language_annotation/python_scripts/compute_stats_langid.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/language_annotation/python_scripts/compute_stats_langid.py -------------------------------------------------------------------------------- /cc_pseudo_crawl/language_annotation/python_scripts/detect_html_lang_attrib.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/language_annotation/python_scripts/detect_html_lang_attrib.py -------------------------------------------------------------------------------- /cc_pseudo_crawl/language_annotation/slurm_scripts/02_detect_html_lang_attrib.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/language_annotation/slurm_scripts/02_detect_html_lang_attrib.slurm -------------------------------------------------------------------------------- /cc_pseudo_crawl/language_annotation/slurm_scripts/job_annotate_langid_crawl.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/language_annotation/slurm_scripts/job_annotate_langid_crawl.sh -------------------------------------------------------------------------------- /cc_pseudo_crawl/processing_notebooks/NigerCongoDS.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/processing_notebooks/NigerCongoDS.ipynb -------------------------------------------------------------------------------- /cc_pseudo_crawl/processing_notebooks/pseudocrawl_nigercongo.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/processing_notebooks/pseudocrawl_nigercongo.ipynb -------------------------------------------------------------------------------- /cc_pseudo_crawl/python_scripts/cc_lookup_next.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/python_scripts/cc_lookup_next.py -------------------------------------------------------------------------------- /cc_pseudo_crawl/python_scripts/cc_lookup_seed.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/python_scripts/cc_lookup_seed.py -------------------------------------------------------------------------------- /cc_pseudo_crawl/python_scripts/check_erros_in_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/python_scripts/check_erros_in_dataset.py -------------------------------------------------------------------------------- /cc_pseudo_crawl/python_scripts/deeper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/python_scripts/deeper.py -------------------------------------------------------------------------------- /cc_pseudo_crawl/python_scripts/divide_in_shards.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/python_scripts/divide_in_shards.py -------------------------------------------------------------------------------- /cc_pseudo_crawl/python_scripts/download_warc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/python_scripts/download_warc.py -------------------------------------------------------------------------------- /cc_pseudo_crawl/python_scripts/exact_deduplicates.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/python_scripts/exact_deduplicates.py -------------------------------------------------------------------------------- /cc_pseudo_crawl/python_scripts/extract_text/extract_text_and_html_metadata.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/python_scripts/extract_text/extract_text_and_html_metadata.py -------------------------------------------------------------------------------- /cc_pseudo_crawl/python_scripts/extract_text/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/python_scripts/extract_text/requirements.txt -------------------------------------------------------------------------------- /cc_pseudo_crawl/python_scripts/finalise.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/python_scripts/finalise.py -------------------------------------------------------------------------------- /cc_pseudo_crawl/python_scripts/load_all_seed_ids.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/python_scripts/load_all_seed_ids.py -------------------------------------------------------------------------------- /cc_pseudo_crawl/python_scripts/merge_seed_shards.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/python_scripts/merge_seed_shards.py -------------------------------------------------------------------------------- /cc_pseudo_crawl/python_scripts/preprocess_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/python_scripts/preprocess_dataset.py -------------------------------------------------------------------------------- /cc_pseudo_crawl/python_scripts/process_for_concatenation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/python_scripts/process_for_concatenation.py -------------------------------------------------------------------------------- /cc_pseudo_crawl/python_scripts/pseudo_crawl_seed_to_lm_dset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/python_scripts/pseudo_crawl_seed_to_lm_dset.py -------------------------------------------------------------------------------- /cc_pseudo_crawl/python_scripts/pseudo_crawl_seed_to_lm_dset_v2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/python_scripts/pseudo_crawl_seed_to_lm_dset_v2.py -------------------------------------------------------------------------------- /cc_pseudo_crawl/python_scripts/redownload_warc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/python_scripts/redownload_warc.py -------------------------------------------------------------------------------- /cc_pseudo_crawl/python_scripts/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3 2 | bs4 3 | datasets 4 | pyathena 5 | surt 6 | tldextract 7 | warcio 8 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/python_scripts/shard_and_compress.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/python_scripts/shard_and_compress.py -------------------------------------------------------------------------------- /cc_pseudo_crawl/python_scripts/shard_by_seed_id.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/python_scripts/shard_by_seed_id.py -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_1/.gitignore -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/DEPTH.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_1/DEPTH.md -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_1/README.md -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/slurm_scripts/check_errors_in_dataset.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_1/slurm_scripts/check_errors_in_dataset.slurm -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/slurm_scripts/divide_in_subshards.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_1/slurm_scripts/divide_in_subshards.slurm -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/slurm_scripts/divide_in_subshards_1000.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_1/slurm_scripts/divide_in_subshards_1000.slurm -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/slurm_scripts/download_warc.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_1/slurm_scripts/download_warc.slurm -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/slurm_scripts/download_warc_too_big.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_1/slurm_scripts/download_warc_too_big.slurm -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/slurm_scripts/download_warc_trial_4.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_1/slurm_scripts/download_warc_trial_4.slurm -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/slurm_scripts/download_warc_trial_5.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_1/slurm_scripts/download_warc_trial_5.slurm -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/slurm_scripts/extract_text_and_html_metadata.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_1/slurm_scripts/extract_text_and_html_metadata.slurm -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/slurm_scripts/merge_seed_shards.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_1/slurm_scripts/merge_seed_shards.slurm -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/slurm_scripts/preprocess_warc.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_1/slurm_scripts/preprocess_warc.slurm -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/slurm_scripts/redownload_warc.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_1/slurm_scripts/redownload_warc.slurm -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/slurm_scripts/shard_and_compress.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_1/slurm_scripts/shard_and_compress.slurm -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/slurm_scripts/shard_by_seed_id.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_1/slurm_scripts/shard_by_seed_id.slurm -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/sourcing_sheet_seeds/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_1/sourcing_sheet_seeds/README.md -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/sourcing_sheet_seeds/candidate_websites_for_crawling.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_1/sourcing_sheet_seeds/candidate_websites_for_crawling.csv -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/sourcing_sheet_seeds/cc-metrics.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_1/sourcing_sheet_seeds/cc-metrics.csv -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/sourcing_sheet_seeds/cc-metrics.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_1/sourcing_sheet_seeds/cc-metrics.ipynb -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/sourcing_sheet_seeds/cleanup-seeds.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_1/sourcing_sheet_seeds/cleanup-seeds.ipynb -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/sourcing_sheet_seeds/filtered_catalogue.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_1/sourcing_sheet_seeds/filtered_catalogue.json -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/sourcing_sheet_seeds/preprocess_dataset.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_1/sourcing_sheet_seeds/preprocess_dataset.ipynb -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/sourcing_sheet_seeds/seeds.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_1/sourcing_sheet_seeds/seeds.csv -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/sourcing_sheet_seeds/test_preprcessing_via_pyarrow_pandas.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_1/sourcing_sheet_seeds/test_preprcessing_via_pyarrow_pandas.ipynb -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1_2/00_clean_dataset.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_1_2/00_clean_dataset.slurm -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1_2/01_exact_deduplicates.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_1_2/01_exact_deduplicates.slurm -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_2/.gitignore -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_2/README.md -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/slurm_scripts/01_download_warc.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_2/slurm_scripts/01_download_warc.slurm -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/slurm_scripts/02_redownload_warc.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_2/slurm_scripts/02_redownload_warc.slurm -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/slurm_scripts/02b_redownload_warc.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_2/slurm_scripts/02b_redownload_warc.slurm -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/slurm_scripts/03_check_errors_in_dataset.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_2/slurm_scripts/03_check_errors_in_dataset.slurm -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/slurm_scripts/04_divide_in_subshards.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_2/slurm_scripts/04_divide_in_subshards.slurm -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/slurm_scripts/05_preprocess_warc.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_2/slurm_scripts/05_preprocess_warc.slurm -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/slurm_scripts/06_extract_text_and_html_metadata.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_2/slurm_scripts/06_extract_text_and_html_metadata.slurm -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/slurm_scripts/07_shard_by_seed_id.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_2/slurm_scripts/07_shard_by_seed_id.slurm -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/slurm_scripts/08_merge_seed_shards.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_2/slurm_scripts/08_merge_seed_shards.slurm -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/slurm_scripts/09_shard_and_compress.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_2/slurm_scripts/09_shard_and_compress.slurm -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/slurm_scripts/10_push_to_hub.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_2/slurm_scripts/10_push_to_hub.slurm -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/sourcing_sheet_seeds/cleanup-seeds.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_2/sourcing_sheet_seeds/cleanup-seeds.ipynb -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/sourcing_sheet_seeds/seeds.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_2/sourcing_sheet_seeds/seeds.csv -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/sourcing_sheet_seeds/seeds_batch_2.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_2/sourcing_sheet_seeds/seeds_batch_2.csv -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/sourcing_sheet_seeds/seeds_batch_2.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/cc_pseudo_crawl/seeds_batch_2/sourcing_sheet_seeds/seeds_batch_2.json -------------------------------------------------------------------------------- /index_search/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/index_search/README.md -------------------------------------------------------------------------------- /index_search/datasets_ES_builder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/index_search/datasets_ES_builder.py -------------------------------------------------------------------------------- /index_search/datasets_ES_index.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/index_search/datasets_ES_index.py -------------------------------------------------------------------------------- /index_search/datasets_ES_search.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/index_search/datasets_ES_search.py -------------------------------------------------------------------------------- /index_search/datasets_remote_ES_IBMcloud.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/index_search/datasets_remote_ES_IBMcloud.py -------------------------------------------------------------------------------- /index_search/docker-compose.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/index_search/docker-compose.yml -------------------------------------------------------------------------------- /index_search/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/index_search/requirements.txt -------------------------------------------------------------------------------- /kenlm_training/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/.gitignore -------------------------------------------------------------------------------- /kenlm_training/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/LICENSE -------------------------------------------------------------------------------- /kenlm_training/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/Makefile -------------------------------------------------------------------------------- /kenlm_training/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/README.md -------------------------------------------------------------------------------- /kenlm_training/cc_net/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/cc_net/__init__.py -------------------------------------------------------------------------------- /kenlm_training/cc_net/__main__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/cc_net/__main__.py -------------------------------------------------------------------------------- /kenlm_training/cc_net/data/cutoff.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/cc_net/data/cutoff.csv -------------------------------------------------------------------------------- /kenlm_training/cc_net/data/test_stats.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/cc_net/data/test_stats.json -------------------------------------------------------------------------------- /kenlm_training/cc_net/dedup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/cc_net/dedup.py -------------------------------------------------------------------------------- /kenlm_training/cc_net/execution.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/cc_net/execution.py -------------------------------------------------------------------------------- /kenlm_training/cc_net/flat_hash_set.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/cc_net/flat_hash_set.py -------------------------------------------------------------------------------- /kenlm_training/cc_net/get_hf_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/cc_net/get_hf_dataset.py -------------------------------------------------------------------------------- /kenlm_training/cc_net/get_wiki_cirrus.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/cc_net/get_wiki_cirrus.py -------------------------------------------------------------------------------- /kenlm_training/cc_net/jsonql.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/cc_net/jsonql.py -------------------------------------------------------------------------------- /kenlm_training/cc_net/mine.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/cc_net/mine.py -------------------------------------------------------------------------------- /kenlm_training/cc_net/minify.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/cc_net/minify.py -------------------------------------------------------------------------------- /kenlm_training/cc_net/perplexity.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/cc_net/perplexity.py -------------------------------------------------------------------------------- /kenlm_training/cc_net/process_wet_file.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/cc_net/process_wet_file.py -------------------------------------------------------------------------------- /kenlm_training/cc_net/regroup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/cc_net/regroup.py -------------------------------------------------------------------------------- /kenlm_training/cc_net/split_by_lang.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/cc_net/split_by_lang.py -------------------------------------------------------------------------------- /kenlm_training/cc_net/text_normalizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/cc_net/text_normalizer.py -------------------------------------------------------------------------------- /kenlm_training/cc_net/tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/cc_net/tokenizer.py -------------------------------------------------------------------------------- /kenlm_training/cc_net/tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kenlm_training/cc_net/tools/dl_cc_100.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/cc_net/tools/dl_cc_100.py -------------------------------------------------------------------------------- /kenlm_training/cc_net/tools/expand_corpus.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/cc_net/tools/expand_corpus.py -------------------------------------------------------------------------------- /kenlm_training/cc_net/tools/make_dmoz_corpus.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/cc_net/tools/make_dmoz_corpus.py -------------------------------------------------------------------------------- /kenlm_training/config/lid_exp.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/config/lid_exp.json -------------------------------------------------------------------------------- /kenlm_training/config/mine_segment.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/config/mine_segment.json -------------------------------------------------------------------------------- /kenlm_training/config/test_reproduce.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/config/test_reproduce.json -------------------------------------------------------------------------------- /kenlm_training/config/test_segment.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/config/test_segment.json -------------------------------------------------------------------------------- /kenlm_training/pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/pyproject.toml -------------------------------------------------------------------------------- /kenlm_training/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/setup.py -------------------------------------------------------------------------------- /kenlm_training/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/tests/__init__.py -------------------------------------------------------------------------------- /kenlm_training/tests/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/tests/conftest.py -------------------------------------------------------------------------------- /kenlm_training/tests/data/sample.warc.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/tests/data/sample.warc.txt -------------------------------------------------------------------------------- /kenlm_training/tests/test_dedup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/tests/test_dedup.py -------------------------------------------------------------------------------- /kenlm_training/tests/test_flat_hash_set.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/tests/test_flat_hash_set.py -------------------------------------------------------------------------------- /kenlm_training/tests/test_jsonql.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/tests/test_jsonql.py -------------------------------------------------------------------------------- /kenlm_training/tests/test_minify.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/tests/test_minify.py -------------------------------------------------------------------------------- /kenlm_training/tests/test_normalizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/tests/test_normalizer.py -------------------------------------------------------------------------------- /kenlm_training/tests/test_parse_wet_file.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/tests/test_parse_wet_file.py -------------------------------------------------------------------------------- /kenlm_training/tests/test_regroup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/tests/test_regroup.py -------------------------------------------------------------------------------- /kenlm_training/tests/test_transformer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/tests/test_transformer.py -------------------------------------------------------------------------------- /kenlm_training/train_all.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/kenlm_training/train_all.sh -------------------------------------------------------------------------------- /perplexity_lenses/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/perplexity_lenses/README.md -------------------------------------------------------------------------------- /perplexity_lenses/app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/perplexity_lenses/app.py -------------------------------------------------------------------------------- /perplexity_lenses/cli.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/perplexity_lenses/cli.py -------------------------------------------------------------------------------- /perplexity_lenses/perplexity_lenses/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.1.0" 2 | REGISTRY_DATASET = "mhtoin/register_oscar" 3 | -------------------------------------------------------------------------------- /perplexity_lenses/perplexity_lenses/data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/perplexity_lenses/perplexity_lenses/data.py -------------------------------------------------------------------------------- /perplexity_lenses/perplexity_lenses/engine.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/perplexity_lenses/perplexity_lenses/engine.py -------------------------------------------------------------------------------- /perplexity_lenses/perplexity_lenses/perplexity.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/perplexity_lenses/perplexity_lenses/perplexity.py -------------------------------------------------------------------------------- /perplexity_lenses/perplexity_lenses/visualization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/perplexity_lenses/perplexity_lenses/visualization.py -------------------------------------------------------------------------------- /perplexity_lenses/poetry.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/perplexity_lenses/poetry.lock -------------------------------------------------------------------------------- /perplexity_lenses/pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/perplexity_lenses/pyproject.toml -------------------------------------------------------------------------------- /perplexity_lenses/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/perplexity_lenses/requirements.txt -------------------------------------------------------------------------------- /perplexity_lenses/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /perplexity_lenses/tests/test_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/perplexity_lenses/tests/test_data.py -------------------------------------------------------------------------------- /pii-manager/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/.gitignore -------------------------------------------------------------------------------- /pii-manager/CHANGES.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/CHANGES.md -------------------------------------------------------------------------------- /pii-manager/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/LICENSE -------------------------------------------------------------------------------- /pii-manager/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | -------------------------------------------------------------------------------- /pii-manager/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/Makefile -------------------------------------------------------------------------------- /pii-manager/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/README.md -------------------------------------------------------------------------------- /pii-manager/doc/contributing.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/doc/contributing.md -------------------------------------------------------------------------------- /pii-manager/doc/external.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/doc/external.md -------------------------------------------------------------------------------- /pii-manager/doc/tasks.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/doc/tasks.md -------------------------------------------------------------------------------- /pii-manager/doc/usage.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/doc/usage.md -------------------------------------------------------------------------------- /pii-manager/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/requirements.txt -------------------------------------------------------------------------------- /pii-manager/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/setup.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/__init__.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/api/__init__.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/api/file.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/api/file.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/api/manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/api/manager.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/app/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/app/manage.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/app/manage.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/app/task_info.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/app/task_info.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/helper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/helper/__init__.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/helper/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/helper/base.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/helper/context.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/helper/context.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/helper/exception.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/helper/exception.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/helper/json.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/helper/json.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/helper/normalizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/helper/normalizer.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/helper/taskdict.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/helper/taskdict.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/helper/types.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/helper/types.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/lang/__init__.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/any/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/any/bitcoin_address.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/lang/any/bitcoin_address.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/any/credit_card.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/lang/any/credit_card.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/any/email.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/lang/any/email.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/any/ip_address.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/lang/any/ip_address.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/en/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/en/any/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/en/any/international_phone_number.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/lang/en/any/international_phone_number.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/en/au/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/en/au/abn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/lang/en/au/abn.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/en/au/tfn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/lang/en/au/tfn.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/en/ca/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/en/ca/social_insurance_number.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/lang/en/ca/social_insurance_number.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/en/in_/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/en/in_/aadhaar.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/lang/en/in_/aadhaar.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/en/us/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/en/us/social_security_number.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/lang/en/us/social_security_number.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/es/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/es/any/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/es/any/international_phone_number.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/lang/es/any/international_phone_number.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/es/es/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/es/es/bank_account.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/lang/es/es/bank_account.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/es/es/govid.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/lang/es/es/govid.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/es/mx/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/es/mx/curp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/lang/es/mx/curp.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/fr/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/fr/ca/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/fr/ca/social_insurance_number.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/lang/fr/ca/social_insurance_number.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/pt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/lang/pt/__init__.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/pt/br/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/pt/br/cpf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/lang/pt/br/cpf.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/pt/pt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/pt/pt/govid.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/lang/pt/pt/govid.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/zh/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/zh/cn/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/zh/cn/gov_id.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/lang/zh/cn/gov_id.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/zh/cn/misc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/lang/zh/cn/misc.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/piientity.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/piientity.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/piienum.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/src/pii_manager/piienum.py -------------------------------------------------------------------------------- /pii-manager/test/data/extract-block.ndjson: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/data/extract-block.ndjson -------------------------------------------------------------------------------- /pii-manager/test/data/extract-line.ndjson: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/data/extract-line.ndjson -------------------------------------------------------------------------------- /pii-manager/test/data/extract-sentence.ndjson: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/data/extract-sentence.ndjson -------------------------------------------------------------------------------- /pii-manager/test/data/full-block.ndjson: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/data/full-block.ndjson -------------------------------------------------------------------------------- /pii-manager/test/data/full-line.ndjson: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/data/full-line.ndjson -------------------------------------------------------------------------------- /pii-manager/test/data/full-sentence.ndjson: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/data/full-sentence.ndjson -------------------------------------------------------------------------------- /pii-manager/test/data/orig.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/data/orig.txt -------------------------------------------------------------------------------- /pii-manager/test/data/replace.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/data/replace.txt -------------------------------------------------------------------------------- /pii-manager/test/data/tag.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/data/tag.txt -------------------------------------------------------------------------------- /pii-manager/test/data/taskfile-error.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/data/taskfile-error.json -------------------------------------------------------------------------------- /pii-manager/test/data/taskfile.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/data/taskfile.json -------------------------------------------------------------------------------- /pii-manager/test/unit/api/test_file.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/unit/api/test_file.py -------------------------------------------------------------------------------- /pii-manager/test/unit/api/test_file_taskfile.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/unit/api/test_file_taskfile.py -------------------------------------------------------------------------------- /pii-manager/test/unit/api/test_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/unit/api/test_manager.py -------------------------------------------------------------------------------- /pii-manager/test/unit/api/test_manager_add.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/unit/api/test_manager_add.py -------------------------------------------------------------------------------- /pii-manager/test/unit/api/test_manager_ctx.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/unit/api/test_manager_ctx.py -------------------------------------------------------------------------------- /pii-manager/test/unit/helper/test_base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/unit/helper/test_base.py -------------------------------------------------------------------------------- /pii-manager/test/unit/helper/test_context.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/unit/helper/test_context.py -------------------------------------------------------------------------------- /pii-manager/test/unit/helper/test_norm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/unit/helper/test_norm.py -------------------------------------------------------------------------------- /pii-manager/test/unit/helper/test_taskdict.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/unit/helper/test_taskdict.py -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/any/test_bitcoin_address.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/unit/lang/any/test_bitcoin_address.py -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/any/test_credit_card.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/unit/lang/any/test_credit_card.py -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/any/test_email.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/unit/lang/any/test_email.py -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/any/test_ip_address.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/unit/lang/any/test_ip_address.py -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/en/any/test_ipn_en.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/unit/lang/en/any/test_ipn_en.py -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/en/au/test_abn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/unit/lang/en/au/test_abn.py -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/en/au/test_tfn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/unit/lang/en/au/test_tfn.py -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/en/ca/test_sin.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/unit/lang/en/ca/test_sin.py -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/en/in_/test_aadhaar.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/unit/lang/en/in_/test_aadhaar.py -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/en/us/test_ssn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/unit/lang/en/us/test_ssn.py -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/es/any/test_ipn_es.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/unit/lang/es/any/test_ipn_es.py -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/es/es/test_bank_account.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/unit/lang/es/es/test_bank_account.py -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/es/es/test_govid_es_es.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/unit/lang/es/es/test_govid_es_es.py -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/es/mx/test_govid_es_mx.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/unit/lang/es/mx/test_govid_es_mx.py -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/pt/br/test_govid_pt_br.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/unit/lang/pt/br/test_govid_pt_br.py -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/pt/pt/test_govid_pt_pt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/unit/lang/pt/pt/test_govid_pt_pt.py -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/zh/cn/test_govid_zh_cn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/unit/lang/zh/cn/test_govid_zh_cn.py -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/zh/cn/test_misc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pii-manager/test/unit/lang/zh/cn/test_misc.py -------------------------------------------------------------------------------- /poetry.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/poetry.lock -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/pyproject.toml -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/requirements.txt -------------------------------------------------------------------------------- /tokenizer/python_script/dedup_exact_article.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/tokenizer/python_script/dedup_exact_article.py -------------------------------------------------------------------------------- /tokenizer/python_script/dedup_lines.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/tokenizer/python_script/dedup_lines.py -------------------------------------------------------------------------------- /tokenizer/python_script/ram_dedup_lines.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/tokenizer/python_script/ram_dedup_lines.py -------------------------------------------------------------------------------- /tokenizer/python_script/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets>=1.18.0 2 | pyarrow>=6.0.0 3 | -------------------------------------------------------------------------------- /tokenizer/scripts/01_remove_deplicated_lines.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/tokenizer/scripts/01_remove_deplicated_lines.sh -------------------------------------------------------------------------------- /tokenizer/scripts/02_remove_duplicated_lines_dataset_with_dataset_source.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/tokenizer/scripts/02_remove_duplicated_lines_dataset_with_dataset_source.sh -------------------------------------------------------------------------------- /tokenizer/scripts/03_remove_duplicated_lines_alpha.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/tokenizer/scripts/03_remove_duplicated_lines_alpha.sh -------------------------------------------------------------------------------- /tokenizer/scripts/04_remove_duplicated_lines_alpha _memory.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/tokenizer/scripts/04_remove_duplicated_lines_alpha _memory.sh -------------------------------------------------------------------------------- /tokenizer/scripts/05_remove_duplicated_lines_alpha __v2_memory.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/tokenizer/scripts/05_remove_duplicated_lines_alpha __v2_memory.sh -------------------------------------------------------------------------------- /tokenizer/scripts/06_dedup_exact_examples.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/HEAD/tokenizer/scripts/06_dedup_exact_examples.sh --------------------------------------------------------------------------------