├── .github └── workflows │ ├── add-issue-to-project.yml │ ├── label-with-contact-neede.yml │ ├── label-with-help-wanted.yml │ ├── pii-manager.yml │ └── self-assign.yaml ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── LICENSE ├── Makefile ├── README.md ├── __init__.py ├── ac_dc ├── README.md ├── anonymization.py ├── deduplicate │ ├── README.md │ ├── conf │ │ ├── self_deduplicate_ar.yaml │ │ ├── self_deduplicate_bn.yaml │ │ ├── self_deduplicate_ca.yaml │ │ ├── self_deduplicate_en.yaml │ │ ├── self_deduplicate_es.yaml │ │ ├── self_deduplicate_eu.yaml │ │ ├── self_deduplicate_fr.yaml │ │ ├── self_deduplicate_gl.yaml │ │ ├── self_deduplicate_hi.yaml │ │ ├── self_deduplicate_id.yaml │ │ ├── self_deduplicate_pt.yaml │ │ ├── self_deduplicate_ur.yaml │ │ ├── self_deduplicate_vi.yaml │ │ └── self_deduplicate_zh.yaml │ ├── deduplicate │ │ ├── __init__.py │ │ └── util.py │ ├── self_deduplicate.py │ └── visualize.ipynb ├── download_sentencepiece_kenlm_models.py ├── explanation_filtering_pipeline.pdf ├── filtering.py ├── flagged_words.py ├── languages_id.py ├── main_filtering.py ├── normalization.py ├── parameters_filtering.py ├── person_and_id_anonymization.py ├── stopwords.py ├── test_anonymization.py └── visualization │ ├── README.md │ ├── get_data_for_visualization.py │ └── visualization.py ├── bertin ├── README.md ├── config.json ├── config.py ├── configs │ ├── base │ │ ├── config.json │ │ └── tokenizer.json │ └── large │ │ ├── config.json │ │ └── tokenizer.json ├── convert.py ├── evaluation │ ├── paws.yaml │ ├── run_glue.py │ ├── run_ner.ipynb │ ├── run_ner.py │ ├── token.yaml │ └── xnli.yaml ├── events.out.tfevents.1625704081.t1v-n-a4d97d44-w-0.212075.3.v2 ├── events.out.tfevents.1625704245.t1v-n-a4d97d44-w-0.216676.3.v2 ├── events.out.tfevents.1625705283.t1v-n-a4d97d44-w-0.234462.3.v2 ├── get_embeddings_and_perplexity.py ├── images │ ├── bertin-tilt.png │ ├── bertin.png │ ├── ccnet.png │ ├── datasets-perp-20-120.png │ ├── datasets-perp.png │ ├── datasets-random-comparison.png │ ├── datasets-wsize.png │ ├── perp-p95.png │ ├── perp-resample-gaussian.png │ ├── perp-resample-stepwise.png │ ├── perplexity_colored_embeddings.html │ └── random_512.jpg ├── mc4 │ ├── README.md │ ├── dummy │ │ └── af │ │ │ └── 0.0.0 │ │ │ └── dummy_data.zip │ └── mc4.py ├── merges.txt ├── perplexity.py ├── run.sh ├── run_mlm_flax.py ├── run_mlm_flax_stream.py ├── run_stream.sh ├── special_tokens_map.json ├── tokenizer.json ├── tokenizer_config.json ├── tokens.py ├── tokens.py.orig ├── tsne_plot.py ├── utils │ ├── dataset_perplexity.py │ ├── download_mc4es_sampled.py │ └── generate_datasets.py └── vocab.json ├── cc_pseudo_crawl ├── get_stats.py ├── language_annotation │ ├── python_scripts │ │ ├── annotate_langid_crawl.py │ │ ├── check_wrong_files.py │ │ ├── compute_stats_langid.py │ │ └── detect_html_lang_attrib.py │ └── slurm_scripts │ │ ├── 02_detect_html_lang_attrib.slurm │ │ └── job_annotate_langid_crawl.sh ├── processing_notebooks │ ├── NigerCongoDS.ipynb │ └── pseudocrawl_nigercongo.ipynb ├── python_scripts │ ├── cc_lookup_next.py │ ├── cc_lookup_seed.py │ ├── check_erros_in_dataset.py │ ├── deeper.py │ ├── divide_in_shards.py │ ├── download_warc.py │ ├── exact_deduplicates.py │ ├── extract_text │ │ ├── extract_text_and_html_metadata.py │ │ └── requirements.txt │ ├── finalise.py │ ├── load_all_seed_ids.py │ ├── merge_seed_shards.py │ ├── preprocess_dataset.py │ ├── process_for_concatenation.py │ ├── pseudo_crawl_seed_to_lm_dset.py │ ├── pseudo_crawl_seed_to_lm_dset_v2.py │ ├── redownload_warc.py │ ├── 
requirements.txt │ ├── shard_and_compress.py │ └── shard_by_seed_id.py ├── seeds_batch_1 │ ├── .gitignore │ ├── DEPTH.md │ ├── README.md │ ├── slurm_scripts │ │ ├── check_errors_in_dataset.slurm │ │ ├── divide_in_subshards.slurm │ │ ├── divide_in_subshards_1000.slurm │ │ ├── download_warc.slurm │ │ ├── download_warc_too_big.slurm │ │ ├── download_warc_trial_4.slurm │ │ ├── download_warc_trial_5.slurm │ │ ├── extract_text_and_html_metadata.slurm │ │ ├── merge_seed_shards.slurm │ │ ├── preprocess_warc.slurm │ │ ├── redownload_warc.slurm │ │ ├── shard_and_compress.slurm │ │ └── shard_by_seed_id.slurm │ └── sourcing_sheet_seeds │ │ ├── README.md │ │ ├── candidate_websites_for_crawling.csv │ │ ├── cc-metrics.csv │ │ ├── cc-metrics.ipynb │ │ ├── cleanup-seeds.ipynb │ │ ├── filtered_catalogue.json │ │ ├── preprocess_dataset.ipynb │ │ ├── seeds.csv │ │ └── test_preprcessing_via_pyarrow_pandas.ipynb ├── seeds_batch_1_2 │ ├── 00_clean_dataset.slurm │ └── 01_exact_deduplicates.slurm └── seeds_batch_2 │ ├── .gitignore │ ├── README.md │ ├── slurm_scripts │ ├── 01_download_warc.slurm │ ├── 02_redownload_warc.slurm │ ├── 02b_redownload_warc.slurm │ ├── 03_check_errors_in_dataset.slurm │ ├── 04_divide_in_subshards.slurm │ ├── 05_preprocess_warc.slurm │ ├── 06_extract_text_and_html_metadata.slurm │ ├── 07_shard_by_seed_id.slurm │ ├── 08_merge_seed_shards.slurm │ ├── 09_shard_and_compress.slurm │ └── 10_push_to_hub.slurm │ └── sourcing_sheet_seeds │ ├── cleanup-seeds.ipynb │ ├── seeds.csv │ ├── seeds_batch_2.csv │ └── seeds_batch_2.json ├── index_search ├── README.md ├── datasets_ES_builder.py ├── datasets_ES_index.py ├── datasets_ES_search.py ├── datasets_remote_ES_IBMcloud.py ├── docker-compose.yml └── requirements.txt ├── kenlm_training ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── cc_net │ ├── __init__.py │ ├── __main__.py │ ├── data │ │ ├── cutoff.csv │ │ └── test_stats.json │ ├── dedup.py │ ├── execution.py │ ├── flat_hash_set.py │ ├── get_hf_dataset.py │ ├── get_wiki_cirrus.py │ ├── jsonql.py │ ├── mine.py │ ├── minify.py │ ├── perplexity.py │ ├── process_wet_file.py │ ├── regroup.py │ ├── split_by_lang.py │ ├── text_normalizer.py │ ├── tokenizer.py │ └── tools │ │ ├── __init__.py │ │ ├── dl_cc_100.py │ │ ├── expand_corpus.py │ │ └── make_dmoz_corpus.py ├── config │ ├── lid_exp.json │ ├── mine_segment.json │ ├── test_reproduce.json │ └── test_segment.json ├── pyproject.toml ├── setup.py ├── tests │ ├── __init__.py │ ├── conftest.py │ ├── data │ │ └── sample.warc.txt │ ├── test_dedup.py │ ├── test_flat_hash_set.py │ ├── test_jsonql.py │ ├── test_minify.py │ ├── test_normalizer.py │ ├── test_parse_wet_file.py │ ├── test_regroup.py │ └── test_transformer.py └── train_all.sh ├── perplexity_lenses ├── README.md ├── app.py ├── cli.py ├── perplexity_lenses │ ├── __init__.py │ ├── data.py │ ├── engine.py │ ├── perplexity.py │ └── visualization.py ├── poetry.lock ├── pyproject.toml ├── requirements.txt └── tests │ ├── __init__.py │ └── test_data.py ├── pii-manager ├── .gitignore ├── CHANGES.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── doc │ ├── contributing.md │ ├── external.md │ ├── tasks.md │ └── usage.md ├── requirements.txt ├── setup.py ├── src │ └── pii_manager │ │ ├── __init__.py │ │ ├── api │ │ ├── __init__.py │ │ ├── file.py │ │ └── manager.py │ │ ├── app │ │ ├── __init__.py │ │ ├── manage.py │ │ └── task_info.py │ │ ├── helper │ │ ├── __init__.py │ │ ├── base.py │ │ ├── context.py │ │ ├── exception.py │ │ ├── json.py │ │ ├── normalizer.py │ │ ├── taskdict.py │ │ └── 
types.py │ │ ├── lang │ │ ├── __init__.py │ │ ├── any │ │ │ ├── __init__.py │ │ │ ├── bitcoin_address.py │ │ │ ├── credit_card.py │ │ │ ├── email.py │ │ │ └── ip_address.py │ │ ├── en │ │ │ ├── __init__.py │ │ │ ├── any │ │ │ │ ├── __init__.py │ │ │ │ └── international_phone_number.py │ │ │ ├── au │ │ │ │ ├── __init__.py │ │ │ │ ├── abn.py │ │ │ │ └── tfn.py │ │ │ ├── ca │ │ │ │ ├── __init__.py │ │ │ │ └── social_insurance_number.py │ │ │ ├── in_ │ │ │ │ ├── __init__.py │ │ │ │ └── aadhaar.py │ │ │ └── us │ │ │ │ ├── __init__.py │ │ │ │ └── social_security_number.py │ │ ├── es │ │ │ ├── __init__.py │ │ │ ├── any │ │ │ │ ├── __init__.py │ │ │ │ └── international_phone_number.py │ │ │ ├── es │ │ │ │ ├── __init__.py │ │ │ │ ├── bank_account.py │ │ │ │ └── govid.py │ │ │ └── mx │ │ │ │ ├── __init__.py │ │ │ │ └── curp.py │ │ ├── fr │ │ │ ├── __init__.py │ │ │ └── ca │ │ │ │ ├── __init__.py │ │ │ │ └── social_insurance_number.py │ │ ├── pt │ │ │ ├── __init__.py │ │ │ ├── br │ │ │ │ ├── __init__.py │ │ │ │ └── cpf.py │ │ │ └── pt │ │ │ │ ├── __init__.py │ │ │ │ └── govid.py │ │ └── zh │ │ │ ├── __init__.py │ │ │ └── cn │ │ │ ├── __init__.py │ │ │ ├── gov_id.py │ │ │ └── misc.py │ │ ├── piientity.py │ │ └── piienum.py └── test │ ├── data │ ├── extract-block.ndjson │ ├── extract-line.ndjson │ ├── extract-sentence.ndjson │ ├── full-block.ndjson │ ├── full-line.ndjson │ ├── full-sentence.ndjson │ ├── orig.txt │ ├── replace.txt │ ├── tag.txt │ ├── taskfile-error.json │ └── taskfile.json │ └── unit │ ├── api │ ├── test_file.py │ ├── test_file_taskfile.py │ ├── test_manager.py │ ├── test_manager_add.py │ └── test_manager_ctx.py │ ├── helper │ ├── test_base.py │ ├── test_context.py │ ├── test_norm.py │ └── test_taskdict.py │ └── lang │ ├── any │ ├── test_bitcoin_address.py │ ├── test_credit_card.py │ ├── test_email.py │ └── test_ip_address.py │ ├── en │ ├── any │ │ └── test_ipn_en.py │ ├── au │ │ ├── test_abn.py │ │ └── test_tfn.py │ ├── ca │ │ └── test_sin.py │ ├── in_ │ │ └── test_aadhaar.py │ └── us │ │ └── test_ssn.py │ ├── es │ ├── any │ │ └── test_ipn_es.py │ ├── es │ │ ├── test_bank_account.py │ │ └── test_govid_es_es.py │ └── mx │ │ └── test_govid_es_mx.py │ ├── pt │ ├── br │ │ └── test_govid_pt_br.py │ └── pt │ │ └── test_govid_pt_pt.py │ └── zh │ └── cn │ ├── test_govid_zh_cn.py │ └── test_misc.py ├── poetry.lock ├── pyproject.toml ├── requirements.txt └── tokenizer ├── python_script ├── dedup_exact_article.py ├── dedup_lines.py ├── ram_dedup_lines.py └── requirements.txt └── scripts ├── 01_remove_deplicated_lines.sh ├── 02_remove_duplicated_lines_dataset_with_dataset_source.sh ├── 03_remove_duplicated_lines_alpha.sh ├── 04_remove_duplicated_lines_alpha _memory.sh ├── 05_remove_duplicated_lines_alpha __v2_memory.sh └── 06_dedup_exact_examples.sh /.github/workflows/label-with-contact-neede.yml: -------------------------------------------------------------------------------- 1 | name: Label with contact needed 2 | on: 3 | issue_comment: 4 | types: created 5 | jobs: 6 | one: 7 | runs-on: ubuntu-latest 8 | if: >- 9 | (github.event.comment.body == '#contact' || 10 | github.event.comment.body == '#contact-needed') 11 | steps: 12 | - run: | 13 | echo "Labeling issue ${{ github.event.issue.number }} with 'contact needed'" 14 | curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"labels": ["contact needed"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/labels 15 | 
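The labeling and self-assign workflows in .github/workflows all drive the same GitHub issues REST endpoint through curl. For testing that call outside of Actions, here is a minimal Python sketch of the equivalent request; the token environment variable, issue number, and label below are placeholders rather than values taken from this repository.

```python
# Minimal sketch of the REST call the workflow's curl step performs.
# Assumes a token in the GITHUB_TOKEN environment variable; the issue
# number and label used at the bottom are placeholders.
import os

import requests


def add_labels(repo: str, issue_number: int, labels: list) -> None:
    """POST labels to an issue, mirroring the workflow's curl command."""
    response = requests.post(
        f"https://api.github.com/repos/{repo}/issues/{issue_number}/labels",
        headers={"Authorization": f"token {os.environ['GITHUB_TOKEN']}"},
        json={"labels": labels},
    )
    response.raise_for_status()


if __name__ == "__main__":
    add_labels("bigscience-workshop/data_tooling", 123, ["contact needed"])
```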
-------------------------------------------------------------------------------- /.github/workflows/label-with-help-wanted.yml: -------------------------------------------------------------------------------- 1 | name: Label with help wanted 2 | on: 3 | issue_comment: 4 | types: created 5 | jobs: 6 | one: 7 | runs-on: ubuntu-latest 8 | if: >- 9 | (github.event.comment.body == '#help' || 10 | github.event.comment.body == '#help-wanted') 11 | steps: 12 | - run: | 13 | echo "Labeling issue ${{ github.event.issue.number }} with 'help wanted'" 14 | curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"labels": ["help wanted"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/labels 15 | -------------------------------------------------------------------------------- /.github/workflows/pii-manager.yml: -------------------------------------------------------------------------------- 1 | on: 2 | pull_request: 3 | branches: 4 | - master 5 | paths: 6 | - 'pii-manager/src/**' 7 | - 'pii-manager/test/**' 8 | - 'pii-manager/setup.py' 9 | - 'pii-manager/Makefile' 10 | - 'pii-manager/requirements.txt' 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | max-parallel: 4 17 | matrix: 18 | python-version: [3.8] 19 | 20 | steps: 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v1 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Checkout main repository 26 | uses: actions/checkout@v2 27 | - name: Create venv 28 | run: | 29 | cd pii-manager 30 | VENV="$GITHUB_WORKSPACE/venv" make venv 31 | - name: Install package 32 | run: | 33 | cd pii-manager 34 | VENV="$GITHUB_WORKSPACE/venv" make install 35 | - name: Test with pytest 36 | run: | 37 | cd pii-manager 38 | VENV="$GITHUB_WORKSPACE/venv" make unit-verbose 39 | -------------------------------------------------------------------------------- /.github/workflows/self-assign.yaml: -------------------------------------------------------------------------------- 1 | name: Self-assign 2 | on: 3 | issue_comment: 4 | types: created 5 | jobs: 6 | one: 7 | runs-on: ubuntu-latest 8 | if: >- 9 | (github.event.comment.body == '#take' || 10 | github.event.comment.body == '#self-assign') 11 | steps: 12 | - run: | 13 | echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}" 14 | curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees 15 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "ac_dc/muliwai"] 2 | path = ac_dc/muliwai 3 | url = https://github.com/ontocord/muliwai 4 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # To use: 2 | # 3 | # pre-commit run -a 4 | # 5 | # Or: 6 | # 7 | # pre-commit install # (runs every time you commit in git) 8 | # 9 | # To update this file: 10 | # 11 | # pre-commit autoupdate 12 | # 13 | # See https://github.com/pre-commit/pre-commit 14 | 15 | repos: 16 | # Standard hooks 17 | - repo: https://github.com/pre-commit/pre-commit-hooks 18 | rev: v4.2.0 19 | hooks: 20 | - id: check-added-large-files 21 | - id: 
check-case-conflict 22 | - id: check-docstring-first 23 | exclude: ^pii_processing/ 24 | - id: check-merge-conflict 25 | - id: check-symlinks 26 | - id: check-toml 27 | - id: check-yaml 28 | - id: debug-statements 29 | exclude: ^pii_processing/ 30 | - id: end-of-file-fixer 31 | exclude: ^pii_processing/ 32 | - id: mixed-line-ending 33 | - id: requirements-txt-fixer 34 | - id: trailing-whitespace 35 | exclude: ^pii_processing/ 36 | 37 | - repo: https://github.com/asottile/pyupgrade 38 | rev: v2.32.1 39 | hooks: 40 | - id: pyupgrade 41 | exclude: ^pii_processing/ 42 | 43 | #- repo: https://github.com/PyCQA/isort 44 | # rev: 5.10.0 45 | # hooks: 46 | # - id: isort 47 | 48 | # Black, the code formatter, natively supports pre-commit 49 | - repo: https://github.com/psf/black 50 | rev: 22.3.0 # Keep in sync with blacken-docs 51 | hooks: 52 | - id: black 53 | exclude: ^pii_processing/ 54 | 55 | # Changes tabs to spaces 56 | - repo: https://github.com/Lucas-C/pre-commit-hooks 57 | rev: v1.1.14 58 | hooks: 59 | - id: remove-tabs 60 | exclude: ^(pii_processing|.*Makefile) 61 | 62 | - repo: https://github.com/shellcheck-py/shellcheck-py 63 | rev: v0.8.0.4 64 | hooks: 65 | - id: shellcheck 66 | exclude: ^(pii_processing/|cc_pseudo_crawl) 67 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: init 2 | init: 3 | poetry install --extras "torch" 4 | pre-commit install 5 | 6 | .PHONY: format 7 | format: 8 | pre-commit run -a 9 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/__init__.py -------------------------------------------------------------------------------- /ac_dc/anonymization.py: -------------------------------------------------------------------------------- 1 | from muliwai.pii_regexes import detect_ner_with_regex_and_context 2 | from muliwai.pii_regexes import regex_rulebase 3 | 4 | trannum = str.maketrans("0123456789", "1111111111") 5 | 6 | 7 | def apply_regex_anonymization( 8 | sentence: str, 9 | lang_id: str, 10 | context_window: int = 20, 11 | anonymize_condition=None, 12 | tag_type={"IP_ADDRESS", "KEY", "ID", "PHONE", "USER", "EMAIL", "LICENSE_PLATE"}, 13 | ) -> tuple: 14 | """ 15 | Params: 16 | ================== 17 | sentence: str, the sentence to be anonymized 18 | lang_id: str, the language id of the sentence 19 | context_window: int, the context window size 20 | anonymize_condition: function, the anonymization condition 21 | tag_type: iterable, the tag types to anonymize; defaults to all keys in regex_rulebase when None 22 | """ 23 | if tag_type is None: 24 | tag_type = regex_rulebase.keys() 25 | lang_id = lang_id.split("_")[0] 26 | ner = detect_ner_with_regex_and_context( 27 | sentence=sentence, 28 | src_lang=lang_id, 29 | context_window=context_window, 30 | tag_type=tag_type, 31 | ) 32 | if anonymize_condition: 33 | for (ent, start, end, tag) in ner: 34 | # we need to actually walk through and replace by start, end span.
35 | sentence = sentence.replace(ent, f" <{tag}> ") 36 | return sentence, ner 37 | -------------------------------------------------------------------------------- /ac_dc/deduplicate/conf/self_deduplicate_ar.yaml: -------------------------------------------------------------------------------- 1 | tokenization: "character" # character, punctuation or space 2 | window_size: 6 # size of the token window, average arabic word length is 5 3 | hamming_distance: 4 # similarity threshold out of 64 bits 4 | num_blocks: 6 # must be larger than the hamming_distance 5 | ignore_punctuation: true # ignore punctuation when hashing, cannot be true when punctuation is used for tokenization 6 | lowercase: true # lowercase the text when hashing 7 | text_column: "text" # column name for the text to be hashed 8 | index_column: "id" # column name for the index 9 | num_proc: 96 # number of processes to run when hashing 10 | load_dataset: 11 | path: null 12 | name: null 13 | split: null 14 | use_auth_token: false 15 | load_from_disk: 16 | path: "data/oscar_filtered_final/ar" 17 | gcs: null 18 | cache: "outputs/ar_cache" 19 | output: "outputs/ar" 20 | -------------------------------------------------------------------------------- /ac_dc/deduplicate/conf/self_deduplicate_bn.yaml: -------------------------------------------------------------------------------- 1 | tokenization: "character" # character, punctuation or space 2 | window_size: 6 # size of the token window, average arabic word length is 5 3 | hamming_distance: 4 # similarity threshold out of 64 bits 4 | num_blocks: 6 # must be larger than the hamming_distance 5 | ignore_punctuation: true # ignore punctuation when hashing, cannot be true when punctuation is used for tokenization 6 | lowercase: true # lowercase the text when hashing 7 | text_column: "text" # column name for the text to be hashed 8 | index_column: "id" # column name for the index 9 | num_proc: 96 # number of processes to run when hashing 10 | load_dataset: 11 | path: null 12 | name: null 13 | split: null 14 | use_auth_token: false 15 | load_from_disk: 16 | path: "data/oscar_filtered_final/bn" 17 | gcs: null 18 | cache: "outputs/bn_cache" 19 | output: "outputs/bn" 20 | -------------------------------------------------------------------------------- /ac_dc/deduplicate/conf/self_deduplicate_ca.yaml: -------------------------------------------------------------------------------- 1 | tokenization: "character" # character, punctuation or space 2 | window_size: 6 # size of the token window, average arabic word length is 5 3 | hamming_distance: 4 # similarity threshold out of 64 bits 4 | num_blocks: 6 # must be larger than the hamming_distance 5 | ignore_punctuation: true # ignore punctuation when hashing, cannot be true when punctuation is used for tokenization 6 | lowercase: true # lowercase the text when hashing 7 | text_column: "text" # column name for the text to be hashed 8 | index_column: "id" # column name for the index 9 | num_proc: 96 # number of processes to run when hashing 10 | load_dataset: 11 | path: null 12 | name: null 13 | split: null 14 | use_auth_token: false 15 | load_from_disk: 16 | path: "data/oscar_filtered_final/ca" 17 | gcs: null 18 | cache: "outputs/ca_cache" 19 | output: "outputs/ca" 20 | -------------------------------------------------------------------------------- /ac_dc/deduplicate/conf/self_deduplicate_en.yaml: -------------------------------------------------------------------------------- 1 | tokenization: "space" # character, punctuation or space 2 | 
window_size: 6 # size of the token window, average arabic word length is 5 3 | hamming_distance: 4 # similarity threshold out of 64 bits 4 | num_blocks: 6 # must be larger than the hamming_distance 5 | ignore_punctuation: true # ignore punctuation when hashing, cannot be true when punctuation is used for tokenization 6 | lowercase: true # lowercase the text when hashing 7 | text_column: "text" # column name for the text to be hashed 8 | index_column: "id" # column name for the index 9 | num_proc: 96 # number of processes to run when hashing 10 | load_dataset: 11 | path: null 12 | name: null 13 | split: null 14 | use_auth_token: false 15 | load_from_disk: 16 | path: "data/oscar_filtered_final/en" 17 | gcs: null 18 | cache: "outputs/en_cache" 19 | output: "outputs/en" 20 | -------------------------------------------------------------------------------- /ac_dc/deduplicate/conf/self_deduplicate_es.yaml: -------------------------------------------------------------------------------- 1 | tokenization: "character" # character, punctuation or space 2 | window_size: 6 # size of the token window, average arabic word length is 5 3 | hamming_distance: 4 # similarity threshold out of 64 bits 4 | num_blocks: 6 # must be larger than the hamming_distance 5 | ignore_punctuation: true # ignore punctuation when hashing, cannot be true when punctuation is used for tokenization 6 | lowercase: true # lowercase the text when hashing 7 | text_column: "text" # column name for the text to be hashed 8 | index_column: "id" # column name for the index 9 | num_proc: 96 # number of processes to run when hashing 10 | load_dataset: 11 | path: null 12 | name: null 13 | split: null 14 | use_auth_token: false 15 | load_from_disk: 16 | path: "data/oscar_filtered_final/es" 17 | gcs: null 18 | cache: "outputs/es_cache" 19 | output: "outputs/es" 20 | -------------------------------------------------------------------------------- /ac_dc/deduplicate/conf/self_deduplicate_eu.yaml: -------------------------------------------------------------------------------- 1 | tokenization: "character" # character, punctuation or space 2 | window_size: 6 # size of the token window, average arabic word length is 5 3 | hamming_distance: 4 # similarity threshold out of 64 bits 4 | num_blocks: 6 # must be larger than the hamming_distance 5 | ignore_punctuation: true # ignore punctuation when hashing, cannot be true when punctuation is used for tokenization 6 | lowercase: true # lowercase the text when hashing 7 | text_column: "text" # column name for the text to be hashed 8 | index_column: "id" # column name for the index 9 | num_proc: 96 # number of processes to run when hashing 10 | load_dataset: 11 | path: null 12 | name: null 13 | split: null 14 | use_auth_token: false 15 | load_from_disk: 16 | path: "data/oscar_filtered_final/eu" 17 | gcs: null 18 | cache: "outputs/eu_cache" 19 | output: "outputs/eu" 20 | -------------------------------------------------------------------------------- /ac_dc/deduplicate/conf/self_deduplicate_fr.yaml: -------------------------------------------------------------------------------- 1 | tokenization: "character" # character, punctuation or space 2 | window_size: 6 # size of the token window, average arabic word length is 5 3 | hamming_distance: 4 # similarity threshold out of 64 bits 4 | num_blocks: 6 # must be larger than the hamming_distance 5 | ignore_punctuation: true # ignore punctuation when hashing, cannot be true when punctuation is used for tokenization 6 | lowercase: true # lowercase the text when 
hashing 7 | text_column: "text" # column name for the text to be hashed 8 | index_column: "id" # column name for the index 9 | num_proc: 96 # number of processes to run when hashing 10 | load_dataset: 11 | path: null 12 | name: null 13 | split: null 14 | use_auth_token: false 15 | load_from_disk: 16 | path: "data/oscar_filtered_final/fr" 17 | gcs: null 18 | cache: "outputs/fr_cache" 19 | output: "outputs/fr" 20 | -------------------------------------------------------------------------------- /ac_dc/deduplicate/conf/self_deduplicate_gl.yaml: -------------------------------------------------------------------------------- 1 | tokenization: "character" # character, punctuation or space 2 | window_size: 4 # size of the token window 3 | hamming_distance: 7 # similarity threshold out of 64 bits 4 | num_blocks: 8 # must be larger than the hamming_distance 5 | ignore_punctuation: true # ignore punctuation when hashing, cannot be true when punctuation is used for tokenization 6 | lowercase: true # lowercase the text when hashing 7 | text_column: "text" # column name for the text to be hashed 8 | index_column: "id" # column name for the index 9 | num_proc: 80 # number of processes to run when hashing 10 | load_dataset: 11 | path: "oscar-corpus/OSCAR-2109" 12 | name: "deduplicated_gl" 13 | split: "train" 14 | use_auth_token: true 15 | load_from_disk: 16 | path: null 17 | gcs: null 18 | cache: "outputs/gl_cache" 19 | output: "outputs/gl" 20 | -------------------------------------------------------------------------------- /ac_dc/deduplicate/conf/self_deduplicate_hi.yaml: -------------------------------------------------------------------------------- 1 | tokenization: "character" # character, punctuation or space 2 | window_size: 6 # size of the token window, average arabic word length is 5 3 | hamming_distance: 4 # similarity threshold out of 64 bits 4 | num_blocks: 6 # must be larger than the hamming_distance 5 | ignore_punctuation: true # ignore punctuation when hashing, cannot be true when punctuation is used for tokenization 6 | lowercase: true # lowercase the text when hashing 7 | text_column: "text" # column name for the text to be hashed 8 | index_column: "id" # column name for the index 9 | num_proc: 96 # number of processes to run when hashing 10 | load_dataset: 11 | path: null 12 | name: null 13 | split: null 14 | use_auth_token: false 15 | load_from_disk: 16 | path: "data/oscar_filtered_final/hi" 17 | gcs: null 18 | cache: "outputs/hi_cache" 19 | output: "outputs/hi" 20 | -------------------------------------------------------------------------------- /ac_dc/deduplicate/conf/self_deduplicate_id.yaml: -------------------------------------------------------------------------------- 1 | tokenization: "character" # character, punctuation or space 2 | window_size: 6 # size of the token window, average arabic word length is 5 3 | hamming_distance: 4 # similarity threshold out of 64 bits 4 | num_blocks: 6 # must be larger than the hamming_distance 5 | ignore_punctuation: true # ignore punctuation when hashing, cannot be true when punctuation is used for tokenization 6 | lowercase: true # lowercase the text when hashing 7 | text_column: "text" # column name for the text to be hashed 8 | index_column: "id" # column name for the index 9 | num_proc: 96 # number of processes to run when hashing 10 | load_dataset: 11 | path: null 12 | name: null 13 | split: null 14 | use_auth_token: false 15 | load_from_disk: 16 | path: "data/oscar_filtered_final/id" 17 | gcs: null 18 | cache: "outputs/id_cache" 19 | 
output: "outputs/id" 20 | -------------------------------------------------------------------------------- /ac_dc/deduplicate/conf/self_deduplicate_pt.yaml: -------------------------------------------------------------------------------- 1 | tokenization: "character" # character, punctuation or space 2 | window_size: 6 # size of the token window, average arabic word length is 5 3 | hamming_distance: 4 # similarity threshold out of 64 bits 4 | num_blocks: 6 # must be larger than the hamming_distance 5 | ignore_punctuation: true # ignore punctuation when hashing, cannot be true when punctuation is used for tokenization 6 | lowercase: true # lowercase the text when hashing 7 | text_column: "text" # column name for the text to be hashed 8 | index_column: "id" # column name for the index 9 | num_proc: 96 # number of processes to run when hashing 10 | load_dataset: 11 | path: null 12 | name: null 13 | split: null 14 | use_auth_token: false 15 | load_from_disk: 16 | path: "data/oscar_filtered_final/pt" 17 | gcs: null 18 | cache: "outputs/pt_cache" 19 | output: "outputs/pt" 20 | -------------------------------------------------------------------------------- /ac_dc/deduplicate/conf/self_deduplicate_ur.yaml: -------------------------------------------------------------------------------- 1 | tokenization: "character" # character, punctuation or space 2 | window_size: 6 # size of the token window, average arabic word length is 5 3 | hamming_distance: 4 # similarity threshold out of 64 bits 4 | num_blocks: 6 # must be larger than the hamming_distance 5 | ignore_punctuation: true # ignore punctuation when hashing, cannot be true when punctuation is used for tokenization 6 | lowercase: true # lowercase the text when hashing 7 | text_column: "text" # column name for the text to be hashed 8 | index_column: "id" # column name for the index 9 | num_proc: 96 # number of processes to run when hashing 10 | load_dataset: 11 | path: null 12 | name: null 13 | split: null 14 | use_auth_token: false 15 | load_from_disk: 16 | path: "data/oscar_filtered_final/ur" 17 | gcs: null 18 | cache: "outputs/ur_cache" 19 | output: "outputs/ur" 20 | -------------------------------------------------------------------------------- /ac_dc/deduplicate/conf/self_deduplicate_vi.yaml: -------------------------------------------------------------------------------- 1 | tokenization: "character" # character, punctuation or space 2 | window_size: 6 # size of the token window, average arabic word length is 5 3 | hamming_distance: 4 # similarity threshold out of 64 bits 4 | num_blocks: 6 # must be larger than the hamming_distance 5 | ignore_punctuation: true # ignore punctuation when hashing, cannot be true when punctuation is used for tokenization 6 | lowercase: true # lowercase the text when hashing 7 | text_column: "text" # column name for the text to be hashed 8 | index_column: "id" # column name for the index 9 | num_proc: 96 # number of processes to run when hashing 10 | load_dataset: 11 | path: null 12 | name: null 13 | split: null 14 | use_auth_token: false 15 | load_from_disk: 16 | path: "data/oscar_filtered_final/vi" 17 | gcs: null 18 | cache: "outputs/vi_cache" 19 | output: "outputs/vi" 20 | -------------------------------------------------------------------------------- /ac_dc/deduplicate/conf/self_deduplicate_zh.yaml: -------------------------------------------------------------------------------- 1 | tokenization: "character" # character, punctuation or space 2 | window_size: 6 # size of the token window, average arabic word 
length is 5 3 | hamming_distance: 4 # similarity threshold out of 64 bits 4 | num_blocks: 6 # must be larger than the hamming_distance 5 | ignore_punctuation: true # ignore punctuation when hashing, cannot be true when punctuation is used for tokenization 6 | lowercase: true # lowercase the text when hashing 7 | text_column: "text" # column name for the text to be hashed 8 | index_column: "id" # column name for the index 9 | num_proc: 96 # number of processes to run when hashing 10 | load_dataset: 11 | path: null 12 | name: null 13 | split: null 14 | use_auth_token: false 15 | load_from_disk: 16 | path: "data/oscar_filtered_final/zh" 17 | gcs: null 18 | cache: "outputs/zh_cache" 19 | output: "outputs/zh" 20 | -------------------------------------------------------------------------------- /ac_dc/deduplicate/deduplicate/__init__.py: -------------------------------------------------------------------------------- 1 | import regex as re 2 | 3 | PUNCTUATION_REGEX = re.compile(r"\p{P}") 4 | INTERNAL_HASH = "__dedup_hash__" 5 | -------------------------------------------------------------------------------- /ac_dc/deduplicate/deduplicate/util.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import numpy as np 4 | import simhash 5 | 6 | from . import INTERNAL_HASH, PUNCTUATION_REGEX 7 | 8 | 9 | def hashing( 10 | record, 11 | column: str = "text", 12 | tokenization: str = "character", 13 | window_size: int = 4, 14 | ignore_punctuation: bool = True, 15 | lowercase: bool = True, 16 | output: str = INTERNAL_HASH, 17 | ) -> Dict[str, int]: 18 | """Hashing a document with SimHash. 19 | 20 | Parameters 21 | ---------- 22 | record : [type] 23 | A dict of feature-value pairs 24 | column : str, optional 25 | The column name to use for hashing, by default "text" 26 | tokenization : str, optional 27 | Method to use for tokenization, by default "character" 28 | window_size : int, optional 29 | The size of the token window, by default 4 30 | ignore_punctuation : bool, optional 31 | To ignore punctuation or not, by default True 32 | lowercase : bool, optional 33 | To lowercase the text or not, by default True 34 | 35 | Returns 36 | ------- 37 | Dict[str, int] 38 | The new hash feature column 39 | 40 | Raises 41 | ------ 42 | Exception 43 | Unrecognized tokenization parameter 44 | """ 45 | document = record[column] 46 | if lowercase: 47 | document = document.lower() 48 | 49 | if ignore_punctuation: 50 | document = PUNCTUATION_REGEX.sub("", document) 51 | 52 | if tokenization == "character": 53 | tokens = [ 54 | str.encode(document[i : i + window_size]) 55 | for i in range(len(document) - window_size) 56 | ] 57 | elif tokenization == "punctuation": 58 | tokens = PUNCTUATION_REGEX.split(document) 59 | tokens = [ 60 | str.encode(" ".join(tokens[i : i + window_size])) 61 | for i in range(len(tokens) - window_size) 62 | ] 63 | elif tokenization == "space": 64 | tokens = document.split(" ") 65 | tokens = [ 66 | str.encode(" ".join(tokens[i : i + window_size])) 67 | for i in range(len(tokens) - window_size) 68 | ] 69 | else: 70 | raise Exception(f"Unrecognized tokenization parameter {tokenization}") 71 | 72 | return {output: np.uint64(simhash.compute(map(simhash.unsigned_hash, tokens)))} 73 | -------------------------------------------------------------------------------- /ac_dc/download_sentencepiece_kenlm_models.py: -------------------------------------------------------------------------------- 1 | """Download Sentencepiece and KenLM models for 
supported languages. 2 | 3 | Usage: 4 | python download_sentencepiece_kenlm_models.py --output_dir_path /tmp/ 5 | 6 | All Sentencepiece and KenLM language models will be saved under /tmp. 7 | """ 8 | 9 | import argparse 10 | import subprocess 11 | 12 | from languages_id import langs_id 13 | 14 | 15 | def download_sentencepiece_kenlm_models(output_dir_path: str) -> None: 16 | supported_sentencepiece_langs = langs_id["sentencepiece_id"].dropna().unique() 17 | for lang in supported_sentencepiece_langs: 18 | try: 19 | output_sentencepiece = subprocess.check_output( 20 | f"wget https://huggingface.co/edugp/kenlm/resolve/main/wikipedia/{lang}.sp.model -P {output_dir_path}", # http://dl.fbaipublicfiles.com/cc_net/lm/{lang}.sp.model for FB models 21 | shell=True, 22 | ) 23 | except: 24 | print( 25 | f"Warning: Download failed for Sentencepiece model for language {lang}." 26 | ) 27 | 28 | supported_kenlm_langs = langs_id["kenlm_id"].dropna().unique() 29 | for lang in supported_kenlm_langs: 30 | try: 31 | output_kenlm = subprocess.check_output( 32 | f"wget https://huggingface.co/edugp/kenlm/resolve/main/wikipedia/{lang}.arpa.bin -P {output_dir_path}", # http://dl.fbaipublicfiles.com/cc_net/lm/{lang}.arpa.bin for FB models 33 | shell=True, 34 | ) 35 | except: 36 | print(f"Warning: Download failed for KenLM model for language {lang}.") 37 | 38 | 39 | if __name__ == "__main__": 40 | parser = argparse.ArgumentParser( 41 | description="Download Sentencepiece and KenLM models for supported languages." 42 | ) 43 | parser.add_argument( 44 | "--output_dir_path", 45 | type=str, 46 | default="/tmp/", 47 | help="Output directory path to save models.", 48 | ) 49 | args = parser.parse_args() 50 | 51 | download_sentencepiece_kenlm_models(output_dir_path=args.output_dir_path) 52 | -------------------------------------------------------------------------------- /ac_dc/explanation_filtering_pipeline.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/ac_dc/explanation_filtering_pipeline.pdf -------------------------------------------------------------------------------- /ac_dc/normalization.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Dict 3 | 4 | 5 | non_printing_characters_re = re.compile( 6 | f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]" 7 | ) 8 | 9 | digits_re: re.Pattern = re.compile(r"\d") 10 | 11 | unicode_punctuation: Dict[str, str] = { 12 | ",": ",", 13 | "。": ".", 14 | "、": ",", 15 | "„": '"', 16 | "”": '"', 17 | "“": '"', 18 | "«": '"', 19 | "»": '"', 20 | "1": '"', 21 | "」": '"', 22 | "「": '"', 23 | "《": '"', 24 | "》": '"', 25 | "´": "'", 26 | "∶": ":", 27 | ":": ":", 28 | "?": "?", 29 | "!": "!", 30 | "(": "(", 31 | ")": ")", 32 | ";": ";", 33 | "–": "-", 34 | "—": " - ", 35 | ".": ". 
", 36 | "~": "~", 37 | "’": "'", 38 | "…": "...", 39 | "━": "-", 40 | "〈": "<", 41 | "〉": ">", 42 | "【": "[", 43 | "】": "]", 44 | "%": "%", 45 | "►": "-", 46 | } 47 | 48 | normalization = { 49 | "non_printing_characters_re": non_printing_characters_re, 50 | "digits_re": digits_re, 51 | "unicode_punctuation": unicode_punctuation, 52 | } 53 | -------------------------------------------------------------------------------- /ac_dc/person_and_id_anonymization.py: -------------------------------------------------------------------------------- 1 | from muliwai.regex_manager import detect_ner_with_regex_and_context 2 | from muliwai.pii_regexes_rulebase import regex_rulebase 3 | from muliwai.ner_manager import detect_ner_with_hf_model 4 | from muliwai.faker_manager import augment_anonymize 5 | 6 | 7 | def apply_anonymization( 8 | sentence: str, 9 | lang_id: str, 10 | context_window: int = 20, 11 | anonymize_condition=None, 12 | tag_type={ 13 | "IP_ADDRESS", 14 | "KEY", 15 | "ID", 16 | "PHONE", 17 | "USER", 18 | "EMAIL", 19 | "LICENSE_PLATE", 20 | "PERSON", 21 | }, 22 | device: str = "cpu", 23 | ) -> str: 24 | """ 25 | Params: 26 | ================== 27 | sentence: str, the sentence to be anonymized 28 | lang_id: str, the language id of the sentence 29 | context_window: int, the context window size 30 | anonymize_condition: function, the anonymization condition 31 | tag_type: iterable, the tag types of the anonymization. By default: {'IP_ADDRESS', 'KEY', 'ID', 'PHONE', 'USER', 'EMAIL', 'LICENSE_PLATE', 'PERSON'} 32 | device: cpu or cuda:{device_id} 33 | 34 | """ 35 | if tag_type == None: 36 | tag_type = regex_rulebase.keys() 37 | lang_id = lang_id.split("_")[0] 38 | ner_ids = detect_ner_with_regex_and_context( 39 | sentence=sentence, 40 | src_lang=lang_id, 41 | context_window=context_window, 42 | tag_type=tag_type, 43 | ) 44 | ner_persons = detect_ner_with_hf_model( 45 | sentence=sentence, 46 | src_lang=lang_id, 47 | device=device, 48 | ) 49 | ner = ner_ids + ner_persons 50 | if anonymize_condition: 51 | new_sentence, new_ner, _ = augment_anonymize( 52 | sentence, 53 | lang_id, 54 | ner, 55 | ) 56 | doc = { 57 | "text": new_sentence, 58 | "ner": new_ner, 59 | "orig_text": sentence, 60 | "orig_ner": ner, 61 | } 62 | else: 63 | new_sentence = sentence 64 | doc = {"text": new_sentence, "ner": ner} 65 | return new_sentence, doc 66 | -------------------------------------------------------------------------------- /ac_dc/test_anonymization.py: -------------------------------------------------------------------------------- 1 | import random 2 | from anonymization import apply_regex_anonymization 3 | from faker import Faker 4 | from num2words import num2words 5 | 6 | # We may need to include other test scenarios 7 | # Wherever possible, test with faker 8 | 9 | 10 | def main(): 11 | test_suite = {"English": test_en, "Chinese": test_zh} 12 | for language, test_func in test_suite.items(): 13 | print("Testing {}".format(language)) 14 | test_func() 15 | print("==================================================") 16 | 17 | 18 | def test_en(): 19 | fake = Faker("en_US") 20 | sentences = [ 21 | f"I am {num2words(random.randint(0,120))} years old, and she is {random.randint(0,120)} year-old", # Age 22 | f"Sherry lives at {fake.street_address()}", # Address 23 | f"My dad is a cancer fighter. Her grandma is suffering from alzheimer's", # Disease 24 | f"Let me tell you a secret, Mr. 
Nguyen's SSN is {fake.ssn() if random.choice([True, False]) else fake.ssn().replace('-', '')}.", # Government ID 25 | f"Dear Ian, the payment through {fake.credit_card_number()} has been successfully executed.", # Credit card 26 | ] 27 | for sentence in sentences: 28 | print( 29 | apply_regex_anonymization( 30 | sentence=sentence, lang_id="en", anonymize_condition=True 31 | ) 32 | ) 33 | 34 | 35 | def test_zh(): 36 | fake = Faker("zh_CN") 37 | sentences = [ 38 | f'我今年{num2words(random.randint(0,120), lang="ja")}歲, 而她去年{random.randint(0,120)}岁', # Age 39 | f"我住在{fake.street_address()}", # Address 40 | f"我爸是抗癌戰士。她奶奶有老人癡呆", # Disease 41 | f"李雪妮小姐331125198402010129", # Government ID 42 | f"先生,信用卡号{fake.credit_card_number()}已缴费成功", # Credit card 43 | ] 44 | for sentence in sentences: 45 | print( 46 | apply_regex_anonymization( 47 | sentence=sentence, lang_id="zh", anonymize_condition=True 48 | ) 49 | ) 50 | 51 | 52 | if __name__ == "__main__": 53 | main() 54 | -------------------------------------------------------------------------------- /ac_dc/visualization/README.md: -------------------------------------------------------------------------------- 1 | # Visualization tool 2 | 3 | Use this visualization tool online at https://huggingface.co/spaces/huggingface/text-data-filtering. 4 | 5 | However, running the code on your own computer is faster, can in practice handle up to three times more documents, and works for every language. 6 | 7 | 1) Use get_data_for_visualization.py to generate the JSON file gathering the examples and their computed statistics for the language you chose. 8 | It uses the streaming mode of the Datasets library, so there is no need to download the dataset, but you do have to download the fasttext model (for language identification) and the sentencepiece / kenlm models (for tokenization and perplexity). 9 | 10 | 2) Specify the path to this JSON file and to the fasttext / sentencepiece / kenlm models in visualization.py, then run the command "streamlit run ac_dc/visualization/visualization.py".
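As a rough illustration of what step 1 produces, the sketch below loads the generated JSON and lists the statistics available for plotting. The file name and the assumption that the file holds a list of per-document dicts with a "text" field are guesses for illustration only; check get_data_for_visualization.py for the actual layout.

```python
# Sketch only: the path and the JSON layout are assumptions, not guaranteed
# by get_data_for_visualization.py.
import json

with open("en_examples_with_stats.json") as f:  # hypothetical output file
    docs = json.load(f)

print(f"{len(docs)} documents loaded")
if docs and isinstance(docs[0], dict):
    # Report which per-document statistics were computed, besides the raw text.
    print("available statistics:", [key for key in docs[0] if key != "text"])
```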
11 | -------------------------------------------------------------------------------- /bertin/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "RobertaForMaskedLM" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "bos_token_id": 0, 7 | "eos_token_id": 2, 8 | "gradient_checkpointing": false, 9 | "hidden_act": "gelu", 10 | "hidden_dropout_prob": 0.1, 11 | "hidden_size": 768, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 3072, 14 | "layer_norm_eps": 1e-05, 15 | "max_position_embeddings": 514, 16 | "model_type": "roberta", 17 | "num_attention_heads": 12, 18 | "num_hidden_layers": 12, 19 | "pad_token_id": 1, 20 | "position_embedding_type": "absolute", 21 | "transformers_version": "4.9.0.dev0", 22 | "type_vocab_size": 1, 23 | "use_cache": true, 24 | "vocab_size": 50265 25 | } 26 | -------------------------------------------------------------------------------- /bertin/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from transformers import RobertaConfig 3 | 4 | config = RobertaConfig.from_pretrained("roberta-large") 5 | config.save_pretrained("./configs/large") 6 | 7 | config = RobertaConfig.from_pretrained("roberta-base") 8 | config.save_pretrained("./configs/base") 9 | -------------------------------------------------------------------------------- /bertin/configs/base/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "RobertaForMaskedLM" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "bos_token_id": 0, 7 | "eos_token_id": 2, 8 | "gradient_checkpointing": false, 9 | "hidden_act": "gelu", 10 | "hidden_dropout_prob": 0.1, 11 | "hidden_size": 768, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 3072, 14 | "layer_norm_eps": 1e-05, 15 | "max_position_embeddings": 514, 16 | "model_type": "roberta", 17 | "num_attention_heads": 12, 18 | "num_hidden_layers": 12, 19 | "pad_token_id": 1, 20 | "position_embedding_type": "absolute", 21 | "transformers_version": "4.9.0.dev0", 22 | "type_vocab_size": 1, 23 | "use_cache": true, 24 | "vocab_size": 50265 25 | } 26 | -------------------------------------------------------------------------------- /bertin/configs/large/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "RobertaForMaskedLM" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "bos_token_id": 0, 7 | "eos_token_id": 2, 8 | "gradient_checkpointing": false, 9 | "hidden_act": "gelu", 10 | "hidden_dropout_prob": 0.1, 11 | "hidden_size": 1024, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 4096, 14 | "layer_norm_eps": 1e-05, 15 | "max_position_embeddings": 514, 16 | "model_type": "roberta", 17 | "num_attention_heads": 16, 18 | "num_hidden_layers": 24, 19 | "pad_token_id": 1, 20 | "position_embedding_type": "absolute", 21 | "transformers_version": "4.9.0.dev0", 22 | "type_vocab_size": 1, 23 | "use_cache": true, 24 | "vocab_size": 50265 25 | } 26 | -------------------------------------------------------------------------------- /bertin/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import tempfile 3 | 4 | import jax 5 | from jax import numpy as jnp 6 | from transformers import AutoTokenizer, FlaxRobertaForMaskedLM, RobertaForMaskedLM 7 | 8 | 9 | def to_f32(t): 10 | return jax.tree_map( 11 | 
lambda x: x.astype(jnp.float32) if x.dtype == jnp.bfloat16 else x, t 12 | ) 13 | 14 | 15 | def main(): 16 | # Saving extra files from config.json and tokenizer.json files 17 | tokenizer = AutoTokenizer.from_pretrained("./") 18 | tokenizer.save_pretrained("./") 19 | 20 | # Temporary saving bfloat16 Flax model into float32 21 | tmp = tempfile.mkdtemp() 22 | flax_model = FlaxRobertaForMaskedLM.from_pretrained("./") 23 | flax_model.params = to_f32(flax_model.params) 24 | flax_model.save_pretrained(tmp) 25 | # Converting float32 Flax to PyTorch 26 | model = RobertaForMaskedLM.from_pretrained(tmp, from_flax=True) 27 | model.save_pretrained("./", save_config=False) 28 | 29 | 30 | if __name__ == "__main__": 31 | main() 32 | -------------------------------------------------------------------------------- /bertin/evaluation/paws.yaml: -------------------------------------------------------------------------------- 1 | name: BERTIN PAWS-X es 2 | project: bertin-eval 3 | enitity: versae 4 | program: run_glue.py 5 | command: 6 | - ${env} 7 | - ${interpreter} 8 | - ${program} 9 | - ${args} 10 | method: grid 11 | metric: 12 | name: eval/accuracy 13 | goal: maximize 14 | parameters: 15 | model_name_or_path: 16 | values: 17 | - bertin-project/bertin-base-gaussian-exp-512seqlen 18 | - bertin-project/bertin-base-stepwise-exp-512seqlen 19 | - bertin-project/bertin-base-random-exp-512seqlen 20 | - bertin-project/bertin-base-gaussian 21 | - bertin-project/bertin-base-stepwise 22 | - bertin-project/bertin-base-random 23 | - bertin-project/bertin-roberta-base-spanish 24 | - flax-community/bertin-roberta-large-spanish 25 | - BSC-TeMU/roberta-base-bne 26 | - dccuchile/bert-base-spanish-wwm-cased 27 | - bert-base-multilingual-cased 28 | num_train_epochs: 29 | values: [5] 30 | task_name: 31 | value: paws-x 32 | dataset_name: 33 | value: paws-x 34 | dataset_config_name: 35 | value: es 36 | output_dir: 37 | value: ./outputs 38 | overwrite_output_dir: 39 | value: true 40 | max_seq_length: 41 | value: 512 42 | pad_to_max_length: 43 | value: true 44 | per_device_train_batch_size: 45 | value: 16 46 | per_device_eval_batch_size: 47 | value: 16 48 | save_total_limit: 49 | value: 1 50 | do_train: 51 | value: true 52 | do_eval: 53 | value: true 54 | -------------------------------------------------------------------------------- /bertin/evaluation/token.yaml: -------------------------------------------------------------------------------- 1 | name: BERTIN NER and POS es 2 | project: bertin-eval 3 | enitity: versae 4 | program: run_ner.py 5 | command: 6 | - ${env} 7 | - ${interpreter} 8 | - ${program} 9 | - ${args} 10 | method: grid 11 | metric: 12 | name: eval/accuracy 13 | goal: maximize 14 | parameters: 15 | model_name_or_path: 16 | values: 17 | - bertin-project/bertin-base-gaussian-exp-512seqlen 18 | - bertin-project/bertin-base-stepwise-exp-512seqlen 19 | - bertin-project/bertin-base-random-exp-512seqlen 20 | - bertin-project/bertin-base-gaussian 21 | - bertin-project/bertin-base-stepwise 22 | - bertin-project/bertin-base-random 23 | - bertin-project/bertin-roberta-base-spanish 24 | - flax-community/bertin-roberta-large-spanish 25 | - BSC-TeMU/roberta-base-bne 26 | - dccuchile/bert-base-spanish-wwm-cased 27 | - bert-base-multilingual-cased 28 | num_train_epochs: 29 | values: [5] 30 | task_name: 31 | values: 32 | - ner 33 | - pos 34 | dataset_name: 35 | value: conll2002 36 | dataset_config_name: 37 | value: es 38 | output_dir: 39 | value: ./outputs 40 | overwrite_output_dir: 41 | value: true 42 | pad_to_max_length: 43 | 
value: true 44 | per_device_train_batch_size: 45 | value: 16 46 | per_device_eval_batch_size: 47 | value: 16 48 | save_total_limit: 49 | value: 1 50 | do_train: 51 | value: true 52 | do_eval: 53 | value: true 54 | -------------------------------------------------------------------------------- /bertin/evaluation/xnli.yaml: -------------------------------------------------------------------------------- 1 | name: BERTIN XNLI es 2 | project: bertin-eval 3 | enitity: versae 4 | program: run_glue.py 5 | command: 6 | - ${env} 7 | - ${interpreter} 8 | - ${program} 9 | - ${args} 10 | method: grid 11 | metric: 12 | name: eval/accuracy 13 | goal: maximize 14 | parameters: 15 | model_name_or_path: 16 | values: 17 | - bertin-project/bertin-base-gaussian-exp-512seqlen 18 | - bertin-project/bertin-base-stepwise-exp-512seqlen 19 | - bertin-project/bertin-base-random-exp-512seqlen 20 | - bertin-project/bertin-base-gaussian 21 | - bertin-project/bertin-base-stepwise 22 | - bertin-project/bertin-base-random 23 | - bertin-project/bertin-roberta-base-spanish 24 | - flax-community/bertin-roberta-large-spanish 25 | - BSC-TeMU/roberta-base-bne 26 | - dccuchile/bert-base-spanish-wwm-cased 27 | - bert-base-multilingual-cased 28 | num_train_epochs: 29 | values: [5] 30 | task_name: 31 | value: xnli 32 | dataset_name: 33 | value: xnli 34 | dataset_config_name: 35 | value: es 36 | output_dir: 37 | value: ./outputs 38 | overwrite_output_dir: 39 | value: true 40 | max_seq_length: 41 | value: 512 42 | pad_to_max_length: 43 | value: true 44 | per_device_train_batch_size: 45 | value: 16 46 | per_device_eval_batch_size: 47 | value: 16 48 | save_total_limit: 49 | value: 1 50 | do_train: 51 | value: true 52 | do_eval: 53 | value: true 54 | -------------------------------------------------------------------------------- /bertin/events.out.tfevents.1625704081.t1v-n-a4d97d44-w-0.212075.3.v2: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6a6ce71bd4a3fdcb18c10bd9d140b27e746c14e9ee70a7a3faf4eedbccde1d6e 3 | size 40 4 | -------------------------------------------------------------------------------- /bertin/events.out.tfevents.1625704245.t1v-n-a4d97d44-w-0.216676.3.v2: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2a6926c79cb2c1941fcfe69d7b73797c15dab60e5e6f16cc6c61bd9b79a9063d 3 | size 40 4 | -------------------------------------------------------------------------------- /bertin/events.out.tfevents.1625705283.t1v-n-a4d97d44-w-0.234462.3.v2: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:737d1e6666fe1c9fd6dd93728666199f1a8b0b213b071bdf7b3ecd77dd58f8c1 3 | size 40 4 | -------------------------------------------------------------------------------- /bertin/get_embeddings_and_perplexity.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import kenlm 3 | import numpy as np 4 | import pandas as pd 5 | from datasets import load_dataset 6 | from sentence_transformers import SentenceTransformer 7 | from tqdm import tqdm 8 | 9 | TOTAL_SENTENCES = 20000 10 | 11 | 12 | def pp(log_score, length): 13 | return 10.0 ** (-log_score / length) 14 | 15 | 16 | embedder = "distiluse-base-multilingual-cased-v1" 17 | embedder_model = SentenceTransformer(embedder) 18 | embedding_shape = 
embedder_model.encode(["foo"])[0].shape[0] 19 | # http://dl.fbaipublicfiles.com/cc_net/lm/es.arpa.bin 20 | model = kenlm.Model("es.arpa.bin") 21 | mc4 = load_dataset("mc4", "es", streaming=True) 22 | count = 0 23 | embeddings = [] 24 | lenghts = [] 25 | perplexities = [] 26 | sentences = [] 27 | 28 | for sample in tqdm(mc4["train"].shuffle(buffer_size=100_000), total=416057992): 29 | lines = sample["text"].split("\n") 30 | for line in lines: 31 | count += 1 32 | log_score = model.score(line) 33 | length = len(line.split()) + 1 34 | embedding = embedder_model.encode([line])[0] 35 | embeddings.append(embedding.tolist()) 36 | perplexities.append(pp(log_score, length)) 37 | lenghts.append(length) 38 | sentences.append(line) 39 | if count == TOTAL_SENTENCES: 40 | break 41 | if count == TOTAL_SENTENCES: 42 | embeddings = np.array(embeddings) 43 | df = pd.DataFrame( 44 | {"sentence": sentences, "length": lenghts, "perplexity": perplexities} 45 | ) 46 | for dim in range(embedding_shape): 47 | df[f"dim_{dim}"] = embeddings[:, dim] 48 | df.to_csv("mc4-es-perplexity-sentences.tsv", index=None, sep="\t") 49 | print("DONE!") 50 | break 51 | -------------------------------------------------------------------------------- /bertin/images/bertin-tilt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/bertin/images/bertin-tilt.png -------------------------------------------------------------------------------- /bertin/images/bertin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/bertin/images/bertin.png -------------------------------------------------------------------------------- /bertin/images/ccnet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/bertin/images/ccnet.png -------------------------------------------------------------------------------- /bertin/images/datasets-perp-20-120.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/bertin/images/datasets-perp-20-120.png -------------------------------------------------------------------------------- /bertin/images/datasets-perp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/bertin/images/datasets-perp.png -------------------------------------------------------------------------------- /bertin/images/datasets-random-comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/bertin/images/datasets-random-comparison.png -------------------------------------------------------------------------------- /bertin/images/datasets-wsize.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/bertin/images/datasets-wsize.png 
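The pp() helper in get_embeddings_and_perplexity.py above (and again in perplexity.py below) turns a KenLM base-10 log score into a perplexity via pp = 10 ** (-log_score / length). A tiny worked example with invented numbers:

```python
# Worked example of the perplexity formula; the log score and token count
# are made up, not taken from any dataset in this repository.
def pp(log_score, length):
    return 10.0 ** (-log_score / length)


doc_log_score = -120.0  # sum of KenLM log10 scores over a document's lines
doc_length = 51  # total token count, including the +1 per line as in the scripts
print(round(pp(doc_log_score, doc_length), 1))  # ≈ 225.4; lower means the LM finds the text more fluent
```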
-------------------------------------------------------------------------------- /bertin/images/perp-p95.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/bertin/images/perp-p95.png -------------------------------------------------------------------------------- /bertin/images/perp-resample-gaussian.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/bertin/images/perp-resample-gaussian.png -------------------------------------------------------------------------------- /bertin/images/perp-resample-stepwise.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/bertin/images/perp-resample-stepwise.png -------------------------------------------------------------------------------- /bertin/images/random_512.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/bertin/images/random_512.jpg -------------------------------------------------------------------------------- /bertin/mc4/dummy/af/0.0.0/dummy_data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/bertin/mc4/dummy/af/0.0.0/dummy_data.zip -------------------------------------------------------------------------------- /bertin/perplexity.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import kenlm 3 | from datasets import load_dataset 4 | from tqdm import tqdm 5 | 6 | 7 | def pp(log_score, length): 8 | return 10.0 ** (-log_score / length) 9 | 10 | 11 | # http://dl.fbaipublicfiles.com/cc_net/lm/es.arpa.bin 12 | model = kenlm.Model("es.arpa.bin") 13 | mc4 = load_dataset("mc4", "es", streaming=True) 14 | with open("mc4-es-perplexity.txt", "w") as f: 15 | for sample in tqdm(mc4["train"].shuffle(buffer_size=100_000), total=416057992): 16 | lines = sample["text"].split("\n") 17 | doc_log_score, doc_length = 0, 0 18 | for line in lines: 19 | log_score = model.score(line) 20 | length = len(line.split()) + 1 21 | doc_log_score += log_score 22 | doc_length += length 23 | f.write(f"{pp(doc_log_score, doc_length)}\n") 24 | -------------------------------------------------------------------------------- /bertin/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # From https://arxiv.org/pdf/1907.11692.pdf 3 | python -c "import jax; print('TPUs', jax.device_count())" 4 | ./run_mlm_flax.py \ 5 | --output_dir="./outputs" \ 6 | --model_type="roberta" \ 7 | --config_name="./configs/large" \ 8 | --tokenizer_name="./" \ 9 | --dataset_name="mc4" \ 10 | --dataset_config_name="es" \ 11 | --dataset_streamnig \ 12 | --max_seq_length="128" \ 13 | --pad_to_max_length \ 14 | --per_device_train_batch_size="128" \ 15 | --per_device_eval_batch_size="128" \ 16 | --adam_beta1="0.9" \ 17 | --adam_beta2="0.98" \ 18 | --adam_epsilon="1e-6" \ 19 | --learning_rate="4e-4" \ 20 | --weight_decay="0.01" \ 21 | --save_strategy="steps" \ 22 | 
--save_steps="10000" \ 23 | --save_total_limit="5" \ 24 | --warmup_steps="30000" \ 25 | --overwrite_output_dir \ 26 | --num_train_steps="500000" \ 27 | --eval_steps="10000" \ 28 | --logging_steps="500" \ 29 | --dtype="bfloat16" 2>&1 | tee run.log 30 | -------------------------------------------------------------------------------- /bertin/run_stream.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # From https://arxiv.org/pdf/1907.11692.pdf for base model 3 | python -c "import jax; print('TPUs', jax.device_count())" 4 | python ./run_mlm_flax_stream.py \ 5 | --output_dir="./outputs" \ 6 | --model_type="roberta" \ 7 | --config_name="./configs/base" \ 8 | --tokenizer_name="./configs/base" \ 9 | --dataset_name="./mc4" \ 10 | --dataset_config_name="es" \ 11 | --train_file="path/to/mc4-es-train-50M-XXX.jsonl" \ 12 | --max_seq_length="128" \ 13 | --pad_to_max_length \ 14 | --per_device_train_batch_size="256" \ 15 | --per_device_eval_batch_size="256" \ 16 | --adam_beta1="0.9" \ 17 | --adam_beta2="0.98" \ 18 | --adam_epsilon="1e-6" \ 19 | --learning_rate="6e-4" \ 20 | --weight_decay="0.01" \ 21 | --save_steps="10000" \ 22 | --save_total_limit="5" \ 23 | --warmup_steps="24000" \ 24 | --overwrite_output_dir \ 25 | --num_train_steps="250000" \ 26 | --eval_steps="10000" \ 27 | --dtype="bfloat16" \ 28 | --logging_steps="500" 2>&1 | tee run_stream.log 29 | -------------------------------------------------------------------------------- /bertin/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | {"bos_token": "", "eos_token": "", "unk_token": "", "sep_token": "", "pad_token": "", "cls_token": "", "mask_token": {"content": "", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}} 2 | -------------------------------------------------------------------------------- /bertin/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | {"unk_token": "", "bos_token": "", "eos_token": "", "add_prefix_space": false, "errors": "replace", "sep_token": "", "cls_token": "", "pad_token": "", "mask_token": "", "special_tokens_map_file": null, "name_or_path": "./", "tokenizer_class": "RobertaTokenizer"} 2 | -------------------------------------------------------------------------------- /bertin/tokens.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from datasets import load_dataset 3 | from tokenizers import ByteLevelBPETokenizer 4 | 5 | # Load dataset 6 | dataset = load_dataset("oscar", "unshuffled_deduplicated_es", split="train[:5000000]") 7 | 8 | # Instantiate tokenizer 9 | tokenizer = ByteLevelBPETokenizer() 10 | 11 | 12 | def batch_iterator(batch_size=100_000): 13 | for i in range(0, len(dataset), batch_size): 14 | yield dataset["text"][i : i + batch_size] 15 | 16 | 17 | # Customized training 18 | tokenizer.train_from_iterator( 19 | batch_iterator(), 20 | vocab_size=50265, 21 | min_frequency=2, 22 | special_tokens=[ 23 | "", 24 | "", 25 | "", 26 | "", 27 | "", 28 | ], 29 | ) 30 | # Save files to disk 31 | tokenizer.save("./tokenizer.json") 32 | -------------------------------------------------------------------------------- /bertin/tokens.py.orig: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from datasets import load_dataset 3 | from tokenizers import ByteLevelBPETokenizer 4 | 5 | # Load dataset 6 | 
<<<<<<< HEAD 7 | dataset = load_dataset("oscar", "unshuffled_deduplicated_es", split="train[:5000000]") 8 | 9 | # Instantiate tokenizer 10 | tokenizer = ByteLevelBPETokenizer() 11 | def batch_iterator(batch_size=100_000): 12 | ======= 13 | dataset = load_dataset("oscar", "unshuffled_deduplicated_es", split="train") 14 | 15 | # Instantiate tokenizer 16 | tokenizer = ByteLevelBPETokenizer() 17 | def batch_iterator(batch_size=1_000_000): 18 | >>>>>>> d5cede47e74aa6ec36f20acf5aba37c6734c6186 19 | for i in range(0, len(dataset), batch_size): 20 | yield dataset["text"][i: i + batch_size] 21 | 22 | # Customized training 23 | tokenizer.train_from_iterator(batch_iterator(), vocab_size=50265, min_frequency=2, special_tokens=[ 24 | "", 25 | "", 26 | "", 27 | "", 28 | "", 29 | ]) 30 | # Save files to disk 31 | tokenizer.save("./tokenizer.json") 32 | -------------------------------------------------------------------------------- /bertin/utils/dataset_perplexity.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import kenlm 4 | from tqdm import tqdm 5 | 6 | model = kenlm.Model("../es.arpa.bin") 7 | 8 | 9 | def get_perplexity(doc): 10 | doc_log_score, doc_length = 0, 0 11 | for line in doc.split("\n"): 12 | log_score = model.score(line) 13 | length = len(line.split()) + 1 14 | doc_log_score += log_score 15 | doc_length += length 16 | return 10.0 ** (-doc_log_score / doc_length) 17 | 18 | 19 | with open("mc4-es-train-50M-stats.csv", "w") as csv: 20 | with open("mc4-es-train-50M-steps.jsonl", "r") as data: 21 | for line in tqdm(data): 22 | text = json.loads(line)["text"] 23 | csv.write(f"{len(text.split())},{get_perplexity(text)}\n") 24 | -------------------------------------------------------------------------------- /bertin/utils/download_mc4es_sampled.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import io 3 | import json 4 | import sys 5 | 6 | import requests 7 | from tqdm import tqdm 8 | 9 | _DATA_URL_TRAIN = "https://huggingface.co/datasets/bertin-project/mc4-es-sampled/resolve/main/mc4-es-train-50M-{config}-shard-{index:04d}-of-{n_shards:04d}.json.gz" 10 | 11 | 12 | def main(config="stepwise"): 13 | data_urls = [ 14 | _DATA_URL_TRAIN.format( 15 | config=config, 16 | index=index + 1, 17 | n_shards=1024, 18 | ) 19 | for index in range(1024) 20 | ] 21 | with open(f"mc4-es-train-50M-{config}.jsonl", "w") as f: 22 | for dara_url in tqdm(data_urls): 23 | response = requests.get(dara_url) 24 | bio = io.BytesIO(response.content) 25 | with gzip.open(bio, "rt", encoding="utf8") as g: 26 | for line in g: 27 | json_line = json.loads(line.strip()) 28 | f.write(json.dumps(json_line) + "\n") 29 | 30 | 31 | if __name__ == "__main__": 32 | main(sys.argv[1]) 33 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/get_stats.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from argparse import ArgumentParser 3 | from pathlib import Path 4 | 5 | from datasets import concatenate_datasets, load_dataset, load_from_disk 6 | from datasets.utils.logging import set_verbosity_info 7 | 8 | set_verbosity_info() 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | def get_args(): 13 | parser = ArgumentParser() 14 | parser.add_argument("--dataset-path", type=str, required=True, help="Dataset path.") 15 | args = parser.parse_args() 16 | 17 | args.dataset_path = Path(args.dataset_path) 18 | return args 19 | 20 
| 21 | def load_others(dataset_path: Path): 22 | others_path = dataset_path / "others" 23 | shards = [ 24 | load_from_disk(str(shard_path.absolute())) 25 | for shard_path in sorted(others_path.iterdir()) 26 | ] 27 | return concatenate_datasets(shards) 28 | 29 | 30 | def main(): 31 | # Setup logging 32 | logging.basicConfig( 33 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 34 | datefmt="%m/%d/%Y %H:%M:%S", 35 | level=logging.INFO, 36 | ) 37 | args = get_args() 38 | logger.info( 39 | f"** The job is runned with the following arguments: **\n{args}\n **** " 40 | ) 41 | 42 | others = load_others(args.dataset_path) 43 | features = others.features.copy() 44 | features.pop("compressed_warc") 45 | text_htmls = load_dataset( 46 | str((args.dataset_path / "text__html").absolute()), 47 | data_files="**.jsonl.gz", 48 | features=features, 49 | split="train", 50 | ) 51 | 52 | logger.info(f"Text/html: {len(text_htmls)}") 53 | logger.info(f"Others: {len(others)}") 54 | 55 | 56 | if __name__ == "__main__": 57 | main() 58 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/language_annotation/python_scripts/check_wrong_files.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | bug_features = 0 4 | bug_pyarrow = 0 5 | bug_segmentation = 0 6 | bug_oom = 0 7 | bug_other = 0 8 | 9 | directory = "/Users/hugolaurencon/Desktop/HF/Code/clean_crawl/annotate_langid_crawl" 10 | for filename in os.listdir(directory): 11 | f = os.path.join(directory, filename) 12 | if os.path.isfile(f): 13 | with open(f, encoding="utf8", errors="ignore") as file: 14 | # file = open(f, 'rb') 15 | txt = file.read() 16 | # file.close() 17 | if ( 18 | "FileNotFoundError: Unable to resolve any data file that matches" in txt 19 | ) or ("Shard successfully saved" in txt): 20 | os.remove(f) 21 | elif ( 22 | "ValueError: Please pass `features` or at least one example when writing data" 23 | in txt 24 | ): 25 | bug_features += 1 26 | elif "Segmentation fault (core dumped) python" in txt: 27 | bug_segmentation += 1 28 | elif "slurmstepd: error: Detected 1 oom-kill event(s)" in txt: 29 | bug_oom += 1 30 | elif "pyarrow.lib.ArrowNotImplementedError:" in txt: 31 | bug_pyarrow += 1 32 | else: 33 | bug_other += 1 34 | print(f) 35 | 36 | print("bug_features:", bug_features) 37 | print("bug_pyarrow:", bug_pyarrow) 38 | print("bug_segmentation :", bug_segmentation) 39 | print("bug_oom:", bug_oom) 40 | print("bug_other:", bug_other) 41 | print("Tot bug:", bug_features + bug_pyarrow + bug_segmentation + bug_oom + bug_other) 42 | print("Num files:", len(os.listdir(directory))) 43 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/language_annotation/python_scripts/compute_stats_langid.py: -------------------------------------------------------------------------------- 1 | from datasets import load_from_disk 2 | 3 | from multiprocessing import cpu_count 4 | 5 | import json 6 | 7 | # import random # A DECOMMENTER 8 | 9 | dataset = load_from_disk( 10 | "/gpfsscratch/rech/six/urd43gx/crawl/annotated_langid_crawl" 11 | ) # "/Users/hugolaurencon/Desktop/HF/Code/dataset_filtered/af/" 12 | dataset = dataset["train"] # A COMMENTER 13 | print("Dataset loaded") 14 | 15 | dataset = dataset.map( 16 | lambda example: { 17 | "pred_lang": example["fasttext_pred"]["lang_pred_fasttext_id"], 18 | "len_text": len(example["text"]), 19 | }, # random.choice(["A", "B", "C"]) 20 | remove_columns=dataset.column_names, 
21 | num_proc=cpu_count(), 22 | ) 23 | 24 | stats_langid = {} 25 | for i in range(dataset.num_rows): 26 | pred_lang = dataset[i]["pred_lang"] 27 | len_text = dataset[i]["len_text"] 28 | stats_langid[pred_lang] = stats_langid.get(pred_lang, 0) + len_text 29 | 30 | f = open( 31 | "/gpfswork/rech/six/urd43gx/code/filtering_crawl/compute_stats_langid/stats_langid.json", 32 | "w", 33 | ) # "myfile.json" 34 | json.dump(stats_langid, f) 35 | f.close() 36 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/language_annotation/slurm_scripts/02_detect_html_lang_attrib.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=pseudo_crawl_extract_lang_tag 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 5 | #SBATCH --cpus-per-task=40 # number of cores per tasks 6 | #SBATCH --hint=nomultithread # we get physical cores not logical 7 | #SBATCH --partition=cpu_p1 8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) 9 | #SBATCH --output=/gpfsscratch/rech/six/commun/pseudo_crawl/seeds_batch_2/logs/extract_lang_tag_V5/%x-%j.out # output file name #TODO change path if necessary 10 | #SBATCH --array=35,341,297 11 | #SBATCH --account=six@cpu 12 | 13 | set -x -e 14 | 15 | source $six_ALL_CCFRWORK/start-prod 16 | conda activate thomas_data_tooling 17 | 18 | DATA_TOOLING_REPO=$WORK/repos/sync_data_tooling/data_tooling 19 | 20 | DATASET_PATH=/gpfsscratch/rech/six/urd43gx/crawl/shards/shard_"$SLURM_ARRAY_TASK_ID" 21 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/fasttext_annotation/seeds_batch_1/datasets-lang-annotation/bigscience-catalogue-data 22 | SAVE_DATASET_PATH=$SAVE_DATASET_DIR/seed_id="$SLURM_ARRAY_TASK_ID" 23 | echo $DATASET_PATH 24 | pushd $DATA_TOOLING_REPO 25 | 26 | mkdir -p $SAVE_DATASET_DIR 27 | 28 | export HF_DATASETS_OFFLINE=1 29 | export HF_DATASETS_CACHE=$SCRATCH/to_delete 30 | 31 | python -m cc_pseudo_crawl.language_annotation.python_scripts.detect_html_lang_attrib \ 32 | --dataset-path $DATASET_PATH \ 33 | --num-proc 40 \ 34 | --save-path $SAVE_DATASET_PATH \ 35 | --use-datasets-caching 36 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/language_annotation/slurm_scripts/job_annotate_langid_crawl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=six@cpu 3 | #SBATCH --job-name=annotate_langid_crawl 4 | #SBATCH --partition=cpu_p1 5 | #SBATCH --cpus-per-task=1 6 | #SBATCH --output=res%A_%a 7 | #SBATCH --time=20:00:00 8 | 9 | echo "Running job on $hostname" 10 | 11 | # load conda environment 12 | source $six_ALL_CCFRWORK/start-prod 13 | conda activate hugo 14 | 15 | python /gpfswork/rech/six/urd43gx/code/filtering_crawl/annotate_langid_crawl/annotate_langid_crawl.py ${SLURM_ARRAY_TASK_ID} 16 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/python_scripts/check_erros_in_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | from argparse import ArgumentParser 4 | 5 | from datasets import load_from_disk 6 | from datasets.utils.logging import set_verbosity_info 7 | 8 | set_verbosity_info() 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | def get_args(): 13 | parser = ArgumentParser() 14 | parser.add_argument("--dataset-dir", type=str, required=True, help="Dataset name.") 
15 | 16 | args = parser.parse_args() 17 | return args 18 | 19 | 20 | def main(): 21 | # Setup logging 22 | logging.basicConfig( 23 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 24 | datefmt="%m/%d/%Y %H:%M:%S", 25 | level=logging.INFO, 26 | ) 27 | args = get_args() 28 | logger.info( 29 | f"** The job is runned with the following arguments: **\n{args}\n **** " 30 | ) 31 | 32 | for dataset_name in os.listdir(args.dataset_dir): 33 | dataset_path = os.path.join(args.dataset_dir, dataset_name) 34 | try: 35 | logging.info(f"Processing: {dataset_path}") 36 | ds = load_from_disk(dataset_path) 37 | new_ds = ds.filter(keep_failed_examples) 38 | logging.info(f"Here's the subset of failed downloads: {new_ds}") 39 | except Exception as e: 40 | logging.warning(f"Failed to process {dataset_path} with error '{str(e)}'") 41 | 42 | 43 | def keep_failed_examples(example): 44 | if example["download_exception"] is None: 45 | return False 46 | return True 47 | 48 | 49 | if __name__ == "__main__": 50 | main() 51 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/python_scripts/divide_in_shards.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import subprocess 4 | from argparse import ArgumentParser 5 | from pathlib import Path 6 | import sys 7 | 8 | from datasets import load_from_disk 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | # For `soup.decode_content` that can hit the limit 13 | sys.setrecursionlimit(10000) 14 | 15 | 16 | def get_args(): 17 | parser = ArgumentParser() 18 | parser.add_argument( 19 | "--dataset-path", 20 | type=str, 21 | required=True, 22 | help="path to the parquet dataset folder", 23 | ) 24 | parser.add_argument("--save-dir", type=str, help="Where to save the datasets.") 25 | parser.add_argument("--num-shards", type=int, help="Total number of shards.") 26 | args = parser.parse_args() 27 | 28 | return args 29 | 30 | 31 | def main(): 32 | # Setup logging 33 | logging.basicConfig( 34 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 35 | datefmt="%m/%d/%Y %H:%M:%S", 36 | level=logging.INFO, 37 | ) 38 | args = get_args() 39 | logger.info( 40 | f"** The job is runned with the following arguments: **\n{args}\n **** " 41 | ) 42 | 43 | ds = load_from_disk(args.dataset_path) 44 | 45 | dataset_path = Path(args.dataset_path) 46 | 47 | for shard_id in range(args.num_shards): 48 | file_name_init = dataset_path.name 49 | dataset_name, shard_id_init, num_shards_init = file_name_init.split("--") 50 | 51 | shard_id_new = int(shard_id_init) * args.num_shards + shard_id 52 | total_num_shard = int(num_shards_init) * args.num_shards 53 | shard_name = f"{dataset_name}--{shard_id_new}--{total_num_shard}" 54 | save_path = Path(os.path.join(args.save_dir, shard_name)) 55 | sub_ds = ds.shard(num_shards=args.num_shards, index=shard_id) 56 | 57 | save_path_tmp = f"{str(save_path.absolute())}.tmp" 58 | logger.info(f"Saving the dataset at {save_path_tmp}") 59 | sub_ds.save_to_disk(save_path_tmp) 60 | logger.info(f"Moving the saved dataset to {str(save_path.absolute())}") 61 | subprocess.run(["mv", save_path_tmp, str(save_path.absolute())]) 62 | 63 | 64 | if __name__ == "__main__": 65 | main() 66 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/python_scripts/extract_text/requirements.txt: -------------------------------------------------------------------------------- 1 | 
git+git@github.com:bigscience-workshop/metadata.git 2 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/python_scripts/load_all_seed_ids.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from argparse import ArgumentParser 3 | 4 | 5 | def get_args(): 6 | parser = ArgumentParser() 7 | parser.add_argument( 8 | "--seed-paths", 9 | type=lambda x: x.split(","), 10 | required=True, 11 | help="Seed full path. e.g. 'xxx/seeds.csv'", 12 | ) 13 | parser.add_argument("--seed-index", type=int, required=True, help="Seed index.") 14 | args = parser.parse_args() 15 | 16 | return args 17 | 18 | 19 | def main(): 20 | args = get_args() 21 | 22 | seed_ids = [] 23 | for seed_path in args.seed_paths: 24 | with open(seed_path, "r") as fi: 25 | data = csv.reader(fi) 26 | # First line is all the headers that we remove. 27 | seed_ids += [row[0] for row_id, row in enumerate(data) if row_id > 0] 28 | print(seed_ids[args.seed_index]) 29 | 30 | 31 | if __name__ == "__main__": 32 | main() 33 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/python_scripts/redownload_warc.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from argparse import ArgumentParser 3 | from pathlib import Path 4 | 5 | import datasets 6 | from datasets import config, load_from_disk 7 | from datasets.utils.logging import set_verbosity_info 8 | 9 | from .download_warc import download_warcs 10 | 11 | set_verbosity_info() 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def get_args(): 16 | parser = ArgumentParser() 17 | parser.add_argument("--dataset-path", type=str, required=True, help="Dataset name.") 18 | parser.add_argument("--num-proc", type=int, required=True, help="Dataset name.") 19 | parser.add_argument("--save-path", type=str, help="Where to save the datasets.") 20 | parser.add_argument("--use-datasets-caching", action="store_true") 21 | 22 | args = parser.parse_args() 23 | return args 24 | 25 | 26 | def main(): 27 | # Setup logging 28 | logging.basicConfig( 29 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 30 | datefmt="%m/%d/%Y %H:%M:%S", 31 | level=logging.INFO, 32 | ) 33 | args = get_args() 34 | logger.info( 35 | f"** The job is runned with the following arguments: **\n{args}\n **** " 36 | ) 37 | 38 | if not args.use_datasets_caching: 39 | datasets.set_caching_enabled(False) 40 | else: 41 | logger.info( 42 | f"the datasets results will be cached at {config.HF_DATASETS_CACHE}." 
43 | ) 44 | 45 | ds = load_from_disk(args.dataset_path) 46 | 47 | if args.save_path: 48 | save_path = Path(args.save_path) 49 | else: 50 | save_path = Path(args.dataset_path) 51 | 52 | download_warcs(ds, save_path, num_proc=args.num_proc) 53 | 54 | 55 | if __name__ == "__main__": 56 | main() 57 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/python_scripts/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3 2 | bs4 3 | datasets 4 | pyathena 5 | surt 6 | tldextract 7 | warcio 8 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/.gitignore: -------------------------------------------------------------------------------- 1 | sourcing_sheet_seeds/seeds.gz.parquet 2 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/DEPTH.md: -------------------------------------------------------------------------------- 1 | ## Strategy to get depth 1 2 | 3 | ### Context 4 | 5 | Once we've extracted all the seed pages, we plan to run a pseudo crawl. The idea is simple: 6 | - we extract the outgoing urls from those pages. 7 | - we find the most recent record in CC matching each url (if it exists). 8 | - we run the entire processing for all the new records/pages. 9 | - we update `outgoing_urls` to obtain `outgoing_ids`. 10 | 11 | ### Process 12 | 13 | - 1) Make the Athena query (see the commented sketch below). 14 | - 2) Preprocess the dataset to: load_warc, obtain pdf_urls, extract external_urls. 15 | - 3) Build a new query with all `external_urls`. 16 | - 4) Repeat 1-3 until reaching the depth we want. 17 | - 5) Run `finalise.py` to: generate ids and generate `external_ids` that map to rows inside the dataset. 18 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/slurm_scripts/check_errors_in_dataset.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=pseudo_crawl_check_erros_in_dataset 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
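# DEPTH.md above describes looking up, for every outgoing URL, the most recent matching
# capture in the Common Crawl index via Athena. A hypothetical sketch of such a query,
# assuming the standard ccindex.ccindex table layout and an auxiliary outgoing_urls table
# that does not exist in this repository:
#
#   SELECT url, warc_filename, warc_record_offset, warc_record_length, fetch_time
#   FROM (
#       SELECT cc.url, cc.warc_filename, cc.warc_record_offset, cc.warc_record_length, cc.fetch_time,
#              ROW_NUMBER() OVER (PARTITION BY cc.url ORDER BY cc.fetch_time DESC) AS rk
#       FROM "ccindex"."ccindex" AS cc
#       JOIN outgoing_urls AS o ON cc.url = o.url
#       WHERE cc.subset = 'warc'
#   ) AS ranked
#   WHERE rk = 1;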
5 | #SBATCH --cpus-per-task=4 # number of cores per tasks 6 | #SBATCH --hint=nomultithread # we get physical cores not logical 7 | #SBATCH --partition=prepost 8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) 9 | #SBATCH --output=/gpfswork/rech/six/uty16tp/code/big_science/logs/re_dowload/%x-%j.out # output file name 10 | #SBATCH --account=six@cpu 11 | 12 | set -x -e 13 | 14 | source $six_ALL_CCFRWORK/start-prod 15 | conda activate thomas_data_tooling # Debug deepspeed temporarily 16 | 17 | DATA_TOOLING_REPO=$WORK/repos/sync_data_tooling/data_tooling 18 | DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets-redownload/bigscience-catalogue-data 19 | echo $DATASET_DIR 20 | pushd $DATA_TOOLING_REPO 21 | 22 | 23 | export HF_DATASETS_OFFLINE=1 24 | export HF_DATASETS_CACHE=$SCRATCH/to_delete 25 | 26 | python -m cc_pseudo_crawl.python_scripts.check_erros_in_dataset \ 27 | --dataset-dir $DATASET_DIR 28 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/slurm_scripts/divide_in_subshards.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=pseudo_crawl_divide_in_subshards 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 5 | #SBATCH --hint=nomultithread # we get physical cores not logical 6 | #SBATCH --partition=cpu_p1 7 | #SBATCH --cpus-per-task=4 # number of cores per tasks 8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) 9 | #SBATCH --output=/gpfswork/rech/six/uty16tp/code/big_science/logs/preprocess/%x-%j.out # output file name 10 | #SBATCH --array=0-9 # TODO: modify according to the number of models you want to evaluated 11 | #SBATCH --account=six@cpu 12 | 13 | set -x -e 14 | 15 | source $six_ALL_CCFRWORK/start-prod 16 | conda activate thomas_data_tooling # Debug deepspeed temporarily 17 | 18 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/cc 19 | DATA_TOOLING_REPO=$WORK/repos/sync_data_tooling/data_tooling 20 | DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets-redownload/bigscience-catalogue-data/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--"$SLURM_ARRAY_TASK_COUNT" 21 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets-divide-in-subshards/bigscience-catalogue-data 22 | echo $DATASET_PATH 23 | pushd $DATA_TOOLING_REPO 24 | 25 | mkdir -p $SAVE_DATASET_DIR 26 | 27 | export HF_DATASETS_OFFLINE=1 28 | export HF_DATASETS_CACHE=$SCRATCH/to_delete 29 | 30 | python -m cc_pseudo_crawl.python_scripts.divide_in_shards \ 31 | --dataset-path $DATASET_PATH \ 32 | --save-dir $SAVE_DATASET_DIR \ 33 | --num-shards 10 34 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/slurm_scripts/divide_in_subshards_1000.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=pseudo_crawl_divide_in_subshards 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
5 | #SBATCH --hint=nomultithread # we get physical cores not logical 6 | #SBATCH --partition=compil 7 | #SBATCH --cpus-per-task=4 # number of cores per tasks 8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) 9 | #SBATCH --output=logs/%x-%j.out # output file name 10 | #SBATCH --array=0-99 # TODO: modify according to the number of models you want to evaluated 11 | #SBATCH --account=six@cpu 12 | 13 | set -x -e 14 | 15 | source $six_ALL_CCFRWORK/start-prod 16 | conda activate thomas_data_tooling # Debug deepspeed temporarily 17 | 18 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/cc 19 | DATA_TOOLING_REPO=$WORK/code/big_science/data_tooling 20 | DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets-preprocessed-text-extracted/bigscience-catalogue-data/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--100 21 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets-preprocessed-text-extracted/bigscience-catalogue-data 22 | echo $DATASET_PATH 23 | pushd $DATA_TOOLING_REPO 24 | 25 | mkdir -p $SAVE_DATASET_DIR 26 | 27 | export HF_DATASETS_OFFLINE=1 28 | export HF_DATASETS_CACHE=$SCRATCH/to_delete 29 | 30 | python -m cc_pseudo_crawl.python_scripts.divide_in_shards \ 31 | --dataset-path $DATASET_PATH \ 32 | --save-dir $SAVE_DATASET_DIR \ 33 | --num-shards 10 34 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/slurm_scripts/download_warc.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=pseudo_crawl_download 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 5 | #SBATCH --cpus-per-task=4 # number of cores per tasks 6 | #SBATCH --hint=nomultithread # we get physical cores not logical 7 | #SBATCH --partition=prepost 8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) 9 | #SBATCH --output=logs/%x-%j.out # output file name 10 | #SBATCH --array=0-9 # TODO: modify according to the number of models you want to evaluated 11 | #SBATCH --account=six@cpu 12 | 13 | set -x -e 14 | 15 | source $six_ALL_CCFRWORK/start-prod 16 | conda activate thomas_data_tooling # Debug deepspeed temporarily 17 | 18 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/cc 19 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets 20 | DATA_TOOLING_REPO=$WORK/code/big_science/data_tooling 21 | pushd $DATA_TOOLING_REPO 22 | 23 | # TODO run this offline 24 | # aws s3 sync s3://commoncrawl-dev/big-science-workshop/data-sourcing-sheet/cc-{FLAVOR}/ $CC_INDEX_FOLDER/cc-{FLAVOR}/ 25 | 26 | python cc_pseudo_crawl/python_scripts/download_warc.py \ 27 | --dataset bigscience-catalogue-data/pseudo_crawl_seed \ 28 | --cc-index-folder $CC_INDEX_FOLDER \ 29 | --save-dir $SAVE_DATASET_DIR \ 30 | --num-proc 4 \ 31 | --shard-id $SLURM_ARRAY_TASK_ID \ 32 | --num-shards $SLURM_ARRAY_TASK_COUNT 33 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/slurm_scripts/download_warc_too_big.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=pseudo_crawl_download_failed_shards 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
5 | #SBATCH --cpus-per-task=4 # number of cores per tasks 6 | #SBATCH --hint=nomultithread # we get physical cores not logical 7 | #SBATCH --partition=compil 8 | #SBATCH --time 14:00:00 # maximum execution time (HH:MM:SS) 9 | #SBATCH --output=/gpfswork/rech/six/uty16tp/code/big_science/logs/download_failed_shards/%x-%j.out # output file name 10 | #SBATCH --array=3,9 # TODO: modify according to the number of models you want to evaluated 11 | #SBATCH --account=six@cpu 12 | 13 | set -x -e 14 | 15 | source $six_ALL_CCFRWORK/start-prod 16 | conda activate thomas_data_tooling # Debug deepspeed temporarily 17 | 18 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/cc 19 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets 20 | DATA_TOOLING_REPO=$WORK/repos/sync_data_tooling/data_tooling 21 | pushd $DATA_TOOLING_REPO 22 | 23 | # TODO run this offline 24 | # aws s3 sync s3://commoncrawl-dev/big-science-workshop/data-sourcing-sheet/cc-{FLAVOR}/ $CC_INDEX_FOLDER/cc-{FLAVOR}/ 25 | 26 | export HF_DATASETS_OFFLINE=1 27 | export HF_DATASETS_CACHE=$SCRATCH/to_delete 28 | 29 | python cc_pseudo_crawl/python_scripts/download_warc.py \ 30 | --dataset bigscience-catalogue-data/pseudo_crawl_seed \ 31 | --cc-index-folder $CC_INDEX_FOLDER \ 32 | --save-dir $SAVE_DATASET_DIR \ 33 | --num-proc 4 \ 34 | --shard-id $SLURM_ARRAY_TASK_ID \ 35 | --num-shards 10 \ 36 | --use-datasets-caching 37 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/slurm_scripts/download_warc_trial_4.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=pseudo_crawl_download_trial_4 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
5 | #SBATCH --cpus-per-task=4 # number of cores per tasks 6 | #SBATCH --hint=nomultithread # we get physical cores not logical 7 | #SBATCH --partition=prepost 8 | #SBATCH --time 15:00:00 # maximum execution time (HH:MM:SS) 9 | #SBATCH --output=/gpfswork/rech/six/uty16tp/code/big_science/logs/download_trial_4/%x-%j.out # output file name 10 | #SBATCH --array=0-9 # TODO: modify according to the number of models you want to evaluated 11 | #SBATCH --account=six@cpu 12 | 13 | set -x -e 14 | 15 | source $six_ALL_CCFRWORK/start-prod 16 | conda activate thomas_data_tooling # Debug deepspeed temporarily 17 | 18 | export DATASETS_VERBOSITY=info 19 | 20 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/cc 21 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets 22 | DATA_TOOLING_REPO=$WORK/repos/sync_data_tooling/data_tooling 23 | pushd $DATA_TOOLING_REPO 24 | 25 | # TODO run this offline 26 | # aws s3 sync s3://commoncrawl-dev/big-science-workshop/data-sourcing-sheet/cc-{FLAVOR}/ $CC_INDEX_FOLDER/cc-{FLAVOR}/ 27 | 28 | python cc_pseudo_crawl/python_scripts/download_warc.py \ 29 | --dataset bigscience-catalogue-data/pseudo_crawl_seed \ 30 | --cc-index-folder $CC_INDEX_FOLDER \ 31 | --save-dir $SAVE_DATASET_DIR \ 32 | --num-proc 4 \ 33 | --shard-id $SLURM_ARRAY_TASK_ID \ 34 | --num-shards $SLURM_ARRAY_TASK_COUNT 35 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/slurm_scripts/download_warc_trial_5.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=pseudo_crawl_download_trial_7 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 5 | #SBATCH --cpus-per-task=4 # number of cores per tasks 6 | #SBATCH --hint=nomultithread # we get physical cores not logical 7 | #SBATCH --partition=compil 8 | #SBATCH --time 15:00:00 # maximum execution time (HH:MM:SS) 9 | #SBATCH --output=/gpfswork/rech/six/uty16tp/code/big_science/logs/download_trial_5/%x-%j.out # output file name 10 | #SBATCH --array=0-9 # TODO: modify according to the number of models you want to evaluated 11 | #SBATCH --account=six@cpu 12 | 13 | set -x -e 14 | 15 | source $six_ALL_CCFRWORK/start-prod 16 | conda activate thomas_data_tooling # Debug deepspeed temporarily 17 | 18 | export DATASETS_VERBOSITY=info 19 | 20 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/cc 21 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets 22 | DATA_TOOLING_REPO=$WORK/repos/sync_data_tooling/data_tooling 23 | pushd $DATA_TOOLING_REPO 24 | 25 | # TODO run this offline 26 | # aws s3 sync s3://commoncrawl-dev/big-science-workshop/data-sourcing-sheet/cc-{FLAVOR}/ $CC_INDEX_FOLDER/cc-{FLAVOR}/ 27 | 28 | python cc_pseudo_crawl/python_scripts/download_warc.py \ 29 | --dataset bigscience-catalogue-data/pseudo_crawl_seed \ 30 | --cc-index-folder $CC_INDEX_FOLDER \ 31 | --save-dir $SAVE_DATASET_DIR \ 32 | --num-proc 4 \ 33 | --shard-id $SLURM_ARRAY_TASK_ID \ 34 | --num-shards $SLURM_ARRAY_TASK_COUNT 35 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/slurm_scripts/merge_seed_shards.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=pseudo_crawl_merge_seed_shards 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
5 | #SBATCH --cpus-per-task=4 # number of cores per tasks 6 | #SBATCH --hint=nomultithread # we get physical cores not logical 7 | #SBATCH --partition=cpu_p1 8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) 9 | #SBATCH --output=logs/merge_seed_shards/%x-%j.out # output file name 10 | #SBATCH --array=0-604 # TODO: modify according to the number of models you want to evaluated 11 | #SBATCH --mail-type=ALL 12 | #SBATCH --mail-user=thomas.wang@huggingface.co 13 | #SBATCH --account=six@cpu 14 | 15 | 16 | set -x -e 17 | 18 | source $six_ALL_CCFRWORK/start-prod 19 | conda activate thomas_data_tooling # Debug deepspeed temporarily 20 | 21 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/cc 22 | DATA_TOOLING_REPO=$WORK/code/big_science/data_tooling 23 | DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets-shard-by-seed-id/bigscience-catalogue-data 24 | pushd $DATA_TOOLING_REPO 25 | 26 | SEED_ID=$(python cc_pseudo_crawl/seeds_batch_1/load_all_seed_ids.py --seed-path "$DATA_TOOLING_REPO"/cc_pseudo_crawl/seeds_batch_1/sourcing_sheet_seeds/ --seed-index $SLURM_ARRAY_TASK_ID) 27 | echo "Merging all shards of seed id ${SEED_ID}" 28 | SAVE_DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets-seeds/bigscience-catalogue-data/pseudo_crawl_seed--seed-id--"$SEED_ID" 29 | 30 | export HF_DATASETS_OFFLINE=1 31 | export HF_DATASETS_CACHE=$SCRATCH/to_delete 32 | 33 | python cc_pseudo_crawl/python_scripts/merge_seed_shards.py \ 34 | --dataset-dir $DATASET_DIR \ 35 | --seed-id $SEED_ID \ 36 | --save-path $SAVE_DATASET_PATH 37 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/slurm_scripts/preprocess_warc.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=pseudo_crawl_preprocess_v4 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
5 | #SBATCH --cpus-per-task=40 # number of cores per tasks 6 | #SBATCH --hint=nomultithread # we get physical cores not logical 7 | #SBATCH --partition=cpu_p1 8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) 9 | #SBATCH --output=/gpfswork/rech/six/uty16tp/code/big_science/logs/preprocess-on-subshards/%x-%j.out # output file name 10 | #SBATCH --array=0-99 # TODO: modify according to the number of models you want to evaluated 11 | #SBATCH --account=six@cpu 12 | 13 | set -x -e 14 | 15 | source $six_ALL_CCFRWORK/start-prod 16 | conda activate thomas_data_tooling # Debug deepspeed temporarily 17 | 18 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/cc 19 | DATA_TOOLING_REPO=$WORK/repos/sync_data_tooling/data_tooling 20 | DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets-divide-in-subshards/bigscience-catalogue-data/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--"$SLURM_ARRAY_TASK_COUNT" 21 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets-preprocessed/bigscience-catalogue-data 22 | SAVE_DATASET_PATH=$SAVE_DATASET_DIR/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--"$SLURM_ARRAY_TASK_COUNT" 23 | echo $DATASET_PATH 24 | pushd $DATA_TOOLING_REPO 25 | 26 | mkdir -p $SAVE_DATASET_DIR 27 | 28 | export HF_DATASETS_OFFLINE=1 29 | export HF_DATASETS_CACHE=$SCRATCH/to_delete 30 | 31 | python -m cc_pseudo_crawl.python_scripts.preprocess_dataset \ 32 | --dataset-path $DATASET_PATH \ 33 | --num-proc 80 \ 34 | --save-path $SAVE_DATASET_PATH \ 35 | --use-datasets-caching \ 36 | --flavor seed 37 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/slurm_scripts/redownload_warc.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=pseudo_crawl_redownload 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
5 | #SBATCH --cpus-per-task=4 # number of cores per tasks 6 | #SBATCH --hint=nomultithread # we get physical cores not logical 7 | #SBATCH --partition=prepost 8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) 9 | #SBATCH --output=/gpfswork/rech/six/uty16tp/code/big_science/logs/re_dowload/%x-%j.out # output file name 10 | #SBATCH --array=0-9 # TODO: modify according to the number of models you want to evaluated 11 | #SBATCH --account=six@cpu 12 | 13 | set -x -e 14 | 15 | source $six_ALL_CCFRWORK/start-prod 16 | conda activate thomas_data_tooling # Debug deepspeed temporarily 17 | 18 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/cc 19 | DATA_TOOLING_REPO=$WORK/repos/sync_data_tooling/data_tooling 20 | DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets/bigscience-catalogue-data/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--"$SLURM_ARRAY_TASK_COUNT" 21 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets-redownload/bigscience-catalogue-data 22 | SAVE_DATASET_PATH=$SAVE_DATASET_DIR/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--"$SLURM_ARRAY_TASK_COUNT" 23 | echo $DATASET_PATH 24 | pushd $DATA_TOOLING_REPO 25 | 26 | mkdir -p $SAVE_DATASET_DIR 27 | 28 | export HF_DATASETS_OFFLINE=1 29 | export HF_DATASETS_CACHE=$SCRATCH/to_delete 30 | 31 | python -m cc_pseudo_crawl.python_scripts.redownload_warc \ 32 | --dataset-path $DATASET_PATH \ 33 | --num-proc 4 \ 34 | --save-path $SAVE_DATASET_PATH \ 35 | --use-datasets-caching 36 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/slurm_scripts/shard_and_compress.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=pseudo_crawl_shard_and_compress 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
5 | #SBATCH --cpus-per-task=40 # number of cores per tasks 6 | #SBATCH --hint=nomultithread # we get physical cores not logical 7 | #SBATCH --partition=cpu_p1 8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) 9 | #SBATCH --output=logs/fix_compress_cpu_p1_2/%x-%j.out # output file name 10 | #SBATCH --array=0-604 # TODO: modify according to the number of models you want to evaluated 11 | #SBATCH --mail-type=ALL 12 | #SBATCH --account=six@cpu 13 | 14 | set -x -e 15 | 16 | source $six_ALL_CCFRWORK/start-prod 17 | conda activate thomas_data_tooling # Debug deepspeed temporarily 18 | 19 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/cc 20 | DATA_TOOLING_REPO=$WORK/code/big_science/data_tooling 21 | 22 | pushd $DATA_TOOLING_REPO 23 | 24 | SEED_ID=$(python cc_pseudo_crawl/seeds_batch_1/load_all_seed_ids.py --seed-path "$DATA_TOOLING_REPO"/cc_pseudo_crawl/seeds_batch_1/sourcing_sheet_seeds/ --seed-index $SLURM_ARRAY_TASK_ID) 25 | # SEED_IDS=( 26 | # 689 27 | # 510 28 | # 550 29 | # ) 30 | # SEED_ID=${SEED_IDS[$SLURM_ARRAY_TASK_ID]} 31 | 32 | # NODES_PER_SEED=15 33 | 34 | # INDEX_SLICE=$(python -c "print($SLURM_ARRAY_TASK_ID % $NODES_PER_SEED)") 35 | # SEED_ID=${SEED_IDS[$(python -c "print($SLURM_ARRAY_TASK_ID // $NODES_PER_SEED)")]} 36 | 37 | echo "Sharding and compressing seed id ${SEED_ID}" 38 | 39 | DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets-seeds/bigscience-catalogue-data/pseudo_crawl_seed--seed-id--"$SEED_ID" 40 | SAVE_DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets-compressed-shards/bigscience-catalogue-data/seed_id="$SEED_ID" 41 | 42 | mkdir -p $SAVE_DATASET_PATH 43 | 44 | export HF_DATASETS_OFFLINE=1 45 | export HF_DATASETS_CACHE=$SCRATCH/to_delete 46 | 47 | python cc_pseudo_crawl/python_scripts/shard_and_compress.py \ 48 | --dataset-path $DATASET_PATH \ 49 | --max-size 10_000_000_000 \ 50 | --num-proc 4 \ 51 | --save-num-proc 10 \ 52 | --save-path $SAVE_DATASET_PATH \ 53 | --save-batch-size 10 54 | # --index-slice $INDEX_SLICE 55 | # --total-number-slice $NODES_PER_SEED 56 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/slurm_scripts/shard_by_seed_id.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=pseudo_crawl_shard_by_id 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
5 | #SBATCH --cpus-per-task=4 # number of cores per tasks 6 | #SBATCH --hint=nomultithread # we get physical cores not logical 7 | #SBATCH --partition=cpu_p1 8 | #SBATCH --time 04:00:00 # maximum execution time (HH:MM:SS) 9 | #SBATCH --output=logs/%x-%j.out # output file name 10 | #SBATCH --array=0-999 # TODO: modify according to the number of models you want to evaluated 11 | #SBATCH --mail-type=ALL 12 | #SBATCH --mail-user=thomas.wang@huggingface.co 13 | #SBATCH --account=six@cpu 14 | 15 | set -x -e 16 | 17 | source $six_ALL_CCFRWORK/start-prod 18 | conda activate thomas_data_tooling # Debug deepspeed temporarily 19 | 20 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/cc 21 | DATA_TOOLING_REPO=$WORK/code/big_science/data_tooling 22 | DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets-preprocessed-text-extracted/bigscience-catalogue-data/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--1000 23 | SAVE_DATASET_PREFIX_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets-shard-by-seed-id/bigscience-catalogue-data/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--1000 24 | pushd $DATA_TOOLING_REPO 25 | 26 | mkdir -p $(dirname $SAVE_DATASET_PREFIX_PATH) 27 | 28 | export HF_DATASETS_OFFLINE=1 29 | export HF_DATASETS_CACHE=$SCRATCH/to_delete 30 | 31 | python cc_pseudo_crawl/python_scripts/shard_by_seed_id.py \ 32 | --dataset-path $DATASET_PATH \ 33 | --num-proc 4 \ 34 | --save-prefix-path $SAVE_DATASET_PREFIX_PATH 35 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1/sourcing_sheet_seeds/README.md: -------------------------------------------------------------------------------- 1 | # Pseudo-Crawl Data Sourcing Candidate Seeds Spreadsheet 2 | 3 | Source: https://docs.google.com/spreadsheets/d/1DNLAGz--qvLh-0qQ7pMPGiNeUMgp-fRgn-8mbLagC7U/edit#gid=513216703 (timestamp 2021-11-28, reverted edits by anonymous user on record 16 - diariovasco.com), exported as [candidate_websites_for_crawling.csv](./candidate_websites_for_crawling.csv) 4 | 5 | Steps: 6 | 7 | 1. run [cleanup-seeds](./cleanup-seeds.ipynb) to prepare a clean seed list 8 | 9 | 2. do the lookups / table join, see [general instructions](../README.md) using the crawl selector `CC-MAIN-202[01]` to restrict the join for the last 2 years 10 | 11 | 3. prepare [coverage metrics](./cc-metrics.ipynb) 12 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1_2/00_clean_dataset.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=pseudo_crawl_clean_dataset 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
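# The sourcing_sheet_seeds README above restricts the cc-index join with the crawl selector
# `CC-MAIN-202[01]` (i.e. crawls from 2020 and 2021). As an Athena/Presto predicate on the
# ccindex table this would look roughly like the following sketch (not a query used in this repo):
#
#   WHERE subset = 'warc'
#     AND regexp_like(crawl, '^CC-MAIN-202[01]')
#   -- equivalently: AND (crawl LIKE 'CC-MAIN-2020%' OR crawl LIKE 'CC-MAIN-2021%')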
5 | #SBATCH --cpus-per-task=40 # number of cores per tasks 6 | #SBATCH --hint=nomultithread # we get physical cores not logical 7 | #SBATCH --partition=cpu_p1 8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) 9 | #SBATCH --output=/gpfsscratch/rech/six/commun/pseudo_crawl/seeds_batch_1_2/logs/clean_dataset-v2/%x-%j.out # output file name #TODO change path if necessary 10 | #SBATCH --array=0-613 11 | #SBATCH --account=six@cpu 12 | 13 | set -x -e 14 | 15 | source $six_ALL_CCFRWORK/start-prod 16 | conda activate thomas_data_tooling 17 | 18 | DATA_TOOLING_REPO=$WORK/code/big_science/data_tooling 19 | 20 | DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/hub/pseudo_crawl 21 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1_2/datasets-clean/bigscience-catalogue-data 22 | echo $DATASET_PATH 23 | pushd $DATA_TOOLING_REPO 24 | 25 | SEED_ID=$(python cc_pseudo_crawl/python_scripts/load_all_seed_ids.py \ 26 | --seed-paths "$DATA_TOOLING_REPO"/cc_pseudo_crawl/seeds_batch_1/sourcing_sheet_seeds/seeds.csv,"$DATA_TOOLING_REPO"/cc_pseudo_crawl/seeds_batch_2/sourcing_sheet_seeds/seeds.csv \ 27 | --seed-index $SLURM_ARRAY_TASK_ID \ 28 | ) 29 | 30 | mkdir -p $SAVE_DATASET_DIR 31 | 32 | export HF_DATASETS_OFFLINE=1 33 | export HF_DATASETS_CACHE=$SCRATCH/to_delete 34 | 35 | python cc_pseudo_crawl/python_scripts/pseudo_crawl_seed_to_lm_dset_v2.py \ 36 | --seed-id $SEED_ID \ 37 | --save-dir $SAVE_DATASET_DIR \ 38 | --pseudo_crawl_path $DATASET_PATH \ 39 | --batch-size 10 \ 40 | --save-batch-size 10 \ 41 | --num-proc 10 \ 42 | --min-chars 32 \ 43 | --n-records 10000 \ 44 | --pourcentage-threshold 0.01 \ 45 | --min-repetition-threshold 10 46 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_1_2/01_exact_deduplicates.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=pseudo_crawl_deduplicate 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
5 | #SBATCH --cpus-per-task=4 # number of cores per tasks 6 | #SBATCH --hint=nomultithread # we get physical cores not logical 7 | #SBATCH --partition=cpu_p1 8 | #SBATCH --time 2:00:00 # maximum execution time (HH:MM:SS) 9 | #SBATCH --output=/gpfsscratch/rech/six/commun/pseudo_crawl/seeds_batch_1_2/logs/deduplicate-on-clean-v2/%x-%j.out # output file name #TODO change path if necessary 10 | #SBATCH --array=0-613 11 | #SBATCH --account=six@cpu 12 | 13 | set -x -e 14 | 15 | source $six_ALL_CCFRWORK/start-prod 16 | conda activate thomas_data_tooling 17 | 18 | DATA_TOOLING_REPO=$WORK/repos/sync_data_tooling/data_tooling 19 | 20 | DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1_2/datasets-clean/bigscience-catalogue-data 21 | 22 | SEED_ID=$(python cc_pseudo_crawl/python_scripts/load_all_seed_ids.py \ 23 | --seed-paths "$DATA_TOOLING_REPO"/cc_pseudo_crawl/seeds_batch_1/sourcing_sheet_seeds/seeds.csv,"$DATA_TOOLING_REPO"/cc_pseudo_crawl/seeds_batch_2/sourcing_sheet_seeds/seeds.csv \ 24 | --seed-index $SLURM_ARRAY_TASK_ID \ 25 | ) 26 | 27 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1_2/datasets-deduplicate-on-clean-v2/bigscience-catalogue-data/lm_change_lang_id_seed_id_${SEED_ID}_pseudocrawl_change_name 28 | echo $DATASET_PATH 29 | pushd $DATA_TOOLING_REPO 30 | 31 | mkdir -p $SAVE_DATASET_DIR 32 | 33 | export HF_DATASETS_OFFLINE=1 34 | export HF_DATASETS_CACHE=$SCRATCH/to_delete 35 | 36 | python cc_pseudo_crawl/python_scripts/exact_deduplicates.py \ 37 | --seed-id $SEED_ID \ 38 | --save-dir $SAVE_DATASET_DIR \ 39 | --pseudo_crawl_path $DATASET_PATH \ 40 | --batch-size 1000 \ 41 | --save-batch-size 1000 \ 42 | --num-proc 8 43 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/.gitignore: -------------------------------------------------------------------------------- 1 | sourcing_sheet_seeds/seeds_batch_2.gz.parquet 2 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/README.md: -------------------------------------------------------------------------------- 1 | # Extracting Content from Common Crawl for 2nd Curated List of Sites 2 | 3 | This folder gathers the scripts necessary to create the extension of the pseudo crawl dataset with the new seeds: 4 | - https://www.bbc.com/swahili 5 | - https://www.bbc.com/gahuza 6 | - https://www.bbc.com/igbo 7 | - https://www.bbc.com/yoruba 8 | - https://yo.globalvoices.org 9 | - https://ig.globalvoices.org 10 | - https://www.dw.com/sw 11 | - https://www.mwananchi.co.tz 12 | - https://www.voaswahili.com 13 | - https://www.voaswahili.com/ 14 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/slurm_scripts/01_download_warc.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=pseudo_crawl_download_v1 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
5 | #SBATCH --cpus-per-task=4 # number of cores per tasks 6 | #SBATCH --hint=nomultithread # we get physical cores not logical 7 | #SBATCH --partition=prepost 8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) 9 | #SBATCH --output=/gpfsscratch/rech/six/commun/pseudo_crawl/seeds_batch_2/logs/%x-%j.out # output file name #TODO change path if necessary 10 | #SBATCH --array=0-99 #TODO set correct number 11 | #SBATCH --account=six@cpu 12 | 13 | set -x -e 14 | 15 | source $six_ALL_CCFRWORK/start-prod 16 | conda activate thomas_data_tooling 17 | 18 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/cc 19 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets 20 | DATA_TOOLING_REPO=$WORK/repos/sync_data_tooling/data_tooling 21 | pushd $DATA_TOOLING_REPO 22 | 23 | mkdir -p $SAVE_DATASET_DIR 24 | 25 | # TODO run this offline 26 | # aws s3 sync s3://commoncrawl-dev/big-science-workshop/data-sourcing-sheet/cc-{FLAVOR}/ $CC_INDEX_FOLDER/cc-{FLAVOR}/ 27 | 28 | export HF_DATASETS_OFFLINE=1 29 | export HF_DATASETS_CACHE=$SCRATCH/to_delete 30 | 31 | python cc_pseudo_crawl/python_scripts/download_warc.py \ 32 | --dataset bigscience-catalogue-data/pseudo_crawl_seed \ 33 | --cc-index-folder $CC_INDEX_FOLDER \ 34 | --save-dir $SAVE_DATASET_DIR \ 35 | --num-proc 4 \ 36 | --shard-id $SLURM_ARRAY_TASK_ID \ 37 | --num-shards $SLURM_ARRAY_TASK_COUNT \ 38 | --flavor "seeds_batch_2" \ 39 | --use-datasets-caching 40 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/slurm_scripts/02_redownload_warc.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=pseudo_crawl_redownload 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
5 | #SBATCH --cpus-per-task=4 # number of cores per tasks 6 | #SBATCH --hint=nomultithread # we get physical cores not logical 7 | #SBATCH --partition=compil 8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) 9 | #SBATCH --output=/gpfsscratch/rech/six/commun/pseudo_crawl/seeds_batch_2/logs/%x-%j.out # output file name #TODO change path if necessary 10 | #SBATCH --array=81-99%5 11 | #SBATCH --account=six@cpu 12 | 13 | set -x -e 14 | 15 | source $six_ALL_CCFRWORK/start-prod 16 | conda activate thomas_data_tooling 17 | 18 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/cc 19 | DATA_TOOLING_REPO=$WORK/repos/sync_data_tooling/data_tooling 20 | 21 | DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets/bigscience-catalogue-data/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--100 22 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-redownload/bigscience-catalogue-data 23 | SAVE_DATASET_PATH=$SAVE_DATASET_DIR/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--100 24 | echo $DATASET_PATH 25 | pushd $DATA_TOOLING_REPO 26 | 27 | mkdir -p $SAVE_DATASET_DIR 28 | 29 | export HF_DATASETS_OFFLINE=1 30 | export HF_DATASETS_CACHE=$SCRATCH/to_delete 31 | 32 | python -m cc_pseudo_crawl.python_scripts.redownload_warc \ 33 | --dataset-path $DATASET_PATH \ 34 | --num-proc 1 \ 35 | --save-path $SAVE_DATASET_PATH \ 36 | --use-datasets-caching 37 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/slurm_scripts/02b_redownload_warc.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=pseudo_crawl_redownload 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
5 | #SBATCH --cpus-per-task=4 # number of cores per tasks 6 | #SBATCH --hint=nomultithread # we get physical cores not logical 7 | #SBATCH --partition=compil 8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) 9 | #SBATCH --output=/gpfsscratch/rech/six/commun/pseudo_crawl/seeds_batch_2/logs/02b/%x-%j.out # output file name #TODO change path if necessary 10 | #SBATCH --array=0-99%5 11 | #SBATCH --account=six@cpu 12 | 13 | set -x -e 14 | 15 | source $six_ALL_CCFRWORK/start-prod 16 | conda activate thomas_data_tooling 17 | 18 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/cc 19 | DATA_TOOLING_REPO=$WORK/code/big_science/data_tooling 20 | 21 | DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-redownload/bigscience-catalogue-data/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--100 22 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-redownload/bigscience-catalogue-data 23 | SAVE_DATASET_PATH=$SAVE_DATASET_DIR/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--100 24 | echo $DATASET_PATH 25 | pushd $DATA_TOOLING_REPO 26 | 27 | mkdir -p $SAVE_DATASET_DIR 28 | 29 | export HF_DATASETS_OFFLINE=1 30 | export HF_DATASETS_CACHE=$SCRATCH/to_delete 31 | 32 | python -m cc_pseudo_crawl.python_scripts.redownload_warc \ 33 | --dataset-path $DATASET_PATH \ 34 | --num-proc 1 \ 35 | --save-path $SAVE_DATASET_PATH \ 36 | --use-datasets-caching 37 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/slurm_scripts/03_check_errors_in_dataset.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=pseudo_crawl_check_erros_in_dataset 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 5 | #SBATCH --cpus-per-task=4 # number of cores per tasks 6 | #SBATCH --hint=nomultithread # we get physical cores not logical 7 | #SBATCH --partition=cpu_p1 8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) 9 | #SBATCH --output=/gpfsscratch/rech/six/commun/pseudo_crawl/seeds_batch_2/logs/%x-%j.out # output file name #TODO change path if necessary 10 | #SBATCH --account=six@cpu 11 | 12 | set -x -e 13 | 14 | source $six_ALL_CCFRWORK/start-prod 15 | conda activate thomas_data_tooling 16 | 17 | DATA_TOOLING_REPO=$WORK/repos/sync_data_tooling/data_tooling #TODO change path if necessary 18 | 19 | DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-redownload/bigscience-catalogue-data 20 | echo $DATASET_DIR 21 | pushd $DATA_TOOLING_REPO 22 | 23 | 24 | export HF_DATASETS_OFFLINE=1 25 | export HF_DATASETS_CACHE=$SCRATCH/to_delete 26 | 27 | python -m cc_pseudo_crawl.python_scripts.check_erros_in_dataset \ 28 | --dataset-dir $DATASET_DIR 29 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/slurm_scripts/04_divide_in_subshards.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=pseudo_crawl_divide_in_subshards 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
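# Note: this step splits each of the 100 redownloaded shards into 10 sub-shards
# (see --num-shards below), presumably to give the later preprocessing jobs smaller units of work.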
5 | #SBATCH --hint=nomultithread # we get physical cores not logical 6 | #SBATCH --partition=cpu_p1 7 | #SBATCH --cpus-per-task=4 # number of cores per tasks 8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) 9 | #SBATCH --output=/gpfsscratch/rech/six/commun/pseudo_crawl/seeds_batch_2/logs/divice_in_subshards/logs/%x-%j.out # output file name #TODO change path if necessary 10 | #SBATCH --array=0-99 #TODO set correct number 11 | #SBATCH --account=six@cpu 12 | 13 | set -x -e 14 | 15 | source $six_ALL_CCFRWORK/start-prod 16 | conda activate thomas_data_tooling 17 | 18 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/cc #TODO change path if necessary 19 | DATA_TOOLING_REPO=$WORK/code/big_science/data_tooling #TODO change path if necessary 20 | 21 | DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-redownload/bigscience-catalogue-data/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--"$SLURM_ARRAY_TASK_COUNT" 22 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-divide-in-subshards/bigscience-catalogue-data 23 | echo $DATASET_PATH 24 | pushd $DATA_TOOLING_REPO 25 | 26 | mkdir -p $SAVE_DATASET_DIR 27 | 28 | export HF_DATASETS_OFFLINE=1 29 | export HF_DATASETS_CACHE=$SCRATCH/to_delete 30 | 31 | python -m cc_pseudo_crawl.python_scripts.divide_in_shards \ 32 | --dataset-path $DATASET_PATH \ 33 | --save-dir $SAVE_DATASET_DIR \ 34 | --num-shards 10 35 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/slurm_scripts/05_preprocess_warc.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=pseudo_crawl_preprocess 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
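# Note: this step runs preprocess_dataset over each sub-shard; it requests 40 cores
# (--cpus-per-task below) but launches the script with --num-proc 80, i.e. two worker
# processes per physical core.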
5 | #SBATCH --cpus-per-task=40 # number of cores per tasks 6 | #SBATCH --hint=nomultithread # we get physical cores not logical 7 | #SBATCH --partition=cpu_p1 8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) 9 | #SBATCH --output=/gpfsscratch/rech/six/commun/pseudo_crawl/seeds_batch_2/logs/preprocess_warc/%x-%j.out # output file name #TODO change path if necessary 10 | #SBATCH --array=0-99 #TODO set correct number 11 | #SBATCH --account=six@cpu 12 | 13 | set -x -e 14 | 15 | source $six_ALL_CCFRWORK/start-prod 16 | conda activate thomas_data_tooling 17 | 18 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/cc #TODO change path if necessary 19 | DATA_TOOLING_REPO=$WORK/code/big_science/data_tooling #TODO change path if necessary 20 | 21 | DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-divide-in-subshards/bigscience-catalogue-data/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--"$SLURM_ARRAY_TASK_COUNT" 22 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-preprocessed/bigscience-catalogue-data 23 | SAVE_DATASET_PATH=$SAVE_DATASET_DIR/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--"$SLURM_ARRAY_TASK_COUNT" 24 | echo $DATASET_PATH 25 | pushd $DATA_TOOLING_REPO 26 | 27 | mkdir -p $SAVE_DATASET_DIR 28 | 29 | export HF_DATASETS_OFFLINE=1 30 | export HF_DATASETS_CACHE=$SCRATCH/to_delete 31 | 32 | python -m cc_pseudo_crawl.python_scripts.preprocess_dataset \ 33 | --dataset-path $DATASET_PATH \ 34 | --num-proc 80 \ 35 | --save-path $SAVE_DATASET_PATH \ 36 | --use-datasets-caching \ 37 | --flavor seed 38 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/slurm_scripts/07_shard_by_seed_id.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=pseudo_crawl_shard_by_id 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
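# Note: this step re-partitions each preprocessed, text-extracted shard by seed id,
# presumably producing one output per (shard, seed id) pair under datasets-shard-by-seed-id.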
5 | #SBATCH --cpus-per-task=4 # number of cores per tasks 6 | #SBATCH --hint=nomultithread # we get physical cores not logical 7 | #SBATCH --partition=cpu_p1 8 | #SBATCH --time 04:00:00 # maximum execution time (HH:MM:SS) 9 | #SBATCH --output=/gpfsscratch/rech/six/commun/pseudo_crawl/seeds_batch_2/logs/shard_by_seed_id/%x-%j.out # output file name #TODO change path if necessary 10 | #SBATCH --array=0-99 #TODO set correct number 11 | #SBATCH --account=six@cpu 12 | 13 | set -x -e 14 | 15 | source $six_ALL_CCFRWORK/start-prod 16 | conda activate thomas_data_tooling 17 | 18 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/cc 19 | DATA_TOOLING_REPO=$WORK/repos/sync_data_tooling/data_tooling 20 | 21 | DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-preprocessed-text-extracted/bigscience-catalogue-data/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--100 22 | SAVE_DATASET_PREFIX_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-shard-by-seed-id/bigscience-catalogue-data/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--100 23 | pushd $DATA_TOOLING_REPO 24 | 25 | mkdir -p $(dirname $SAVE_DATASET_PREFIX_PATH) 26 | 27 | export HF_DATASETS_OFFLINE=1 28 | export HF_DATASETS_CACHE=$SCRATCH/to_delete 29 | 30 | python cc_pseudo_crawl/python_scripts/shard_by_seed_id.py \ 31 | --dataset-path $DATASET_PATH \ 32 | --num-proc 4 \ 33 | --save-prefix-path $SAVE_DATASET_PREFIX_PATH 34 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/slurm_scripts/08_merge_seed_shards.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=pseudo_crawl_merge_seed_shards 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
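# Note: for each seed id listed in sourcing_sheet_seeds/seeds.csv, this job gathers the per-shard
# pieces produced by the shard-by-seed-id step and merges them into a single per-seed dataset.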
5 | #SBATCH --cpus-per-task=4 # number of cores per tasks 6 | #SBATCH --hint=nomultithread # we get physical cores not logical 7 | #SBATCH --partition=cpu_p1 8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) 9 | #SBATCH --output=/gpfsscratch/rech/six/commun/pseudo_crawl/seeds_batch_2/logs/merge_seed_shards/%x-%j.out # output file name #TODO change path if necessary 10 | #SBATCH --array=0-99 #TODO set correct number 11 | #SBATCH --account=six@cpu 12 | 13 | 14 | set -x -e 15 | 16 | source $six_ALL_CCFRWORK/start-prod 17 | conda activate thomas_data_tooling 18 | 19 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/cc 20 | DATA_TOOLING_REPO=$WORK/repos/sync_data_tooling/data_tooling 21 | 22 | DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-shard-by-seed-id/bigscience-catalogue-data 23 | pushd $DATA_TOOLING_REPO 24 | 25 | SEED_ID=$(python cc_pseudo_crawl/python_scripts/load_all_seed_ids.py --seed-path "$DATA_TOOLING_REPO"/cc_pseudo_crawl/seeds_batch_2/sourcing_sheet_seeds/seeds.csv --seed-index $SLURM_ARRAY_TASK_ID) 26 | echo "Merging all shards of seed id ${SEED_ID}" 27 | SAVE_DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-seeds/bigscience-catalogue-data/pseudo_crawl_seed--seed-id--"$SEED_ID" 28 | 29 | export HF_DATASETS_OFFLINE=1 30 | export HF_DATASETS_CACHE=$SCRATCH/to_delete 31 | 32 | python cc_pseudo_crawl/python_scripts/merge_seed_shards.py \ 33 | --dataset-dir $DATASET_DIR \ 34 | --seed-id $SEED_ID \ 35 | --save-path $SAVE_DATASET_PATH 36 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/slurm_scripts/09_shard_and_compress.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=pseudo_crawl_shard_and_compress 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
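# Note: each merged per-seed dataset is re-sharded into compressed files capped at roughly 10 GB
# (--max-size 10_000_000_000 below), ready for the rsync/push-to-hub step that follows.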
5 | #SBATCH --cpus-per-task=40 # number of cores per tasks 6 | #SBATCH --hint=nomultithread # we get physical cores not logical 7 | #SBATCH --partition=cpu_p1 8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) 9 | #SBATCH --output=/gpfsscratch/rech/six/commun/pseudo_crawl/seeds_batch_2/logs/shard_and_compress/%x-%j.out # output file name #TODO change path if necessary 10 | #SBATCH --array=0-8 11 | #SBATCH --account=six@cpu 12 | 13 | set -x -e 14 | 15 | source $six_ALL_CCFRWORK/start-prod 16 | conda activate thomas_data_tooling 17 | 18 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/cc #TODO change path if necessary 19 | DATA_TOOLING_REPO=$WORK/code/big_science/data_tooling #TODO change path if necessary 20 | 21 | pushd $DATA_TOOLING_REPO 22 | 23 | SEED_ID=$(python cc_pseudo_crawl/python_scripts/load_all_seed_ids.py --seed-path "$DATA_TOOLING_REPO"/cc_pseudo_crawl/seeds_batch_2/sourcing_sheet_seeds/seeds.csv --seed-index $SLURM_ARRAY_TASK_ID) 24 | # SEED_IDS=( 25 | # 689 26 | # 510 27 | # 550 28 | # ) 29 | # SEED_ID=${SEED_IDS[$SLURM_ARRAY_TASK_ID]} 30 | 31 | # NODES_PER_SEED=15 32 | 33 | # INDEX_SLICE=$(python -c "print($SLURM_ARRAY_TASK_ID % $NODES_PER_SEED)") 34 | # SEED_ID=${SEED_IDS[$(python -c "print($SLURM_ARRAY_TASK_ID // $NODES_PER_SEED)")]} 35 | 36 | echo "Sharding and compressing seed id ${SEED_ID}" 37 | 38 | DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-seeds/bigscience-catalogue-data/pseudo_crawl_seed--seed-id--"$SEED_ID" 39 | SAVE_DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-compressed-shards/bigscience-catalogue-data/seed_id="$SEED_ID" 40 | 41 | mkdir -p $SAVE_DATASET_PATH 42 | 43 | export HF_DATASETS_OFFLINE=1 44 | export HF_DATASETS_CACHE=$SCRATCH/to_delete 45 | 46 | python cc_pseudo_crawl/python_scripts/shard_and_compress.py \ 47 | --dataset-path $DATASET_PATH \ 48 | --max-size 10_000_000_000 \ 49 | --num-proc 4 \ 50 | --save-num-proc 10 \ 51 | --save-path $SAVE_DATASET_PATH \ 52 | --save-batch-size 10 53 | # --index-slice $INDEX_SLICE 54 | # --total-number-slice $NODES_PER_SEED 55 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/slurm_scripts/10_push_to_hub.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=pseudo-crawl-push-to-hub # (change me!) job name 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 5 | #SBATCH --cpus-per-task=4 # (change me! between 0 and 48) number of cores per tasks 6 | #SBATCH --hint=nomultithread # we get physical cores not logical 7 | #SBATCH --time 10:00:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS) 8 | #SBATCH --gres=gpu:0 # (change me! 
between 0 and 1) number of gpus 9 | #SBATCH --output=logs/%x-%j.out # output file name #TODO change path if necessary 10 | #SBATCH --account=six@cpu # account 11 | #SBATCH -p compil # partition with internet 12 | 13 | set -x -e 14 | 15 | source $HOME/start-modelling-metadata-user 16 | 17 | # mv $six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-compressed-shards/bigscience-catalogue-data/* $six_ALL_CCFRSCRATCH/pseudo_crawl/hub/pseudo_crawl/ not used 18 | 19 | rsync -vam -f'+ *.jsonl.gz' -f'+ */' -f'- *' $six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-compressed-shards/bigscience-catalogue-data/* $six_ALL_CCFRSCRATCH/pseudo_crawl/hub/pseudo_crawl/ 20 | 21 | # ls $six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-compressed-shards/bigscience-catalogue-data/ -f'+ *.jsonl.gz' | xargs -n1 -P8 -I% rsync -Pa % $six_ALL_CCFRSCRATCH/pseudo_crawl/hub/pseudo_crawl/ not used 22 | 23 | 24 | cd $six_ALL_CCFRSCRATCH/pseudo_crawl/hub/pseudo_crawl/ 25 | 26 | git status 27 | 28 | for seed_id in {698..708} 29 | do 30 | echo "Add seed id n°$seed_id" 31 | git add -v *seed_id="$seed_id"/*.gz 32 | done 33 | 34 | git commit -v -m "add depth 0 dataset with html content extracted" 35 | git push -v 36 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/sourcing_sheet_seeds/seeds.csv: -------------------------------------------------------------------------------- 1 | id 2 | 698 3 | 699 4 | 700 5 | 701 6 | 702 7 | 703 8 | 704 9 | 705 10 | 706 11 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/sourcing_sheet_seeds/seeds_batch_2.csv: -------------------------------------------------------------------------------- 1 | uid,type,description,link,url_path_prefix,url_host_name,url_host_registered_domain,url_surtkey,id 2 | bbc_swahili,primary,{'homepage': 'https://www.bbc.com/swahili'},https://www.bbc.com/swahili,/swahili,www.bbc.com,bbc.com,"com,bbc)/swahili",698 3 | bbc_gahuza,primary,{'homepage': 'https://www.bbc.com/gahuza'},https://www.bbc.com/gahuza,/gahuza,www.bbc.com,bbc.com,"com,bbc)/gahuza",699 4 | bbc_igbo,primary,{'homepage': 'https://www.bbc.com/igbo'},https://www.bbc.com/igbo,/igbo,www.bbc.com,bbc.com,"com,bbc)/igbo",700 5 | bbc_yoruba,primary,{'homepage': 'https://www.bbc.com/yoruba'},https://www.bbc.com/yoruba,/yoruba,www.bbc.com,bbc.com,"com,bbc)/yoruba",701 6 | global_voices_yoruba,primary,{'homepage': 'https://yo.globalvoices.org'},https://yo.globalvoices.org/,/,yo.globalvoices.org,globalvoices.org,"org,globalvoices,yo)/",702 7 | global_voices_igbo,primary,{'homepage': 'https://ig.globalvoices.org'},https://ig.globalvoices.org/,/,ig.globalvoices.org,globalvoices.org,"org,globalvoices,ig)/",703 8 | dw_swahili,primary,{'homepage': 'https://www.dw.com/sw'},https://www.dw.com/sw,/sw,www.dw.com,dw.com,"com,dw)/sw",704 9 | mwananchi_,primary,{'homepage': 'https://www.mwananchi.co.tz'},https://www.mwananchi.co.tz/,/,www.mwananchi.co.tz,mwananchi.co.tz,"tz,co,mwananchi)/",705 10 | voa_swahili,primary,{'homepage': 'https://www.voaswahili.com'},https://www.voaswahili.com/,/,www.voaswahili.com,voaswahili.com,"com,voaswahili)/",706 11 | -------------------------------------------------------------------------------- /cc_pseudo_crawl/seeds_batch_2/sourcing_sheet_seeds/seeds_batch_2.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "uid": "bbc_swahili", 4 | "type": "primary", 5 | "description": { 6 | "homepage": 
"https://www.bbc.com/swahili" 7 | } 8 | }, 9 | { 10 | "uid": "bbc_gahuza", 11 | "type": "primary", 12 | "description": { 13 | "homepage": "https://www.bbc.com/gahuza" 14 | } 15 | }, 16 | { 17 | "uid": "bbc_igbo", 18 | "type": "primary", 19 | "description": { 20 | "homepage": "https://www.bbc.com/igbo" 21 | } 22 | }, 23 | { 24 | "uid": "bbc_yoruba", 25 | "type": "primary", 26 | "description": { 27 | "homepage": "https://www.bbc.com/yoruba" 28 | } 29 | }, 30 | { 31 | "uid": "global_voices_yoruba", 32 | "type": "primary", 33 | "description": { 34 | "homepage": "https://yo.globalvoices.org" 35 | } 36 | }, 37 | { 38 | "uid": "global_voices_igbo", 39 | "type": "primary", 40 | "description": { 41 | "homepage": "https://ig.globalvoices.org" 42 | } 43 | }, 44 | { 45 | "uid": "dw_swahili", 46 | "type": "primary", 47 | "description": { 48 | "homepage": "https://www.dw.com/sw" 49 | } 50 | }, 51 | { 52 | "uid": "mwananchi_", 53 | "type": "primary", 54 | "description": { 55 | "homepage": "https://www.mwananchi.co.tz" 56 | } 57 | }, 58 | { 59 | "uid": "voa_swahili", 60 | "type": "primary", 61 | "description": { 62 | "homepage": "https://www.voaswahili.com" 63 | } 64 | } 65 | 66 | ] 67 | -------------------------------------------------------------------------------- /index_search/README.md: -------------------------------------------------------------------------------- 1 | # Elasticsearch index search experiments 2 | 3 | Early tests to build upon HuggingFace datasets to improving indexing/Search capabilities. 4 | 5 | ## Pre-requisites 6 | 7 | Elasticsearch is launched in cluster through docker so go install Docker if not already done: https://docs.docker.com/get-docker/ 8 | 9 | The example is based on a forked version of dataset and some additional dependencies. Use `requirements.txt` to install all the necessary stuff. A conda en 10 | 11 | ## Run 12 | 13 | * Go into the `index_search` folder and start Elasticsearch cluster 14 | 15 | ``` 16 | cd ./index_search 17 | docker compose up 18 | ``` 19 | 20 | * Run the python script 21 | 22 | ``` 23 | python datasets_index_search.py 24 | ``` 25 | 26 | Note that it will start a ray instance which might require some ports to be open for local communication. 
27 | 28 | ## TODO list 29 | 30 | Improve datasets indexing capabilities 31 | - [x] test switch to ngram indexing 32 | - [x] add hash for each rows 33 | - [x] parallel processing using ray and dataset shards 34 | - [x] enable re-connection to existing index in ES 35 | - [x] enable continuing indexing process 36 | - [x] ensure no duplicate with mmh3 hash 37 | - [x] instantiate datasets from elasticsearch query 38 | - [x] clear cache when instantiating with new query 39 | - [ ] validate dataset info are propagated 40 | - [ ] check scalability 41 | - ~~allow export of search results in arrow for datasets or jsonl for export => specialized filter operation?~~ 42 | - [ ] secure elasticsearch cluster: free read, protected write 43 | - [x] allow update on the dataset to be reflected with index update 44 | -------------------------------------------------------------------------------- /index_search/datasets_ES_builder.py: -------------------------------------------------------------------------------- 1 | import simplejson as json 2 | from datasets.packaged_modules.elasticsearch.elasticsearch import ElasticsearchBuilder 3 | 4 | ca_file = "/Users/gdupont/src/github.com/bigscience-workshop/data-tooling/index_search/ca.cert" 5 | with open( 6 | "/Users/gdupont/src/github.com/bigscience-workshop/data-tooling/index_search/credentials.json" 7 | ) as f: 8 | credentials = json.load(f) 9 | 10 | the_host = credentials["connection"]["https"]["hosts"][0]["hostname"] 11 | the_port = credentials["connection"]["https"]["hosts"][0]["port"] 12 | 13 | username = credentials["connection"]["https"]["authentication"]["username"] 14 | psw = credentials["connection"]["https"]["authentication"]["password"] 15 | 16 | index_name = "oscar_unshuffled_deduplicated" 17 | oscar_lang_code = "nn" 18 | 19 | elasticsearch_builder = ElasticsearchBuilder( 20 | host=the_host, 21 | port=the_port, 22 | es_username=username, 23 | es_psw=psw, 24 | ca_file=ca_file, 25 | es_index_name=index_name, 26 | es_index_config=None, 27 | query="mykje arbeid og slit", 28 | ) 29 | 30 | # elasticsearch_builder = ElasticsearchBuilder( 31 | # host="localhost", 32 | # port="9200", 33 | # es_index_name="oscar_unshuffled_deduplicated", 34 | # es_index_config=es_index_config, 35 | # query='"mykje arbeid og slit"' 36 | # ) 37 | 38 | elasticsearch_builder.download_and_prepare() 39 | 40 | oscar_dataset_filtered = elasticsearch_builder.as_dataset() 41 | print(oscar_dataset_filtered.keys()) 42 | 43 | first_split = next(iter(oscar_dataset_filtered)) 44 | 45 | for i in range(0, 5): 46 | print( 47 | f"- [#{oscar_dataset_filtered[first_split]['id'][i]}] {oscar_dataset_filtered[first_split]['text'][i]}" 48 | ) 49 | -------------------------------------------------------------------------------- /index_search/datasets_ES_search.py: -------------------------------------------------------------------------------- 1 | import simplejson as json 2 | from datasets import load_dataset 3 | 4 | ca_file = "./ca.cert" 5 | 6 | with open("./credentials.json") as f: 7 | credentials = json.load(f) 8 | 9 | the_host = credentials["connection"]["https"]["hosts"][0]["hostname"] 10 | the_port = credentials["connection"]["https"]["hosts"][0]["port"] 11 | 12 | username = credentials["connection"]["https"]["authentication"]["username"] 13 | psw = credentials["connection"]["https"]["authentication"]["password"] 14 | 15 | index_name = "oscar_unshuffled_deduplicated" 16 | oscar_lang_code = "nn" 17 | 18 | my_dataset = load_dataset( 19 | "oscar", f"unshuffled_deduplicated_{oscar_lang_code}", 
split="train" 20 | ) 21 | 22 | my_dataset.load_elasticsearch_index( 23 | index_name=index_name, 24 | host=the_host, 25 | port=the_port, 26 | es_username=username, 27 | es_psw=psw, 28 | ca_file=ca_file, 29 | es_index_name=index_name, 30 | es_index_config=None, 31 | ) 32 | 33 | print(my_dataset) 34 | 35 | K = 10 36 | scores, retrieved = my_dataset.get_nearest_examples( 37 | index_name, "mykje arbeid og slit", k=K 38 | ) 39 | 40 | for i in range(0, min(K, len(retrieved))): 41 | print(f"({i + 1})") 42 | print(f'\t@{scores[i]:.2f} - {retrieved["id"][i]} => {retrieved["text"][i]} \n') 43 | -------------------------------------------------------------------------------- /index_search/datasets_remote_ES_IBMcloud.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import ssl 3 | 4 | import simplejson as json 5 | from elasticsearch import Elasticsearch 6 | 7 | with open("./credentials.json") as f: 8 | credentials = json.load(f) 9 | 10 | host = credentials["connection"]["https"]["hosts"][0]["hostname"] 11 | port = credentials["connection"]["https"]["hosts"][0]["port"] 12 | 13 | es_username = credentials["connection"]["https"]["authentication"]["username"] 14 | es_psw = credentials["connection"]["https"]["authentication"]["password"] 15 | 16 | ca_cert = base64.b64decode( 17 | credentials["connection"]["https"]["certificate"]["certificate_base64"] 18 | ) 19 | # context = ssl.create_default_context() 20 | # context.verify_mode = ssl.CERT_REQUIRED 21 | # context.load_verify_locations(cadata=ca_cert) 22 | 23 | context = ssl.create_default_context(cafile="./ca.cert") 24 | 25 | server_url = ( 26 | ("https" if context is not None else "http") + "://" + host + ":" + str(port) 27 | ) 28 | 29 | es = Elasticsearch([server_url], http_auth=(es_username, es_psw), ssl_context=context) 30 | 31 | print(f"ES info {json.dumps(es.info(), indent=4 * ' ')}") 32 | 33 | # index_get_response = es.indices.get(index='oscar_unshuffled_deduplicated') 34 | # print(json.dumps(index_get_response, indent=4 * ' ')) 35 | 36 | delete_response = es.indices.delete(index="oscar_unshuffled_deduplicated") 37 | print(json.dumps(delete_response, indent=4 * " ")) 38 | -------------------------------------------------------------------------------- /index_search/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2.2' 2 | services: 3 | es01: 4 | image: docker.elastic.co/elasticsearch/elasticsearch:7.13.2 5 | container_name: es01 6 | environment: 7 | - node.name=es01 8 | - cluster.name=es-docker-cluster 9 | - discovery.seed_hosts=es02,es03 10 | - cluster.initial_master_nodes=es01,es02,es03 11 | - bootstrap.memory_lock=true 12 | - "ES_JAVA_OPTS=-Xms512m -Xmx512m" 13 | ulimits: 14 | memlock: 15 | soft: -1 16 | hard: -1 17 | volumes: 18 | - data01:/usr/share/elasticsearch/data 19 | ports: 20 | - 9200:9200 21 | networks: 22 | - elastic 23 | 24 | es02: 25 | image: docker.elastic.co/elasticsearch/elasticsearch:7.13.2 26 | container_name: es02 27 | environment: 28 | - node.name=es02 29 | - cluster.name=es-docker-cluster 30 | - discovery.seed_hosts=es01,es03 31 | - cluster.initial_master_nodes=es01,es02,es03 32 | - bootstrap.memory_lock=true 33 | - "ES_JAVA_OPTS=-Xms512m -Xmx512m" 34 | ulimits: 35 | memlock: 36 | soft: -1 37 | hard: -1 38 | volumes: 39 | - data02:/usr/share/elasticsearch/data 40 | networks: 41 | - elastic 42 | 43 | es03: 44 | image: docker.elastic.co/elasticsearch/elasticsearch:7.13.2 45 | container_name: es03 46 | 
environment: 47 | - node.name=es03 48 | - cluster.name=es-docker-cluster 49 | - discovery.seed_hosts=es01,es02 50 | - cluster.initial_master_nodes=es01,es02,es03 51 | - bootstrap.memory_lock=true 52 | - "ES_JAVA_OPTS=-Xms512m -Xmx512m" 53 | ulimits: 54 | memlock: 55 | soft: -1 56 | hard: -1 57 | volumes: 58 | - data03:/usr/share/elasticsearch/data 59 | networks: 60 | - elastic 61 | 62 | kib01: 63 | image: docker.elastic.co/kibana/kibana:7.13.2 64 | container_name: kib01 65 | ports: 66 | - 5601:5601 67 | environment: 68 | ELASTICSEARCH_URL: http://es01:9200 69 | ELASTICSEARCH_HOSTS: '["http://es01:9200","http://es02:9200","http://es03:9200"]' 70 | networks: 71 | - elastic 72 | 73 | volumes: 74 | data01: 75 | driver: local 76 | data02: 77 | driver: local 78 | data03: 79 | driver: local 80 | 81 | networks: 82 | elastic: 83 | driver: bridge 84 | -------------------------------------------------------------------------------- /index_search/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/ggdupont/datasets@bigscience_datatooling#egg=datasets 2 | elasticsearch==7.10.1 3 | iso-639==0.4.5 4 | ray~=1.4.1 5 | simplejson 6 | -------------------------------------------------------------------------------- /kenlm_training/.gitignore: -------------------------------------------------------------------------------- 1 | # Dataset 2 | /data 3 | /test_data/ 4 | /test_data2/ 5 | /output/ 6 | 7 | # Binary files 8 | /bin/ 9 | 10 | # Third party code 11 | /third_party/ 12 | 13 | # Generic to python 14 | __pycache__/ 15 | *.pyc 16 | .mypy_cache/ 17 | 18 | /scratch/ 19 | /notebooks/ 20 | 21 | /build/ 22 | /cc_net.egg-info/ 23 | /config/ 24 | /dist/ 25 | /pip-wheel-metadata/ 26 | 27 | /.DS_Store 28 | -------------------------------------------------------------------------------- /kenlm_training/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Facebook, Inc. and its affiliates. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /kenlm_training/cc_net/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | # 6 | -------------------------------------------------------------------------------- /kenlm_training/cc_net/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | # 6 | 7 | 8 | import func_argparse 9 | 10 | import cc_net.mine 11 | 12 | 13 | def main(): 14 | func_argparse.parse_and_call(cc_net.mine.get_main_parser()) 15 | 16 | 17 | if __name__ == "__main__": 18 | main() 19 | -------------------------------------------------------------------------------- /kenlm_training/cc_net/data/test_stats.json: -------------------------------------------------------------------------------- 1 | { 2 | "2019-09/de_head_0000.json.gz": { 3 | "size": 5264993, 4 | "checksum": "fc12ba3dc982ef06e7e44a916f298e1c16f9a806" 5 | }, 6 | "2019-09/de_middle_0000.json.gz": { 7 | "size": 9195535, 8 | "checksum": "2369ff0296ab1d924c81083f17ce41f22a10ad69" 9 | }, 10 | "2019-09/de_tail_0000.json.gz": { 11 | "size": 33029074, 12 | "checksum": "18865040a7263242d298958f358f7cb5511114d4" 13 | }, 14 | "2019-09/fr_head_0000.json.gz": { 15 | "size": 4076580, 16 | "checksum": "4eef4017bbbe042fc01c45b5fbcf94de49f5138e" 17 | }, 18 | "2019-09/fr_middle_0000.json.gz": { 19 | "size": 8075095, 20 | "checksum": "fd251a5b924c4aa66a63c375ca3a8fae23b3273b" 21 | }, 22 | "2019-09/fr_tail_0000.json.gz": { 23 | "size": 27248949, 24 | "checksum": "4a8aed38abc6b9d04459e8d424bd47426f063638" 25 | }, 26 | "2019-09/it_head_0000.json.gz": { 27 | "size": 1760696, 28 | "checksum": "e5e50e49b4a5147ea82b385babd5c83f74d2a4ed" 29 | }, 30 | "2019-09/it_middle_0000.json.gz": { 31 | "size": 4461832, 32 | "checksum": "7daab7b7acb93d81e50534196ada4e94947b8224" 33 | }, 34 | "2019-09/it_tail_0000.json.gz": { 35 | "size": 14754298, 36 | "checksum": "1adc018519a598ff162261d7e480ea41d3458768" 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /kenlm_training/cc_net/get_hf_dataset.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import func_argparse 4 | from datasets import load_dataset 5 | from tqdm import tqdm 6 | 7 | from cc_net import text_normalizer 8 | 9 | 10 | def dl( 11 | dataset: str, 12 | output_file: str, 13 | name: Optional[str] = None, 14 | data_dir: Optional[str] = None, 15 | data_files: Optional[str] = None, 16 | split: Optional[str] = None, 17 | streaming: bool = True, 18 | accent: bool = False, 19 | case: bool = False, 20 | numbers: bool = True, 21 | punct: int = 1, 22 | max_docs: Optional[int] = None, 23 | seed: int = 0, 24 | buffer_size: int = 10000, 25 | ): 26 | """Download dataset from the Hugging Face hub.""" 27 | dataset = load_dataset( 28 | dataset, 29 | name=name, 30 | data_dir=data_dir, 31 | data_files=data_files, 32 | split=split, 33 | streaming=streaming, 34 | ) 35 | dataset_norm = dataset.map( 36 | lambda x: text_normalizer.normalize( 37 | x["text"], accent=accent, case=case, numbers=numbers, punct=punct 38 | ) 39 | ) 40 | dataset_norm = dataset_norm.shuffle(buffer_size=buffer_size, seed=seed) 41 | count = 0 42 | with open(output_file, "w") as o: 43 | with tqdm(total=max_docs) as pbar: 44 | for doc in dataset_norm: 45 | count += 1 46 | doc = doc.rstrip("\n") 47 | print(doc, file=o) 48 | if max_docs and count == max_docs: 49 | break 50 | pbar.update(1) 51 | 52 | 53 | if __name__ == 
"__main__": 54 | func_argparse.main(dl) 55 | -------------------------------------------------------------------------------- /kenlm_training/cc_net/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/kenlm_training/cc_net/tools/__init__.py -------------------------------------------------------------------------------- /kenlm_training/config/lid_exp.json: -------------------------------------------------------------------------------- 1 | { 2 | "output_dir": "/checkpoint/guw/cc_clean2/", 3 | "dump": "2019-09", 4 | "num_shards": 1600, 5 | "pipeline": [ 6 | "lid_before_dedup", 7 | "dedup", 8 | "lid_after_dedup" 9 | ], 10 | "hash_in_mem": 50, 11 | "execution": "slurm" 12 | } 13 | -------------------------------------------------------------------------------- /kenlm_training/config/mine_segment.json: -------------------------------------------------------------------------------- 1 | { 2 | "dump": "2019-09", 3 | "mined_dir": "mini_by_segment", 4 | "pipeline": [ 5 | "dedup", 6 | "lid", 7 | "keep_lang", 8 | "sp", 9 | "lm", 10 | "pp_bucket", 11 | "minify", 12 | "split_by_segment" 13 | ], 14 | "execution": "slurm" 15 | } 16 | -------------------------------------------------------------------------------- /kenlm_training/config/test_reproduce.json: -------------------------------------------------------------------------------- 1 | { 2 | "hash_in_mem": 2, 3 | "dump": "2019-09", 4 | "num_shards": 4, 5 | "num_segments_per_shard": 1, 6 | "pipeline": [ 7 | "fetch_metadata", 8 | "split_by_lang" 9 | ], 10 | "metadata": "test_data2/mined_by_segment", 11 | "execution": "debug", 12 | "output_dir": "test_data2", 13 | "mined_dir": "reproduce", 14 | "target_size": "32M", 15 | "cache_dir": "test_data/wet_cache" 16 | } 17 | -------------------------------------------------------------------------------- /kenlm_training/config/test_segment.json: -------------------------------------------------------------------------------- 1 | { 2 | "hash_in_mem": 2, 3 | "dump": "2019-09", 4 | "num_shards": 4, 5 | "num_segments_per_shard": 1, 6 | "mine_num_processes": 0, 7 | "lang_whitelist": ["de", "it", "fr"], 8 | "pipeline": [ 9 | "dedup", 10 | "lid", 11 | "keep_lang", 12 | "sp", 13 | "lm", 14 | "pp_bucket", 15 | "minify", 16 | "split_by_segment" 17 | ], 18 | "execution": "debug", 19 | "output_dir": "test_data2", 20 | "mined_dir": "mined_by_segment", 21 | "target_size": "32M", 22 | "cache_dir": "test_data/wet_cache" 23 | } 24 | -------------------------------------------------------------------------------- /kenlm_training/pyproject.toml: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = "tests" 3 | 4 | [tool.black] 5 | line-length = 88 6 | target_version = ["py37"] 7 | 8 | [tool.isort] 9 | multi_line_output = 3 10 | include_trailing_comma = true 11 | force_grid_wrap = 0 12 | use_parentheses = true 13 | line_length = 88 14 | known_third_party = ["func_argparse"] 15 | skip = ["third_party", "data"] 16 | 17 | [mypy] 18 | python_version = 3.7 19 | check_untyped_defs = true 20 | 21 | [mypy-numpy] 22 | ignore_missing_imports = true 23 | [mypy-pytest] 24 | ignore_missing_imports = true 25 | -------------------------------------------------------------------------------- /kenlm_training/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. 
and its affiliates. 2 | # All rights reserved. 3 | # This source code is licensed under the license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | from pathlib import Path 7 | 8 | from setuptools import setup # type: ignore 9 | 10 | setup( 11 | name="cc_net", 12 | version="1.0.0", 13 | packages=["cc_net"], 14 | # metadata to display on PyPI 15 | author="Guillaume Wenzek", 16 | author_email="guw@fb.com", 17 | description="Tools to download and clean Common Crawl", 18 | keywords="common crawl dataset", 19 | url="https://github.com/facebookresearch/cc_net", 20 | license="CC-BY-NC-4.0", 21 | long_description=Path("README.md").read_text(), 22 | long_description_content_type="text/markdown", 23 | project_urls={ 24 | "Bug Tracker": "https://github.com/facebookresearch/cc_net/issues", 25 | "Source Code": "https://github.com/facebookresearch/cc_net", 26 | }, 27 | classifiers=[ 28 | "Development Status :: 4 - Beta", 29 | "Programming Language :: Python :: 3.7", 30 | ], 31 | python_requires=">=3.7", 32 | install_requires=[ 33 | "beautifulsoup4>=4.7.1", 34 | "pandas>=0.23.4", 35 | "requests>=2.22.0", 36 | "fasttext>=0.9.1", 37 | "sentencepiece>=0.1.82", 38 | "kenlm @ git+https://github.com/kpu/kenlm.git@master", 39 | "func_argparse>=1.1.1", 40 | "psutil>=5.6.3", 41 | "sacremoses", 42 | "submitit>=1.0.0", 43 | "typing_extensions", 44 | "datasets==1.16.1", 45 | ], 46 | extras_require={ 47 | "dev": ["mypy==0.790", "pytest", "black==19.3b0", "isort==5.6.4"], 48 | # To use scripts inside cc_net/tools 49 | "tools": ["lxml", "sentence_splitter"], 50 | # Memory-efficient hashset. 51 | # This fork only compiles the kind of dict used by cc_net. 52 | # Full version is at https://github.com/atom-moyer/getpy 53 | "getpy": ["getpy @ git+https://github.com/gwenzek/getpy.git@v0.9.10-subset"], 54 | }, 55 | package_data={"cc_net": ["data/*"]}, 56 | ) 57 | -------------------------------------------------------------------------------- /kenlm_training/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | # 6 | # 7 | -------------------------------------------------------------------------------- /kenlm_training/tests/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | # 6 | 7 | import pytest 8 | 9 | 10 | def _request_is_disabled(self, *args, **kwargs): 11 | raise Exception( 12 | f"Your code tried to call 'request' with: {args}, {kwargs}. Unit test aren't allowed to reach internet." 13 | ) 14 | 15 | 16 | @pytest.fixture(autouse=True) 17 | def no_requests(monkeypatch): 18 | """Remove requests.sessions.Session.request for all tests.""" 19 | monkeypatch.setattr("requests.sessions.Session.request", _request_is_disabled) 20 | -------------------------------------------------------------------------------- /kenlm_training/tests/test_normalizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | # 6 | 7 | import cc_net.text_normalizer as txt 8 | 9 | 10 | def test_unicode_punct(): 11 | weird = ",。、„”“«»1」「《》´∶:?!();–—.~’…━〈〉【】%" 12 | replaced = ',.,""""""""""\'::?!();- - . ~\'...-<>[]%' 13 | assert txt.replace_unicode_punct(weird) == replaced 14 | 15 | assert txt.remove_unicode_punct(weird) == "" 16 | 17 | 18 | def test_numbers(): 19 | weird = "023456789 | 0123456789" 20 | normalized = "000000000 | 0000000000" 21 | assert txt.normalize(weird, numbers=True) == normalized 22 | assert txt.normalize(weird, numbers=False) == weird 23 | 24 | 25 | def test_normalize_for_dedup(): 26 | weird = "023´∶:\x10 | ;012 hèllo" 27 | normalized = "000 | ;000 hèllo" 28 | assert normalized == txt.slow_normalize_for_dedup(weird) 29 | assert normalized == txt.normalize_for_dedup(weird) 30 | -------------------------------------------------------------------------------- /kenlm_training/tests/test_parse_wet_file.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | # 6 | 7 | from pathlib import Path 8 | 9 | from cc_net import process_wet_file 10 | 11 | 12 | def test_parsing(): 13 | sample = Path(__file__).parent / "data" / "sample.warc.txt" 14 | with open(sample) as f: 15 | documents = list(process_wet_file.parse_warc_file(f)) 16 | 17 | expected_urls = [ 18 | "http://sample_english.com", 19 | "http://sample_chinese.zh", 20 | "http://sample_russian.ru", 21 | ] 22 | assert expected_urls == [d["url"] for d in documents] 23 | expected_domains = ["sample_english.com", "sample_chinese.zh", "sample_russian.ru"] 24 | assert expected_domains == [d["source_domain"] for d in documents] 25 | 26 | expected_date = [ 27 | "2019-03-18T00:00:00Z", 28 | "2019-03-18T00:00:01Z", 29 | "2019-03-18T00:00:02Z", 30 | ] 31 | assert expected_date == [d["date_download"] for d in documents] 32 | 33 | expected_title = [ 34 | "Famous Mark Twain Quotes", 35 | "馬克·吐溫名言", 36 | "Цитаты знаменитого Марка Твена", 37 | ] 38 | assert expected_title == [d["title"] for d in documents] 39 | 40 | expected_quotes = """Don't part with your illusions. When they are gone you may still exist, but you have ceased to live. 41 | Education: that which reveals to the wise, and conceals from the stupid, the vast limits of their knowledge. 42 | 43 | Facts are stubborn things, but statistics are more pliable. 44 | Fiction is obliged to stick to possibilities. Truth isn't. 45 | """ 46 | 47 | assert expected_quotes == documents[0]["raw_content"] 48 | -------------------------------------------------------------------------------- /kenlm_training/tests/test_regroup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | # 6 | 7 | import time 8 | 9 | from cc_net import jsonql, regroup 10 | 11 | 12 | def check_regroup(tmp_path, regroup_fn, check_blocks_boundaries=False): 13 | n_shards = 4 14 | n_docs = 20 15 | shards = [ 16 | [dict(id=i, shard=s, raw_content="hello world") for i in range(n_docs)] 17 | for s in range(n_shards) 18 | ] 19 | shards_files = [tmp_path / f"{s:04d}.json.gz" for s in range(n_shards)] 20 | for shard, shard_file in zip(shards, shards_files): 21 | jsonql.run_pipes(inputs=shard, output=shard_file) 22 | regroup_file = tmp_path / "regroup.json.gz" 23 | start = time.time() 24 | regroup_fn(shards_files, regroup_file) 25 | duration = time.time() - start 26 | print(f"{regroup_fn.__module__}.{regroup_fn.__name__} took {duration}s") 27 | 28 | regrouped = list(jsonql.read_jsons(regroup_file)) 29 | assert [doc for shard in shards for doc in shard] == regrouped 30 | 31 | readers = jsonql.get_block_readers(regroup_file, n_shards) 32 | if not check_blocks_boundaries: 33 | assert [doc for shard in shards for doc in shard] == [ 34 | doc for reader in readers for doc in jsonql.read_jsons(reader) 35 | ] 36 | return 37 | 38 | for shard, reader in zip(shards, readers): 39 | block = [doc for doc in jsonql.read_jsons(reader)] 40 | assert shard == block 41 | 42 | 43 | def test_regroup(tmp_path): 44 | # With regroup boundaries will be every 256Mb. 45 | check_regroup(tmp_path, regroup.reshard, check_blocks_boundaries=False) 46 | 47 | 48 | def test_fast_regroup(tmp_path): 49 | # With fast regroup boundaries should match the shards. 50 | check_regroup(tmp_path, regroup.fast_reshard, check_blocks_boundaries=True) 51 | -------------------------------------------------------------------------------- /perplexity_lenses/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Perplexity Lenses 3 | emoji: 🌸 4 | colorFrom: pink 5 | colorTo: blue 6 | sdk: streamlit 7 | app_file: app.py 8 | pinned: false 9 | --- 10 | 11 | # Installation: 12 | Requires Python >= 3.7 and < 3.10 13 | ``` 14 | pip install . 15 | ``` 16 | Or with [poetry](https://python-poetry.org/) 17 | ``` 18 | poetry install 19 | ``` 20 | 21 | # Web App: 22 | The app is hosted [here](https://huggingface.co/spaces/edugp/perplexity-lenses). To run it locally: 23 | ``` 24 | python -m streamlit run app.py 25 | ``` 26 | 27 | # CLI: 28 | The CLI with no arguments defaults to running mc4 in Spanish. 
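In other words, a bare `python cli.py` is expected to behave roughly like spelling out the mC4/Spanish arguments explicitly. The exact default values live in `cli.py`, so the command below is only an illustrative sketch rather than a verbatim equivalent:

```
python cli.py --dataset mc4 --dataset-config es --language es
```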
29 | For full usage: 30 | ``` 31 | python cli.py --help 32 | ``` 33 | Example: Running on 1000 sentences extracted from Spanish OSCAR docs specifying all arguments: 34 | ``` 35 | python cli.py \ 36 | --dataset oscar \ 37 | --dataset-config unshuffled_deduplicated_es \ 38 | --dataset-split train \ 39 | --text-column text \ 40 | --language es \ 41 | --doc-type sentence \ 42 | --sample 1000 \ 43 | --dimensionality-reduction umap \ 44 | --model-name distiluse-base-multilingual-cased-v1 \ 45 | --output-file perplexity.html 46 | ``` 47 | # Tests: 48 | ``` 49 | python -m unittest discover -s ./tests/ -p "test_*.py" 50 | ``` 51 | -------------------------------------------------------------------------------- /perplexity_lenses/perplexity_lenses/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.1.0" 2 | REGISTRY_DATASET = "mhtoin/register_oscar" 3 | -------------------------------------------------------------------------------- /perplexity_lenses/perplexity_lenses/visualization.py: -------------------------------------------------------------------------------- 1 | import matplotlib.figure 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | 6 | def draw_histogram( 7 | values: np.ndarray, 8 | cutoff_x_axis: float = 2000.0, 9 | title: str = "Perplexity histogram", 10 | xlabel: str = "Perplexity", 11 | ) -> matplotlib.figure.Figure: 12 | hist_values = values[values < cutoff_x_axis] 13 | fig, ax = plt.subplots(figsize=(12, 9)) 14 | ax.hist(hist_values, bins=50) 15 | ax.set_title(title) 16 | ax.set_xlabel(xlabel) 17 | ax.set_ylabel("Counts") 18 | return fig 19 | -------------------------------------------------------------------------------- /perplexity_lenses/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "perplexity-lenses" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["edugp "] 6 | 7 | [tool.poetry.dependencies] 8 | python = ">=3.7,<3.10" 9 | huggingface-hub = "0.0.19" 10 | streamlit = "1.1.0" 11 | transformers = "4.11.3" 12 | watchdog = "2.1.3" 13 | sentence-transformers = "2.0.0" 14 | bokeh = "2.2.2" 15 | numpy = "1.20.0" 16 | numba = "^0.54.1" 17 | umap-learn = "^0.5.2" 18 | datasets = "1.14.0" 19 | black = "^21.10b0" 20 | flake8 = "^4.0.1" 21 | scikit-learn = "0.24.2" 22 | kenlm = {url = "https://github.com/kpu/kenlm/archive/master.zip"} 23 | embedding-lenses = "0.9.0" 24 | typer = "^0.4.0" 25 | 26 | [tool.poetry.dev-dependencies] 27 | pytest = "^5.2" 28 | 29 | [tool.poetry.scripts] 30 | cli = "cli:app" 31 | 32 | [tool.black] 33 | target-version = ["py38"] 34 | 35 | [tool.isort] 36 | profile = "black" 37 | line_length = 160 38 | multi_line_output = 3 39 | include_trailing_comma = true 40 | 41 | [build-system] 42 | requires = ["poetry-core>=1.0.0"] 43 | build-backend = "poetry.core.masonry.api" 44 | -------------------------------------------------------------------------------- /perplexity_lenses/requirements.txt: -------------------------------------------------------------------------------- 1 | bokeh==2.2.2 2 | embedding-lenses==0.9.0 3 | https://github.com/kpu/kenlm/archive/master.zip 4 | huggingface-hub==0.0.19 5 | numpy==1.20.0 6 | sentence-transformers==2.0.0 7 | streamlit==1.1.0 8 | transformers==4.11.3 9 | typer==0.4.0 10 | umap-learn==0.5.2 11 | watchdog==2.1.3 12 | -------------------------------------------------------------------------------- /perplexity_lenses/tests/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/perplexity_lenses/tests/__init__.py -------------------------------------------------------------------------------- /perplexity_lenses/tests/test_data.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pandas as pd 4 | 5 | from perplexity_lenses.data import documents_df_to_sentences_df 6 | 7 | 8 | class TestData(unittest.TestCase): 9 | def test_documents_df_to_sentences_df(self): 10 | input_df = pd.DataFrame({"text": ["foo\nbar"]}) 11 | expected_output_df = pd.DataFrame({"text": ["foo", "bar"]}) 12 | output_df = documents_df_to_sentences_df(input_df, "text", 100) 13 | pd.testing.assert_frame_equal( 14 | output_df, expected_output_df, check_like=True, check_exact=True 15 | ) 16 | -------------------------------------------------------------------------------- /pii-manager/CHANGES.md: -------------------------------------------------------------------------------- 1 | v. 0.5.0 2 | * new task list parsing code, adding a "full" format based on dicts, in 3 | addition to the previous "simplified" format based on tuples 4 | * refactored to allow more than one task for a given PII and country 5 | * added the capability to add task descriptors programmatically 6 | * added reading task descriptors from a JSON file 7 | * context validation spec, for all three task implementation types 8 | * TASK_ANY split into LANG_ANY & COUNTRY_ANY 9 | * PII detectors for international phone numbers, for en-any & es-any 10 | * PII detector for IP addresses, language independent 11 | * PII detectors for GOV_ID 12 | - lang pt, countries PT & BR 13 | - lang es, country MX 14 | 15 | v. 0.4.0 16 | * PII GOV_ID task for es-ES and en-AU 17 | * PII EMAIL_ADDRESS task 18 | * PyPi Makefile targets; fixed setup.py 19 | 20 | v. 0.3.0 21 | * new processing mode: `full` 22 | * PII detectors for zh-CN 23 | * added `regex` as dependency 24 | * `regex` used for regular expression tasks instead of `re` 25 | 26 | v. 0.2.0 27 | * Added PII tasks: 28 | - en: GOV_ID for US, CA, IN 29 | - fr: GOV_ID for CA 30 | * fix paths for languages/countries that are reserved Python words (is, in) 31 | * added country information to PiiEntity 32 | * added an _asdict() function for PiiEntities 33 | * added PII country to task_info 34 | * miscellaneous fixes 35 | -------------------------------------------------------------------------------- /pii-manager/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | -------------------------------------------------------------------------------- /pii-manager/README.md: -------------------------------------------------------------------------------- 1 | # Pii Manager 2 | 3 | This repository builds a Python package that performs PII processing for text 4 | data i.e. replacement/tagging/extraction of PII (Personally Identifiable 5 | Information aka [Personal Data]) items existing in the text. 6 | 7 | The PII Tasks in the package are structured by language & country, since many 8 | of the PII elements are language- and/or -country dependent. 9 | 10 | ## Requirements 11 | 12 | The package needs at least Python 3.8, and uses the [python-stdnum] package to 13 | validate identifiers. 
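For illustration, the kind of check python-stdnum provides looks like the sketch below. This is a standalone example rather than part of the package API, and it reuses the same `stdnum.bitcoin` helper that the bitcoin-address task relies on:

```python
from stdnum import bitcoin

# python-stdnum validates the format/checksum of identifiers; the PII tasks
# build their detection and validation logic on top of checks like this one.
print(bitcoin.is_valid("1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa"))  # True: well-formed address
print(bitcoin.is_valid("not-an-address"))                      # False
```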
14 | 15 | ## Usage 16 | 17 | The package can be used: 18 | * As an API, in two flavors: function-based API and object-based API 19 | * As a command-line tool 20 | 21 | For details, see the [usage document]. 22 | 23 | 24 | ## Building 25 | 26 | The provided [Makefile] can be used to process the package: 27 | * `make pkg` will build the Python package, creating a file that can be 28 | installed with `pip` 29 | * `make unit` will launch all unit tests (using [pytest], so pytest must be 30 | available) 31 | * `make install` will install the package in a Python virtualenv. The 32 | virtualenv will be chosen, in this order, as: 33 | - the one defined in the `VENV` environment variable, if it is defined 34 | - if there is a virtualenv activated in the shell, it will be used 35 | - otherwise, a default is chosen as `/opt/venv/bigscience` (it will be 36 | created if it does not exist) 37 | 38 | 39 | ## Contributing 40 | 41 | To add a new PII processing task, please see the [contributing instructions]. 42 | 43 | 44 | [python-stdnum]: https://github.com/arthurdejong/python-stdnum 45 | [Makefile]: Makefile 46 | [pytest]: https://docs.pytest.org 47 | [contributing instructions]: doc/contributing.md 48 | [usage document]: doc/usage.md 49 | [Personal Data]: https://en.wikipedia.org/wiki/Personal_data 50 | -------------------------------------------------------------------------------- /pii-manager/doc/external.md: -------------------------------------------------------------------------------- 1 | # Adding external task processors to a processing object 2 | 3 | In addition to the task processors contained inside the [lang] subfolders in 4 | the package, it is also possible to add _external_ task processors defined 5 | outside the package, as long as they comply with the [task specification]. 6 | This can be done for both the object-based API and the file-based API. 7 | 8 | 9 | ## Object-based API 10 | 11 | An instantiated `PiiManager` object contains the `add_tasks` method. This 12 | method will accept a list of [task descriptors] with the same syntax as the 13 | internal `PII_TASKS` descriptors, and will add the tasks defined in them to 14 | the existing ones in the object. 15 | 16 | 17 | ## File-based API 18 | 19 | The file-based `process_file` function allows a `taskfile` argument. This 20 | argument will contain the name of a JSON file that contains an array of task 21 | descriptors. Each task descriptor in the array is a JSON object following the 22 | specification for [task descriptors], with these differences: 23 | 24 | * The `pii` field is not a `PiiEnum` object, but a string with the _name_ of 25 | a `PiiEnum` object. It will be converted to the object itself. 26 | * The `task` field contains: 27 | - for `regex` types, the string with the regular expression pattern to be 28 | compiled (beware of escaping all backslashes in the string) 29 | - for `callable` and `PiiTask` types, a string with the **fully 30 | qualified** name of the function to be used or class to be instantiated. 31 | As long as that name can be located in the running Python space (i.e. 32 | it is in the load path), it will be imported and used (see the sketch below).
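A minimal task file might look like the following sketch. The field names mirror the descriptor dictionaries used by the built-in tasks (for instance the IP-address task); the pattern and naming here are purely illustrative, and the authoritative field list is given in the [task descriptors] specification:

```json
[
  {
    "pii": "IP_ADDRESS",
    "type": "regex",
    "task": "\\b(?:\\d{1,3}\\.){3}\\d{1,3}\\b",
    "name": "ip address (external)",
    "doc": "illustrative example: match IPv4-looking strings"
  }
]
```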
33 | 34 | 35 | [lang]: ../src/pii_manager/lang 36 | [task specification]: tasks.md 37 | [task descriptors]: contributing.md#task-descriptor 38 | -------------------------------------------------------------------------------- /pii-manager/requirements.txt: -------------------------------------------------------------------------------- 1 | python-stdnum >=1.17,<2.0 2 | regex >= 2021.11.10 3 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/__init__.py: -------------------------------------------------------------------------------- 1 | VERSION = "0.5.0" 2 | 3 | from .piienum import PiiEnum 4 | from .piientity import PiiEntity 5 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/api/__init__.py: -------------------------------------------------------------------------------- 1 | from .manager import PiiManager 2 | from .file import process_file 3 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/app/__init__.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/helper/__init__.py: -------------------------------------------------------------------------------- 1 | from .taskdict import get_taskdict, country_list 2 | from .base import BasePiiTask 3 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/helper/exception.py: -------------------------------------------------------------------------------- 1 | class PiiManagerException(Exception): 2 | def __init__(self, msg, *args): 3 | super().__init__(msg.format(*args)) 4 | 5 | 6 | class InvArgException(PiiManagerException): 7 | pass 8 | 9 | 10 | class PiiUnimplemented(PiiManagerException): 11 | pass 12 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/helper/json.py: -------------------------------------------------------------------------------- 1 | """ 2 | Provide a custom JSON encoder that can serialize additional objects, 3 | in particular PiiEntity objects 4 | """ 5 | 6 | 7 | from collections.abc import Iterator 8 | import datetime 9 | import json 10 | 11 | 12 | def keygetter_set(v): 13 | return str(v).lower() 14 | 15 | 16 | class CustomJSONEncoder(json.JSONEncoder): 17 | """ 18 | A custom JSON encoder that can serialize additional objects: 19 | - datetime objects (into ISO 8601 strings) 20 | - sets (as sorted lists) 21 | - iterators (as lists) 22 | - any object having a to_json() method that produces a string or 23 | a serializable object 24 | 25 | Non-serializable objects are converted to plain strings. 
26 | """ 27 | 28 | def default(self, obj): 29 | """ 30 | Serialize some special types 31 | """ 32 | if hasattr(obj, "to_json"): 33 | return obj.to_json() 34 | elif isinstance(obj, datetime.datetime): 35 | t = obj.strftime("%Y-%m-%dT%H:%M:%S.%f%z") 36 | if obj.tzinfo is not None: 37 | t = t[:-2] + ":" + t[-2:] 38 | return t 39 | elif isinstance(obj, set): 40 | return sorted(obj, key=keygetter_set) 41 | elif isinstance(obj, Iterator): 42 | return list(obj) 43 | 44 | try: 45 | return super().default(obj) 46 | except TypeError: 47 | return str(obj) 48 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/helper/normalizer.py: -------------------------------------------------------------------------------- 1 | def normalize( 2 | text: str, lang: str, whitespace: bool = True, lowercase: bool = False 3 | ) -> str: 4 | """ 5 | Perform some normalization steps on a text string 6 | """ 7 | if whitespace: 8 | text = " ".join(text.split()) 9 | 10 | if lowercase: 11 | text = text.lower() 12 | 13 | return text 14 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/helper/types.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | 3 | TYPE_STR_LIST = Union[str, List[str]] 4 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/__init__.py: -------------------------------------------------------------------------------- 1 | # Folder for language-independent tasks 2 | LANG_ANY = "any" 3 | 4 | # Country-independent tasks 5 | COUNTRY_ANY = "any" 6 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/any/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/any/__init__.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/any/bitcoin_address.py: -------------------------------------------------------------------------------- 1 | """ 2 | Find valid bitcoin addresses 3 | 1. Obtain candidates, by using a generic regex expression 4 | 2. Validate candidates by passing them to `bitcoin.is_valid()` from 5 | python-stdnum, which checks that the candidate is a well-formed 6 | P2PKH, P2SH or Bech32 address 7 | """ 8 | 9 | import re 10 | 11 | from typing import Iterable 12 | 13 | from stdnum import bitcoin 14 | 15 | from pii_manager import PiiEnum 16 | 17 | # ---------------------------------------------------------------------------- 18 | 19 | # regex for the three types of bitcoin addresses 20 | _BITCOIN_PATTERN = ( 21 | r"( [13] [" 22 | + bitcoin._base58_alphabet 23 | + "]{25,34}" 24 | + "| bc1 [" 25 | + bitcoin._bech32_alphabet 26 | + "]{8,87})" 27 | ) 28 | 29 | _REGEX_BITCOIN = re.compile(_BITCOIN_PATTERN, flags=re.X) 30 | 31 | 32 | def bitcoin_address(text: str) -> Iterable[str]: 33 | """ 34 | Bitcoin addresses (P2PKH, P2SH and Bech32), recognize & validate 35 | """ 36 | # Find and validate candidates 37 | for ba in _REGEX_BITCOIN.findall(text): 38 | if bitcoin.is_valid(ba): 39 | yield ba 40 | 41 | 42 | # --------------------------------------------------------------------- 43 | 44 | PII_TASKS = [(PiiEnum.BITCOIN_ADDRESS, bitcoin_address)] 45 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/any/email.py: -------------------------------------------------------------------------------- 1 | """ 2 | Detection of email addresses 3 | """ 4 | 5 | from pii_manager import PiiEnum 6 | 7 | 8 | _EMAIL_PATTERN = r"[\w\.=-]+ @ [\w\.-]+ \. [\w]{2,3}" 9 | 10 | 11 | PII_TASKS = [(PiiEnum.EMAIL_ADDRESS, _EMAIL_PATTERN, "Email address")] 12 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/any/ip_address.py: -------------------------------------------------------------------------------- 1 | """ 2 | Detection of IP addresses 3 | """ 4 | 5 | from pii_manager import PiiEnum 6 | 7 | 8 | _IP_PATTERN = r""" 9 | \b 10 | (?: (?: 25[0-5] | 2[0-4][0-9] | [01]?[0-9][0-9]? ) \. ){3} 11 | (?: 25[0-5] | 2[0-4][0-9] | [01]?[0-9][0-9]?) 12 | \b 13 | """ 14 | 15 | 16 | PII_TASKS = [ 17 | { 18 | "pii": PiiEnum.IP_ADDRESS, 19 | "type": "regex", 20 | "task": _IP_PATTERN, 21 | "name": "ip address", 22 | "doc": "match IP addresses, with context", 23 | "context": {"value": "ip", "type": "word", "width": 16}, 24 | } 25 | ] 26 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/en/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/en/__init__.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/en/any/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/en/any/__init__.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/en/any/international_phone_number.py: -------------------------------------------------------------------------------- 1 | """ 2 | Detection of phone numbers written with international notation (i.e.
with 3 | prefix and country code) 4 | """ 5 | 6 | 7 | from pii_manager import PiiEnum 8 | 9 | PATTERN_INT_PHONE = r""" 10 | (?:\+ | 00) 11 | (?: 9[976]\d | 8[987530]\d | 6[987]\d | 5[90]\d | 42\d | 12 | 3[875]\d | 2[98654321]\d | 9[8543210] | 8[6421] | 13 | 6[6543210] | 5[87654321] | 4[987654310] | 3[9643210] | 14 | 2[70] | 7 | 1) 15 | [-\x20\.]? 16 | (?: \d{2,3} [-\x20]? ){3,4} 17 | """ 18 | 19 | PII_TASKS = [ 20 | { 21 | "pii": PiiEnum.PHONE_NUMBER, 22 | "type": "regex", 23 | "task": PATTERN_INT_PHONE, 24 | "name": "international phone number", 25 | "doc": "detect phone numbers that use international notation. Uses context", 26 | "context": {"value": ["ph", "phone", "fax"], "width": [16, 0], "type": "word"}, 27 | } 28 | ] 29 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/en/au/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/en/au/__init__.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/en/au/abn.py: -------------------------------------------------------------------------------- 1 | """ 2 | Detection and validation of Australian business number (ABN). 3 | 4 | """ 5 | import re 6 | 7 | from stdnum.au import abn 8 | 9 | from typing import Iterable 10 | 11 | from pii_manager import PiiEnum 12 | 13 | 14 | _ABN_PATTERN = r"\b (?: \d{2} \s \d{3} \s \d{3} \s \d{3} | \d{11} ) \b" 15 | _ABN_REGEX = re.compile(_ABN_PATTERN, flags=re.X) 16 | 17 | 18 | def australian_business_number(doc: str) -> Iterable[str]: 19 | """ 20 | Australian Business Number (detect and validate) 21 | """ 22 | for candidate in _ABN_REGEX.findall(doc): 23 | if abn.is_valid(candidate): 24 | yield candidate 25 | 26 | 27 | PII_TASKS = [(PiiEnum.GOV_ID, australian_business_number)] 28 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/en/au/tfn.py: -------------------------------------------------------------------------------- 1 | """ 2 | Detection and validation of Australian Tax File Number (TFN). 
3 | 4 | """ 5 | import re 6 | 7 | from stdnum.au import tfn 8 | 9 | from typing import Iterable 10 | 11 | from pii_manager import PiiEnum 12 | 13 | 14 | _TFN_PATTERN = r"\b (?: \d{3} \s \d{3} \s \d{3} | \d{8,9} ) \b" 15 | _TFN_REGEX = re.compile(_TFN_PATTERN, flags=re.X) 16 | 17 | 18 | def tax_file_number(doc: str) -> Iterable[str]: 19 | """ 20 | Australian Tax File Number (detect and validate) 21 | """ 22 | for candidate in _TFN_REGEX.findall(doc): 23 | if tfn.is_valid(candidate): 24 | yield candidate 25 | 26 | 27 | PII_TASKS = [(PiiEnum.GOV_ID, tax_file_number)] 28 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/en/ca/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/en/ca/__init__.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/en/ca/social_insurance_number.py: -------------------------------------------------------------------------------- 1 | """ 2 | Detection and validation of Canadian Social Insurance Number 3 | 4 | Since it contains a check digit, it can be validated. 5 | """ 6 | 7 | import re 8 | 9 | from stdnum.ca import sin 10 | 11 | from typing import Iterable 12 | 13 | from pii_manager import PiiEnum 14 | 15 | 16 | _SIN_REGEX = re.compile(r"\d{3}[-\ ]\d{3}[-\ ]\d{3}", flags=re.X) 17 | 18 | 19 | def social_insurance_number(doc: str) -> Iterable[str]: 20 | """ 21 | Canadian Social Insurance Number (detect and validate) 22 | """ 23 | for candidate in _SIN_REGEX.findall(doc): 24 | if sin.is_valid(candidate): 25 | yield candidate 26 | 27 | 28 | PII_TASKS = [(PiiEnum.GOV_ID, social_insurance_number)] 29 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/en/in_/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/en/in_/__init__.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/en/in_/aadhaar.py: -------------------------------------------------------------------------------- 1 | """ 2 | Detection and validation of Indian Aadhaar identity number 3 | 4 | Since it contains a check digit, it can be validated. 
5 | """ 6 | 7 | import re 8 | 9 | from stdnum.in_ import aadhaar 10 | 11 | from typing import Iterable 12 | 13 | from pii_manager import PiiEnum 14 | 15 | 16 | _AADHAAR_REGEX = re.compile(r"[2-9]\d{3}\ ?\d{4}\ ?\d{4}", flags=re.X) 17 | 18 | 19 | def aadhaar_number(doc: str) -> Iterable[str]: 20 | """ 21 | Aadhaar identity number from India (detect and validate) 22 | """ 23 | for candidate in _AADHAAR_REGEX.findall(doc): 24 | if aadhaar.is_valid(candidate): 25 | yield candidate 26 | 27 | 28 | PII_TASKS = [(PiiEnum.GOV_ID, aadhaar_number)] 29 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/en/us/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/en/us/__init__.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/en/us/social_security_number.py: -------------------------------------------------------------------------------- 1 | """ 2 | Detection of U.S. Social Security Number. 3 | 4 | We just match on the number, it cannot be 5 | validated using only the number since it does not carry a checksum 6 | """ 7 | 8 | from pii_manager import PiiEnum 9 | 10 | 11 | _SSN_PATTERN = r"(?!000|666|333)0*(?:[0-6][0-9][0-9]|[0-7][0-6][0-9]|[0-7][0-7][0-2])[-\ ](?!00)[0-9]{2}[-\ ](?!0000)[0-9]{4}" 12 | 13 | 14 | PII_TASKS = [ 15 | (PiiEnum.GOV_ID, _SSN_PATTERN, "U.S. Social Security Number (detect only)") 16 | ] 17 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/es/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/es/__init__.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/es/any/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/es/any/__init__.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/es/any/international_phone_number.py: -------------------------------------------------------------------------------- 1 | """ 2 | Detection of phone numbers written with international notation (i.e. with 3 | prefix and country code), for ES 4 | """ 5 | 6 | 7 | from pii_manager import PiiEnum 8 | 9 | # The pattern for the regex is the same as for English 10 | from ...en.any.international_phone_number import PATTERN_INT_PHONE 11 | 12 | 13 | PII_TASKS = [ 14 | { 15 | "pii": PiiEnum.PHONE_NUMBER, 16 | "type": "regex", 17 | "task": PATTERN_INT_PHONE, 18 | "name": "international phone number", 19 | "doc": "detect phone numbers that use international notation. 
Uses language context", 20 | "context": { 21 | "value": ["tf", "teléfono", "telefono"], 22 | "width": [16, 0], 23 | "type": "word", 24 | }, 25 | } 26 | ] 27 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/es/es/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/es/es/__init__.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/es/es/bank_account.py: -------------------------------------------------------------------------------- 1 | """ 2 | Spanish bank account numbers (CCC - código cuenta cliente) 3 | 4 | Note: **NOT** IBAN numbers, those are country (& language) independent 5 | """ 6 | 7 | import re 8 | 9 | from typing import Iterable 10 | 11 | from stdnum.es import ccc 12 | 13 | from pii_manager import PiiEnum 14 | 15 | # ---------------------------------------------------------------------------- 16 | 17 | # regex for a Código Cuenta Cliente, with optional spaces separating the pieces 18 | _CCC_PATTERN = r"\d{4}\s?\d{4}\s?\d{2}\s?\d{10}" 19 | 20 | # compiled regex 21 | _REGEX_CCC = None 22 | 23 | 24 | def spanish_bank_ccc(text: str) -> Iterable[str]: 25 | """ 26 | Spanish Bank Accounts (código cuenta cliente, 10-digit code, pre-IBAN), recognize & validate 27 | """ 28 | # Compile regex if needed 29 | global _REGEX_CCC 30 | if _REGEX_CCC is None: 31 | _REGEX_CCC = re.compile(_CCC_PATTERN, flags=re.X) 32 | # Find all CCCs 33 | for item in _REGEX_CCC.findall(text): 34 | if ccc.is_valid(item): 35 | yield item 36 | 37 | 38 | # --------------------------------------------------------------------- 39 | 40 | PII_TASKS = [(PiiEnum.BANK_ACCOUNT, spanish_bank_ccc)] 41 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/es/es/govid.py: -------------------------------------------------------------------------------- 1 | """ 2 | Spanish Goverment-issued IDs: 3 | - DNI (Documento Nacional de Identidad) 4 | - NIE (Número de Identificación de Extranjero) 5 | """ 6 | 7 | import re 8 | 9 | from typing import Iterable 10 | 11 | from stdnum.es import dni, nie 12 | 13 | from pii_manager import PiiEnum, PiiEntity 14 | from pii_manager.helper import BasePiiTask 15 | 16 | # regex for DNI & NIE 17 | _DNI_PATTERN = r"\d{6,8} -? [A-KJ-NP-TV-Z]" 18 | _NIE_PATTERN = r"[X-Z] \d{7} -? 
[A-KJ-NP-TV-Z]" 19 | 20 | 21 | class SpanishDniNie(BasePiiTask): 22 | """ 23 | Spanish Government-issued DNI & NIE numbers, recognize & validate 24 | """ 25 | 26 | pii_name = "Spanish DNI and NIE numbers" 27 | 28 | def __init__(self, **kwargs): 29 | super().__init__(**kwargs) 30 | # Compile the regexes 31 | self.dni = re.compile(_DNI_PATTERN, flags=re.X) 32 | self.nie = re.compile(_NIE_PATTERN, flags=re.X) 33 | 34 | def find(self, doc: str) -> Iterable[PiiEntity]: 35 | # DNI 36 | for item in self.dni.finditer(doc): 37 | item_value = item.group() 38 | if dni.is_valid(item_value): 39 | yield PiiEntity( 40 | PiiEnum.GOV_ID, 41 | item.start(), 42 | item_value, 43 | country=self.country, 44 | name="Spanish DNI", 45 | ) 46 | # NIE 47 | for item in self.nie.finditer(doc): 48 | item_value = item.group() 49 | if nie.is_valid(item_value): 50 | yield PiiEntity( 51 | PiiEnum.GOV_ID, 52 | item.start(), 53 | item_value, 54 | country=self.country, 55 | name="Spanish NIE", 56 | ) 57 | 58 | 59 | # Task descriptor 60 | PII_TASKS = [(PiiEnum.GOV_ID, SpanishDniNie)] 61 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/es/mx/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/es/mx/__init__.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/es/mx/curp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Detection and validation of Clave Única de Registro de Población for Mexico 3 | 4 | It contains two check digits, so it can be validated. 
5 | """ 6 | 7 | import re 8 | 9 | from stdnum.mx import curp as stdnum_curp 10 | 11 | from typing import Iterable 12 | 13 | from pii_manager import PiiEnum 14 | 15 | 16 | _CURP_PATTERN = r"[A-Z] [AEIOU] [A-Z]{2} \d{6} [HM] [A-Z]{5} [0-9A-Z] \d" 17 | _CURP_REGEX = re.compile(_CURP_PATTERN, flags=re.X) 18 | 19 | 20 | def curp(doc: str) -> Iterable[str]: 21 | """ 22 | Mexican Clave Única de Registro de Población (detect and validate) 23 | """ 24 | for candidate in _CURP_REGEX.findall(doc): 25 | if stdnum_curp.is_valid(candidate): 26 | yield candidate 27 | 28 | 29 | PII_TASKS = [(PiiEnum.GOV_ID, curp)] 30 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/fr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/fr/__init__.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/fr/ca/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/fr/ca/__init__.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/fr/ca/social_insurance_number.py: -------------------------------------------------------------------------------- 1 | """ 2 | Reuse the SIN code implemented for en 3 | """ 4 | from pii_manager.lang.en.ca.social_insurance_number import PII_TASKS 5 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/pt/__init__.py: -------------------------------------------------------------------------------- 1 | # Folder for language-independent tasks 2 | LANG_ANY = "any" 3 | 4 | # Country-independent tasks 5 | COUNTRY_ANY = "any" 6 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/pt/br/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/pt/br/__init__.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/pt/br/cpf.py: -------------------------------------------------------------------------------- 1 | """ 2 | Detection and validation of the identifier for Brazilian Cadastro de Pessoa 3 | Física 4 | 5 | It contains two check digits, so it can be validated. 6 | """ 7 | 8 | import re 9 | 10 | from stdnum.br import cpf 11 | 12 | from typing import Iterable 13 | 14 | from pii_manager import PiiEnum 15 | 16 | 17 | _CPF_REGEX = re.compile(r"\d{3} \. \d{3} \. 
\d{3} - \d{2}", flags=re.X) 18 | 19 | 20 | def cadastro_pessoa_fisica(doc: str) -> Iterable[str]: 21 | """ 22 | Brazilian número de inscrição no Cadastro de Pessoas Físicas (detect and validate) 23 | """ 24 | for candidate in _CPF_REGEX.findall(doc): 25 | if cpf.is_valid(candidate): 26 | yield candidate 27 | 28 | 29 | PII_TASKS = [(PiiEnum.GOV_ID, cadastro_pessoa_fisica)] 30 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/pt/pt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/pt/pt/__init__.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/pt/pt/govid.py: -------------------------------------------------------------------------------- 1 | """ 2 | Portuguese Goverment-issued IDs: 3 | - NIF (Número de identificação fiscal) 4 | - CC (Número de Cartão de Cidadão) 5 | """ 6 | 7 | import re 8 | 9 | from typing import Iterable 10 | 11 | from stdnum.pt import nif, cc 12 | 13 | from pii_manager import PiiEnum, PiiEntity 14 | from pii_manager.helper import BasePiiTask 15 | 16 | 17 | # regex for NIF & CC 18 | _NIF_PATTERN = r"(?: PT \x20?)? (?: \d{3} \x20 \d{3} \x20 \d{3} | \d{9} )" 19 | _CC_PATTERN = r"\d{8} \x20? \d \x20? [A-Z0-9]{2}\d" 20 | 21 | 22 | class PortugueseNifCc(BasePiiTask): 23 | """ 24 | Portuguese Government-issued NIF & CC numbers, recognize & validate 25 | """ 26 | 27 | pii_name = "Portuguese NIF and CC numbers" 28 | 29 | def __init__(self, **kwargs): 30 | super().__init__(**kwargs) 31 | # Compile the regexes 32 | self.nif = re.compile(_NIF_PATTERN, flags=re.X) 33 | self.cc = re.compile(_CC_PATTERN, flags=re.X) 34 | 35 | def find(self, doc: str) -> Iterable[PiiEntity]: 36 | # NIF 37 | for item in self.nif.finditer(doc): 38 | item_value = item.group() 39 | if nif.is_valid(item_value): 40 | yield PiiEntity( 41 | PiiEnum.GOV_ID, 42 | item.start(), 43 | item_value, 44 | country=self.country, 45 | name="Portuguese NIF", 46 | ) 47 | # CC 48 | for item in self.cc.finditer(doc): 49 | item_value = item.group() 50 | if cc.is_valid(item_value): 51 | yield PiiEntity( 52 | PiiEnum.GOV_ID, 53 | item.start(), 54 | item_value, 55 | country=self.country, 56 | name="Portuguese CC", 57 | ) 58 | 59 | 60 | # Task descriptor 61 | PII_TASKS = [(PiiEnum.GOV_ID, PortugueseNifCc)] 62 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/zh/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/zh/__init__.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/zh/cn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/zh/cn/__init__.py -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/zh/cn/gov_id.py: -------------------------------------------------------------------------------- 1 | """ 2 | Detection of various government-issued IDs for China: 3 | 
- Resident Identification Card number (this can be validated) 4 | - Passport number (this cannot) 5 | """ 6 | 7 | import re 8 | from typing import Iterable 9 | 10 | from pii_manager import PiiEnum 11 | 12 | from stdnum.cn import ric 13 | 14 | 15 | # Detect candidates (separately) for RIC and passport-like numbers 16 | _GOV_ID_PATTERN = r"(? Iterable[str]: 23 | """ 24 | Chinese government-issued identifiers: 25 | - RIC (Resident Identification Card number), detect and validate 26 | - Passport number, detect only 27 | """ 28 | for g in _GOV_ID_REGEX.finditer(doc): 29 | if g.group(1) and ric.is_valid(g.group(1)): 30 | yield g.group(1) 31 | elif g.group(2): 32 | yield g.group(2) 33 | 34 | 35 | PII_TASKS = [(PiiEnum.GOV_ID, ric_or_passport)] 36 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/lang/zh/cn/misc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Detection of various Chinese PII elements 3 | """ 4 | 5 | 6 | from pii_manager import PiiEnum 7 | 8 | 9 | _PATTERNS = { 10 | "STREET_ADDRESS": r"""(\p{Han}{1,4} (自治区|省))? 11 | \p{Han}{1,4} 12 | ((?" 32 | 33 | def __eq__(self, other): 34 | return ( 35 | self.elem == other.elem 36 | and self.pos == other.pos 37 | and self.value == other.value 38 | and self.country == other.country 39 | and self.name == other.name 40 | ) 41 | 42 | def to_json(self) -> Dict: 43 | """ 44 | Return the object data as a dict that can then be serialised as JSON 45 | """ 46 | return piientity_asdict(self) 47 | 48 | 49 | def piientity_asdict(pii: PiiEntity, name: bool = None, country: bool = None) -> Dict: 50 | """ 51 | Create a dictionary from a PiiEntity object 52 | :param country: add country information: always (True), never (False), 53 | only if defined (None) 54 | """ 55 | n = {"name": pii.name} if name or country is None and pii.name else {} 56 | d = {"type": pii.elem.name, **n, "value": pii.value, "pos": pii.pos} 57 | if country or country is None and pii.country: 58 | d["country"] = pii.country 59 | return d 60 | -------------------------------------------------------------------------------- /pii-manager/src/pii_manager/piienum.py: -------------------------------------------------------------------------------- 1 | """ 2 | Enumeration that contains all defined PII elements 3 | 4 | Order is significant, in the sense that, on an processing job, tasks coming 5 | earlier in the enum will be tried first. 
Hence the more generic tasks (tasks 6 | that might collide with more specific ones) should come last 7 | """ 8 | 9 | from enum import Enum, auto 10 | 11 | 12 | class PiiEnum(str, Enum): 13 | CREDIT_CARD = auto() 14 | BITCOIN_ADDRESS = auto() 15 | IP_ADDRESS = auto() 16 | EMAIL_ADDRESS = auto() 17 | AGE = auto() 18 | BIRTH_DATE = auto() 19 | DEATH_DATE = auto() 20 | NORP = auto() 21 | DISEASE = auto() 22 | BANK_ACCOUNT = auto() 23 | GOV_ID = auto() 24 | PHONE_NUMBER = auto() 25 | LICENSE_PLATE = auto() 26 | STREET_ADDRESS = auto() 27 | -------------------------------------------------------------------------------- /pii-manager/test/data/extract-block.ndjson: -------------------------------------------------------------------------------- 1 | {"type": "CREDIT_CARD", "name": "credit card", "value": "4273 9666 4581 5642", "pos": 25} 2 | {"type": "BITCOIN_ADDRESS", "name": "bitcoin address", "value": "1AGNa15ZQXAZUgFiqJ2i7Z2DPU2J6hW62i", "pos": 86} 3 | -------------------------------------------------------------------------------- /pii-manager/test/data/extract-line.ndjson: -------------------------------------------------------------------------------- 1 | {"type": "CREDIT_CARD", "name": "credit card", "value": "4273 9666 4581 5642", "pos": 25, "line": 1} 2 | {"type": "BITCOIN_ADDRESS", "name": "bitcoin address", "value": "1AGNa15ZQXAZUgFiqJ2i7Z2DPU2J6hW62i", "pos": 10, "line": 2} 3 | -------------------------------------------------------------------------------- /pii-manager/test/data/extract-sentence.ndjson: -------------------------------------------------------------------------------- 1 | {"type": "CREDIT_CARD", "name": "credit card", "value": "4273 9666 4581 5642", "pos": 25, "sentence": 1} 2 | {"type": "BITCOIN_ADDRESS", "name": "bitcoin address", "value": "1AGNa15ZQXAZUgFiqJ2i7Z2DPU2J6hW62i", "pos": 86, "sentence": 1} 3 | -------------------------------------------------------------------------------- /pii-manager/test/data/full-block.ndjson: -------------------------------------------------------------------------------- 1 | {"text": "My credit card number is 4273 9666 4581 5642 and I have used it to buy BTCs\nstored at 1AGNa15ZQXAZUgFiqJ2i7Z2DPU2J6hW62i. This one, however, is not a\ncredit card number: 4273 9666 4581 5643\n", "entities": [{"type": "CREDIT_CARD", "name": "credit card", "value": "4273 9666 4581 5642", "pos": 25}, {"type": "BITCOIN_ADDRESS", "name": "bitcoin address", "value": "1AGNa15ZQXAZUgFiqJ2i7Z2DPU2J6hW62i", "pos": 86}]} 2 | -------------------------------------------------------------------------------- /pii-manager/test/data/full-line.ndjson: -------------------------------------------------------------------------------- 1 | {"text": "My credit card number is 4273 9666 4581 5642 and I have used it to buy BTCs\n", "entities": [{"type": "CREDIT_CARD", "name": "credit card", "value": "4273 9666 4581 5642", "pos": 25}]}{"text": "stored at 1AGNa15ZQXAZUgFiqJ2i7Z2DPU2J6hW62i. This one, however, is not a\n", "entities": [{"type": "BITCOIN_ADDRESS", "name": "bitcoin address", "value": "1AGNa15ZQXAZUgFiqJ2i7Z2DPU2J6hW62i", "pos": 10}]}{"text": "credit card number: 4273 9666 4581 5643\n", "entities": []} 2 | -------------------------------------------------------------------------------- /pii-manager/test/data/full-sentence.ndjson: -------------------------------------------------------------------------------- 1 | {"text": "My credit card number is 4273 9666 4581 5642 and I have used it to buy BTCs\nstored at 1AGNa15ZQXAZUgFiqJ2i7Z2DPU2J6hW62i. 
", "entities": [{"type": "CREDIT_CARD", "name": "credit card", "value": "4273 9666 4581 5642", "pos": 25}, {"type": "BITCOIN_ADDRESS", "name": "bitcoin address", "value": "1AGNa15ZQXAZUgFiqJ2i7Z2DPU2J6hW62i", "pos": 86}]}{"text": "This one, however, is not a\ncredit card number: 4273 9666 4581 5643\n", "entities": []} 2 | -------------------------------------------------------------------------------- /pii-manager/test/data/orig.txt: -------------------------------------------------------------------------------- 1 | My credit card number is 4273 9666 4581 5642 and I have used it to buy BTCs 2 | stored at 1AGNa15ZQXAZUgFiqJ2i7Z2DPU2J6hW62i. This one, however, is not a 3 | credit card number: 4273 9666 4581 5643 4 | -------------------------------------------------------------------------------- /pii-manager/test/data/replace.txt: -------------------------------------------------------------------------------- 1 | My credit card number is and I have used it to buy BTCs 2 | stored at . This one, however, is not a 3 | credit card number: 4273 9666 4581 5643 4 | -------------------------------------------------------------------------------- /pii-manager/test/data/tag.txt: -------------------------------------------------------------------------------- 1 | My credit card number is and I have used it to buy BTCs 2 | stored at . This one, however, is not a 3 | credit card number: 4273 9666 4581 5643 4 | -------------------------------------------------------------------------------- /pii-manager/test/data/taskfile-error.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "pii": "IP_ADDRESS", 4 | "type": "regex", 5 | "task": "\\b (?: (?: 25[0-5] | 2[0-4][0-9] | [01]?[0-9][0-9]? ) \\. ){3} (?: 25[0-5] | 2[0-4][0-9] | [01]?[0-9][0-9]?) \\b" 6 | }, 7 | { 8 | "pii": "NOT_A_VALID_PII_TASK_CLASS", 9 | "type": "call", 10 | "task": "pii_manager.lang.any.bitcoin_address.bitcoin_address" 11 | }, 12 | { 13 | "pii": "CREDIT_CARD", 14 | "type": "PiiClass", 15 | "task": "pii_manager.lang.any.credit_card.CreditCard" 16 | } 17 | ] 18 | -------------------------------------------------------------------------------- /pii-manager/test/data/taskfile.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "pii": "IP_ADDRESS", 4 | "lang": "any", 5 | "type": "regex", 6 | "task": "\\b (?: (?: 25[0-5] | 2[0-4][0-9] | [01]?[0-9][0-9]? ) \\. ){3} (?: 25[0-5] | 2[0-4][0-9] | [01]?[0-9][0-9]?) 
\\b", 7 | "doc": "ip address detection via regex" 8 | }, 9 | { 10 | "pii": "BITCOIN_ADDRESS", 11 | "lang": "any", 12 | "type": "callable", 13 | "task": "pii_manager.lang.any.bitcoin_address.bitcoin_address", 14 | "doc": "bitcoin address detection" 15 | }, 16 | { 17 | "pii": "CREDIT_CARD", 18 | "lang": "en", 19 | "type": "PiiTask", 20 | "task": "pii_manager.lang.any.credit_card.CreditCard", 21 | "doc": "credit card number detection" 22 | } 23 | ] 24 | -------------------------------------------------------------------------------- /pii-manager/test/unit/api/test_manager.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | 3 | from pii_manager import PiiEnum 4 | from pii_manager.api import PiiManager 5 | 6 | 7 | TEST = ( 8 | "El número de la tarjeta de crédito es 4273 9666 4581 5642", 9 | "El número de la tarjeta de crédito es ", 10 | ) 11 | 12 | 13 | def test10_constructor(): 14 | obj = PiiManager("es", None, PiiEnum.CREDIT_CARD) 15 | assert obj.tasks[0].pii == PiiEnum.CREDIT_CARD 16 | assert str(obj) == "" 17 | 18 | 19 | def test20_info(): 20 | obj = PiiManager("es", None, PiiEnum.CREDIT_CARD) 21 | info = obj.task_info() 22 | 23 | exp = { 24 | (PiiEnum.CREDIT_CARD, None,): [ 25 | ( 26 | "credit card", 27 | "Credit card numbers for most international credit cards (detect & validate)", 28 | ) 29 | ] 30 | } 31 | assert info == exp 32 | 33 | 34 | def test20_call(): 35 | obj = PiiManager("es", None, PiiEnum.CREDIT_CARD) 36 | anon = obj(TEST[0]) 37 | assert anon == TEST[1] 38 | -------------------------------------------------------------------------------- /pii-manager/test/unit/api/test_manager_ctx.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test base objects with context 3 | """ 4 | 5 | from pii_manager import PiiEnum, PiiEntity 6 | from pii_manager.api import PiiManager 7 | 8 | 9 | def _pii(pos): 10 | return PiiEntity(PiiEnum.GOV_ID, pos, "3451-K", country="vo", name="vogonian ID") 11 | 12 | 13 | TEST = [ 14 | ("my Vogon ID is 3451-K", [_pii(15)]), 15 | ("the number 3451-K is my Vogonian ID", [_pii(11)]), 16 | ("the Vogon ID are 3451-K", []), # context outside window 17 | ("my Betelgeuse ID is 3451-K", []), # context does not match 18 | ] 19 | 20 | 21 | # ------------------------------------------------------------------------ 22 | 23 | DUMMY_REGEX = { 24 | "pii": PiiEnum.GOV_ID, 25 | "type": "regex", 26 | "task": r"""\b\d{4}-\w\b""", 27 | "lang": "en", 28 | "name": "vogonian ID", 29 | "country": "vo", 30 | "doc": "a toy example to match a government id", 31 | "context": {"value": ["Vogon ID", "vogonian id"], "width": [12, 20]}, 32 | } 33 | 34 | 35 | def test10_context_regex(): 36 | """ 37 | Check a PII task with contexts, regex variant 38 | """ 39 | obj = PiiManager("en", mode="extract") 40 | obj.add_tasks([DUMMY_REGEX]) 41 | for (text, exp) in TEST: 42 | got = obj(text) 43 | assert list(got) == exp 44 | 45 | 46 | # ------------------------------------------------------------------------ 47 | 48 | 49 | DUMMY_CLASS = { 50 | "pii": PiiEnum.GOV_ID, 51 | "type": "PiiTask", 52 | "task": "unit.api.test_manager_add.DummyPii", 53 | "lang": "en", 54 | "country": "vo", 55 | "name": "vogonian ID", 56 | "doc": "a toy example to match a government id", 57 | "context": {"value": ["Vogon ID", "vogonian id"], "width": [12, 20]}, 58 | } 59 | 60 | 61 | def test20_context_class(): 62 | """ 63 | Check a PII task with contexts, class variant 64 | """ 65 | obj = PiiManager("en", mode="extract") 66 
| obj.add_tasks([DUMMY_CLASS]) 67 | for (text, exp) in TEST: 68 | got = obj(text) 69 | assert list(got) == exp 70 | -------------------------------------------------------------------------------- /pii-manager/test/unit/helper/test_base.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from pii_manager import PiiEnum, PiiEntity 4 | from pii_manager.helper.base import BasePiiTask 5 | from pii_manager.helper.exception import PiiUnimplemented, InvArgException 6 | 7 | import pii_manager.helper.base as mod 8 | 9 | 10 | def test10_base(): 11 | """ 12 | Create base object 13 | """ 14 | task_spec = {"pii": PiiEnum.BITCOIN_ADDRESS, "lang": "es", "name": "example"} 15 | task = mod.BasePiiTask(**task_spec) 16 | assert task.pii == PiiEnum.BITCOIN_ADDRESS 17 | assert task.lang == "es" 18 | assert task.name == "example" 19 | 20 | with pytest.raises(PiiUnimplemented): 21 | task("blah") 22 | 23 | 24 | def test20_regex(): 25 | """ 26 | Test regex object 27 | """ 28 | task_spec = {"pii": PiiEnum.CREDIT_CARD, "lang": "es", "name": "example"} 29 | task = mod.RegexPiiTask(r"\d{4}", **task_spec) 30 | 31 | got = list(task("number 1234 and number 3451")) 32 | exp = [ 33 | PiiEntity(PiiEnum.CREDIT_CARD, 7, "1234", name="example"), 34 | PiiEntity(PiiEnum.CREDIT_CARD, 23, "3451", name="example"), 35 | ] 36 | assert exp == got 37 | 38 | 39 | def test30_callable(): 40 | """ 41 | Test callable object 42 | """ 43 | 44 | def example(i: str): 45 | return ["1234", "3451"] 46 | 47 | task_spec = {"pii": PiiEnum.CREDIT_CARD, "lang": "es", "name": "example"} 48 | task = mod.CallablePiiTask(example, **task_spec) 49 | 50 | got = list(task("number 1234 and number 3451")) 51 | exp = [ 52 | PiiEntity(PiiEnum.CREDIT_CARD, 7, "1234", name="example"), 53 | PiiEntity(PiiEnum.CREDIT_CARD, 23, "3451", name="example"), 54 | ] 55 | assert exp == got 56 | -------------------------------------------------------------------------------- /pii-manager/test/unit/helper/test_norm.py: -------------------------------------------------------------------------------- 1 | import pii_manager.helper.normalizer as mod 2 | 3 | 4 | TEST = [("the Social Security\nNumber is 34512", "the social security number is 34512")] 5 | 6 | 7 | def test10_normalizer(): 8 | """ 9 | Create base object 10 | """ 11 | for (text, exp) in TEST: 12 | assert mod.normalize(text, "en", whitespace=True, lowercase=True) == exp 13 | -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/any/test_bitcoin_address.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test bitcoin addresses 3 | """ 4 | 5 | 6 | from pii_manager import PiiEnum 7 | from pii_manager.api import PiiManager 8 | 9 | 10 | TEST = [ 11 | # A valid bitcoin address 12 | ( 13 | "BTC address: 1JayVxfVgdaFKirkZTZVK4CdRnFDdFNENN", 14 | "BTC address: ", 15 | ), 16 | ( 17 | "BTC address: bc1qwxxvjxlakxe9rmxcphh4yy8a2t6z00k4gc4mpj", 18 | "BTC address: ", 19 | ), 20 | # An invalid bitcoin address 21 | ( 22 | "BTC address: 1AGNa15ZQXAZUgFiqJ2i7Z2DPU2J6hW623", 23 | "BTC address: 1AGNa15ZQXAZUgFiqJ2i7Z2DPU2J6hW623", 24 | ), 25 | ] 26 | 27 | 28 | def test10_credit_card(): 29 | obj = PiiManager("en", None, PiiEnum.BITCOIN_ADDRESS) 30 | for doc, exp in TEST: 31 | got = obj(doc) 32 | assert got == exp 33 | -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/any/test_credit_card.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Test credit card numbers 3 | """ 4 | 5 | from pii_manager import PiiEnum 6 | from pii_manager.api import PiiManager 7 | 8 | 9 | TEST = [ 10 | # A valid credit card number 11 | ( 12 | "El número de la tarjeta de crédito es 4273 9666 4581 5642", 13 | "El número de la tarjeta de crédito es ", 14 | ), 15 | # Without spaces 16 | ("La tarjeta es 4273966645815642", "La tarjeta es "), 17 | # With text afterwards 18 | ( 19 | "El número de la tarjeta es 4273 9666 4581 5642 probablemente", 20 | "El número de la tarjeta es probablemente", 21 | ), 22 | # With dashes 23 | ( 24 | "mi tarjeta es 4273-9666-4581-5642 con caducidad 07/22", 25 | "mi tarjeta es con caducidad 07/22", 26 | ), 27 | # Too short 28 | ( 29 | "El número de la tarjeta de crédito es 4273 9666 4581", 30 | "El número de la tarjeta de crédito es 4273 9666 4581", 31 | ), 32 | # Not a valid credit card number 33 | ( 34 | "El número de la tarjeta de crédito es 4273 9666 4581 5641", 35 | "El número de la tarjeta de crédito es 4273 9666 4581 5641", 36 | ), 37 | ] 38 | 39 | 40 | def test10_credit_card(): 41 | obj = PiiManager("es", None, PiiEnum.CREDIT_CARD) 42 | for doc, exp in TEST: 43 | got = obj(doc) 44 | assert exp == got 45 | 46 | 47 | def test20_credit_card_stats(): 48 | obj = PiiManager("es", None, PiiEnum.CREDIT_CARD) 49 | for doc, exp in TEST: 50 | obj(doc) 51 | assert obj.stats == {"calls": 6, "CREDIT_CARD": 4} 52 | -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/any/test_email.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test email addersses 3 | """ 4 | 5 | from pii_manager import PiiEnum 6 | from pii_manager.api import PiiManager 7 | 8 | 9 | TEST = [ 10 | # A valid email address 11 | ( 12 | "My email is anyone@whatever.com.", 13 | "My email is .", 14 | ), 15 | # An invalid email address 16 | ( 17 | "My email is anyone@whatever.", 18 | "My email is anyone@whatever.", 19 | ), 20 | ] 21 | 22 | 23 | def test10_credit_card(): 24 | obj = PiiManager("es", None, PiiEnum.EMAIL_ADDRESS) 25 | for doc, exp in TEST: 26 | got = obj(doc) 27 | assert exp == got 28 | -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/any/test_ip_address.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test IP addresses 3 | """ 4 | 5 | from pii_manager import PiiEnum 6 | from pii_manager.api import PiiManager 7 | 8 | 9 | TEST = [ 10 | # A valid IP address 11 | ( 12 | "My IP address is 10.45.122.65", 13 | "My IP address is ", 14 | ), 15 | # An invalid IP address 16 | ("My IP address is 310.45.122.65", "My IP address is 310.45.122.65"), 17 | # An IP address without context 18 | ("My address is 10.45.122.65", "My address is 10.45.122.65"), 19 | ] 20 | 21 | 22 | def test10_ip_address(): 23 | obj = PiiManager("en", None, PiiEnum.IP_ADDRESS) 24 | for doc, exp in TEST: 25 | got = obj(doc) 26 | assert exp == got 27 | -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/en/any/test_ipn_en.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test international phone numbers 3 | """ 4 | 5 | from pii_manager import PiiEnum 6 | from pii_manager.api import PiiManager 7 | from pii_manager.lang import LANG_ANY 8 | 9 | TEST = [ 10 | # Standard phone number 11 | ("phone number: 
+34 983 453 999", "phone number: "), 12 | ("phone number: +34983453999", "phone number: "), 13 | ("ph. +34983453999", "ph. "), 14 | # An invalid country code 15 | ("phone number: +99 983 453 999", "phone number: +99 983 453 999"), 16 | # No valid contexts 17 | ("number: +34983453999", "number: +34983453999"), 18 | ("phonograph +34983453999", "phonograph +34983453999"), 19 | ] 20 | 21 | 22 | def test10_ssn(): 23 | obj = PiiManager("en", LANG_ANY, PiiEnum.PHONE_NUMBER) 24 | for doc, exp in TEST: 25 | got = obj(doc) 26 | assert got == exp 27 | -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/en/au/test_abn.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test Australian Business Number 3 | """ 4 | 5 | from pii_manager import PiiEnum 6 | from pii_manager.api import PiiManager 7 | 8 | TEST = [ 9 | # A valid ABN 10 | ("business number: 83 914 571 673.", "business number: ."), 11 | # ABN without spaces 12 | ("business number: 83914571673.", "business number: ."), 13 | # An invalid ABN 14 | ("not an ABN: 83 914 571 679", "not an ABN: 83 914 571 679"), 15 | ] 16 | 17 | 18 | def test10_abn(): 19 | obj = PiiManager("en", "AU", PiiEnum.GOV_ID) 20 | for doc, exp in TEST: 21 | got = obj(doc) 22 | assert got == exp 23 | -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/en/au/test_tfn.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test Australian Tax File Number 3 | """ 4 | 5 | from pii_manager import PiiEnum 6 | from pii_manager.api import PiiManager 7 | 8 | TEST = [ 9 | # A valid ABN 10 | ("tax file number: 963 553 151.", "tax file number: ."), 11 | ("the tfn is: 123 456 782", "the tfn is: "), 12 | # TFN without spaces 13 | ("tax file number: 963553151.", "tax file number: ."), 14 | # An invalid TFN 15 | ("not a TFN: 123 456 781", "not a TFN: 123 456 781"), 16 | ] 17 | 18 | 19 | def test10_abn(): 20 | obj = PiiManager("en", "AU", PiiEnum.GOV_ID) 21 | for doc, exp in TEST: 22 | got = obj(doc) 23 | assert got == exp 24 | -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/en/ca/test_sin.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test Canadian Social Insurance Number 3 | """ 4 | 5 | from pii_manager import PiiEnum 6 | from pii_manager.api import PiiManager 7 | 8 | TEST = [ 9 | # A valid SIN 10 | ("SIN: 963-553-151", "SIN: "), 11 | # SIN with spaces 12 | ("SIN: 339 892 317 number", "SIN: number"), 13 | # An invalid SIN 14 | ("not a SIN: 123-456-781", "not a SIN: 123-456-781"), 15 | ] 16 | 17 | 18 | def test10_ssn(): 19 | obj = PiiManager("en", "CA", PiiEnum.GOV_ID) 20 | for doc, exp in TEST: 21 | got = obj(doc) 22 | assert got == exp 23 | -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/en/in_/test_aadhaar.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test Indian Aadhaar Number 3 | """ 4 | 5 | from pii_manager import PiiEnum 6 | from pii_manager.api import PiiManager 7 | 8 | TEST = [ 9 | # A valid aadhaar 10 | ("aadhaar number 234123412346", "aadhaar number "), 11 | # aadhaar with spaces 12 | ("aadhaar number 2341 2341 2346", "aadhaar number "), 13 | # An invalid aadhaar 14 | ( 15 | "not a real aadhaar number: 2341 2341 2347", 16 | "not a real aadhaar number: 2341 2341 2347", 
17 | ), 18 | ] 19 | 20 | 21 | def test10_ssn(): 22 | obj = PiiManager("en", "IN", PiiEnum.GOV_ID) 23 | for doc, exp in TEST: 24 | got = obj(doc) 25 | assert got == exp 26 | -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/en/us/test_ssn.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test US Social Security Number 3 | """ 4 | 5 | from pii_manager import PiiEnum 6 | from pii_manager.api import PiiManager 7 | 8 | TEST = [ 9 | # A valid SSN 10 | ("SSN: 536-90-4399", "SSN: "), 11 | # SSN with spaces 12 | ("SSN: 536 90 4399", "SSN: "), 13 | # An invalid SSN 14 | ("not a SSN: 666-90-4399", "not a SSN: 666-90-4399"), 15 | ] 16 | 17 | 18 | def test10_ssn(): 19 | obj = PiiManager("en", "US", PiiEnum.GOV_ID) 20 | for doc, exp in TEST: 21 | got = obj(doc) 22 | assert got == exp 23 | -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/es/any/test_ipn_es.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test international phone numbers 3 | """ 4 | 5 | from pii_manager import PiiEnum 6 | from pii_manager.api import PiiManager 7 | from pii_manager.lang import LANG_ANY 8 | 9 | TEST = [ 10 | # Standard phone number 11 | ("teléfono: +34 983 453 999", "teléfono: "), 12 | ("tf. +34983453999", "tf. "), 13 | ("numero de telefono +34983453999", "numero de telefono "), 14 | # An invalid country code 15 | ("teléfono: +99 983 453 999", "teléfono: +99 983 453 999"), 16 | # No valid contexts 17 | ("número: +34983453999", "número: +34983453999"), 18 | ("tff +34983453999", "tff +34983453999"), 19 | ] 20 | 21 | 22 | def test10_ssn(): 23 | obj = PiiManager("es", LANG_ANY, PiiEnum.PHONE_NUMBER) 24 | for doc, exp in TEST: 25 | got = obj(doc) 26 | assert got == exp 27 | -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/es/es/test_bank_account.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test Spanish Bank Accounts 3 | """ 4 | 5 | from pii_manager import PiiEnum 6 | from pii_manager.api import PiiManager 7 | 8 | TEST = [ 9 | # A valid bank account number 10 | ( 11 | "Código cuenta cliente: 2085 8720 60 1902070563", 12 | "Código cuenta cliente: ", 13 | ), 14 | # No spaces 15 | ( 16 | "Código cuenta cliente: 20858720601902070563", 17 | "Código cuenta cliente: ", 18 | ), 19 | # An invalid bank account number 20 | ( 21 | "Código cuenta cliente: 2085 8720 44 1902070563", 22 | "Código cuenta cliente: 2085 8720 44 1902070563", 23 | ), 24 | ] 25 | 26 | 27 | def test10_bank_account(): 28 | obj = PiiManager("es", "ES", PiiEnum.BANK_ACCOUNT) 29 | for doc, exp in TEST: 30 | got = obj(doc) 31 | assert got == exp 32 | 33 | 34 | def test20_bank_account_undefined(): 35 | """ 36 | Test under another country (hence it will NOT be defined) 37 | """ 38 | obj = PiiManager("es", "FR", PiiEnum.BANK_ACCOUNT) 39 | for doc, exp in TEST: 40 | got = obj(doc) 41 | assert got == doc 42 | -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/es/es/test_govid_es_es.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test Spanish DNI & NIE 3 | """ 4 | 5 | from pii_manager import PiiEnum, PiiEntity 6 | from pii_manager.api import PiiManager 7 | 8 | TEST = [ 9 | # A valid DNI 10 | ( 11 | "Mi DNI es 34657934-Q", 12 | "Mi DNI es ", 13 | 
[PiiEntity(PiiEnum.GOV_ID, 10, "34657934-Q", "es", "Spanish DNI")], 14 | ), 15 | # A DNI without dash 16 | ( 17 | "El DNI 34657934Q es válido", 18 | "El DNI es válido", 19 | [PiiEntity(PiiEnum.GOV_ID, 7, "34657934Q", "es", "Spanish DNI")], 20 | ), 21 | # A valid NIE 22 | ( 23 | "El NIE es X3465793-S", 24 | "El NIE es ", 25 | [PiiEntity(PiiEnum.GOV_ID, 10, "X3465793-S", "es", "Spanish NIE")], 26 | ), 27 | # An invalid DNI 28 | ("Mi DNI es 34657934-H", "Mi DNI es 34657934-H", []), 29 | ] 30 | 31 | 32 | def test10_dni(): 33 | obj = PiiManager("es", "ES", PiiEnum.GOV_ID) 34 | for doc, exp, _ in TEST: 35 | got = obj(doc) 36 | assert got == exp 37 | 38 | 39 | def test20_dni_extract(): 40 | obj = PiiManager("es", "ES", PiiEnum.GOV_ID, mode="extract") 41 | for doc, _, exp in TEST: 42 | got = list(obj(doc)) 43 | assert got == exp 44 | -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/es/mx/test_govid_es_mx.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test Mexican CURP 3 | """ 4 | 5 | from pii_manager import PiiEnum 6 | from pii_manager.api import PiiManager 7 | 8 | TEST = [ 9 | # A valid CURP 10 | ("Mi número de CURP es PEPP700101HASRRD09", "Mi número de CURP es "), 11 | # An invalid CURP 12 | ( 13 | "Mi número de CURP es PEPP700101HASRRD01", 14 | "Mi número de CURP es PEPP700101HASRRD01", 15 | ), 16 | ] 17 | 18 | 19 | def test10_curp(): 20 | obj = PiiManager("es", "MX", PiiEnum.GOV_ID) 21 | for doc, exp in TEST: 22 | got = obj(doc) 23 | assert got == exp 24 | -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/pt/br/test_govid_pt_br.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test Brazilian CPF 3 | """ 4 | 5 | from pii_manager import PiiEnum 6 | from pii_manager.api import PiiManager 7 | 8 | TEST = [ 9 | # A valid CPF 10 | ("O número do CPF é 263.946.533-30", "O número do CPF é "), 11 | # An invalid CPF 12 | ("O número do CPF é 000.000.000-12", "O número do CPF é 000.000.000-12"), 13 | ] 14 | 15 | 16 | def test10_cpf(): 17 | obj = PiiManager("pt", "BR", PiiEnum.GOV_ID) 18 | for doc, exp in TEST: 19 | got = obj(doc) 20 | assert got == exp 21 | -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/pt/pt/test_govid_pt_pt.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test Portuguese NIF & CC 3 | """ 4 | 5 | from pii_manager import PiiEnum, PiiEntity 6 | from pii_manager.api import PiiManager 7 | 8 | TEST = [ 9 | # A valid NIF 10 | ( 11 | "Meu NIF é PT 123 456 789", 12 | "Meu NIF é ", 13 | [PiiEntity(PiiEnum.GOV_ID, 10, "PT 123 456 789", "pt", "Portuguese NIF")], 14 | ), 15 | # A NIF without spacing or prefix 16 | ( 17 | "O NIF 123456789 é valido", 18 | "O NIF é valido", 19 | [PiiEntity(PiiEnum.GOV_ID, 6, "123456789", "pt", "Portuguese NIF")], 20 | ), 21 | # A valid CC 22 | ( 23 | "O CC é 00000000 0 ZZ4", 24 | "O CC é ", 25 | [PiiEntity(PiiEnum.GOV_ID, 7, "00000000 0 ZZ4", "pt", "Portuguese CC")], 26 | ), 27 | # An invalid NIF 28 | ("Meu NIF é PT 123 456 788", "Meu NIF é PT 123 456 788", []), 29 | ] 30 | 31 | 32 | def test10_nif_cc(): 33 | obj = PiiManager("pt", "PT", PiiEnum.GOV_ID) 34 | for doc, exp, _ in TEST: 35 | got = obj(doc) 36 | assert got == exp 37 | 38 | 39 | def test20_nif_cc_extract(): 40 | obj = PiiManager("pt", "PT", PiiEnum.GOV_ID, mode="extract") 41 | for doc, _, 
exp in TEST: 42 | got = list(obj(doc)) 43 | assert got == exp 44 | -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/zh/cn/test_govid_zh_cn.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test Chinese government ids (Resident Identity Card & Passport) 3 | """ 4 | 5 | from pii_manager import PiiEnum 6 | from pii_manager.api import PiiManager 7 | 8 | TEST = [ 9 | # A valid RIC 10 | ("公民身份号码 360426199101010071", "公民身份号码 "), 11 | # An invalid RIC 12 | ("公民身份号码 360426199101010072", "公民身份号码 360426199101010072"), 13 | # An invalid RIC (one aditional digit) 14 | ("公民身份号码 3604261991010100717", "公民身份号码 3604261991010100717"), 15 | # A correct passport number 16 | ("中华人民共和国护照 D12345678", "中华人民共和国护照 "), 17 | # An incorrect passport number (invalid letter) 18 | ("中华人民共和国护照 K12345678", "中华人民共和国护照 K12345678"), 19 | # An incorrect passport number (only 7 digits) 20 | ("中华人民共和国护照 D1234567", "中华人民共和国护照 D1234567"), 21 | ] 22 | 23 | 24 | def test10_ssn(): 25 | obj = PiiManager("zh", "CN", PiiEnum.GOV_ID) 26 | for doc, exp in TEST: 27 | got = obj(doc) 28 | assert got == exp 29 | -------------------------------------------------------------------------------- /pii-manager/test/unit/lang/zh/cn/test_misc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test PII elements for Chinese (Phone numbers, street addresses & diseases) 3 | """ 4 | 5 | from pii_manager import PiiEnum 6 | from pii_manager.api import PiiManager 7 | 8 | TEST = [ 9 | # Phone number 10 | ("045-4123456", ""), 11 | # Not a phone number (too many digits in the first part) 12 | ("70045-4123456", "70045-4123456"), 13 | # ----- We are missing here tests for STREET_ADDRESS & DISEASE 14 | ] 15 | 16 | 17 | def test10_ssn(): 18 | obj = PiiManager( 19 | "zh", "CN", [PiiEnum.STREET_ADDRESS, PiiEnum.PHONE_NUMBER, PiiEnum.DISEASE] 20 | ) 21 | for doc, exp in TEST: 22 | got = obj(doc) 23 | assert got == exp 24 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool] 2 | [tool.poetry] 3 | name = "data-tooling" 4 | version = "0.1.0" 5 | description = "Tools for managing datasets for governance and training." 
6 | authors = ["BigScience "] 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.7.10" 10 | 11 | datasets = "^1.12.1" 12 | transformers = "^4.12.3" 13 | nltk = "^3.6.5" 14 | scikit-learn = "^1.0.1" 15 | fsspec = "^2021.11.0" 16 | kenlm = {url = "https://github.com/kpu/kenlm/archive/master.zip", optional = true} 17 | typer = "^0.4.0" 18 | regex = "^2021.11.10" 19 | simhash-py = "^0.4.0" 20 | PyYAML = "^6.0" 21 | tqdm = "^4.62.3" 22 | 23 | [tool.poetry.dev-dependencies] 24 | pdbpp = "^0.10.2" 25 | isort = "^5.6.4" 26 | flake8 = "^3.8.4" 27 | black = "^21.7b0" 28 | pytest = "^6.2.4" 29 | jupyterlab = "^3.0.16" 30 | 31 | [tool.poetry.extras] 32 | kenlm = ["kenlm"] 33 | 34 | [tool.isort] 35 | profile = 'black' 36 | treat_comments_as_code = "# %%" 37 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | dataset>=1.5.0 2 | datasets>=1.8.0 3 | fasttext>=0.9.2 4 | fsspec 5 | ftfy 6 | indexed_gzip>=1.6.1 7 | indexed_gzip>=1.6.1 8 | langid>=1.1.6 9 | nltk 10 | scikit-learn 11 | sentencepiece 12 | sqlalchemy>=1.4.20 13 | transformers 14 | wordfreq 15 | -------------------------------------------------------------------------------- /tokenizer/python_script/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets>=1.18.0 2 | pyarrow>=6.0.0 3 | -------------------------------------------------------------------------------- /tokenizer/scripts/01_remove_deplicated_lines.sh: -------------------------------------------------------------------------------- 1 | conda activate dedup-dataset 2 | 3 | DATA_TOOLING_REPO=/home/lucile/code/data_tooling 4 | 5 | DATASET_PATH=/home/lucile/data/tokenization_dataset/alpha-subset-12M 6 | SAVE_DATASET_DIR=/home/lucile/data/tokenization_dataset/alpha-subset-12M-dedup-lines 7 | 8 | pushd $DATA_TOOLING_REPO 9 | 10 | export HF_DATASETS_OFFLINE=1 11 | export HF_DATASETS_CACHE=/home/lucile/to_delete 12 | 13 | python tokenizer/python_script/dedup_lines.py \ 14 | --save-dir $SAVE_DATASET_DIR \ 15 | --dataset_dir $DATASET_PATH \ 16 | --batch-size 100 \ 17 | --num-proc 3 \ 18 | --min-chars 0 \ 19 | --n-records 1000000 \ 20 | --min-repetition-threshold 0 21 | -------------------------------------------------------------------------------- /tokenizer/scripts/02_remove_duplicated_lines_dataset_with_dataset_source.sh: -------------------------------------------------------------------------------- 1 | conda activate dedup-dataset 2 | 3 | DATA_TOOLING_REPO=/home/lucile/code/data_tooling 4 | 5 | DATASET_PATH=/home/lucile/data/tokenization_dataset/tokenization_dataset_v3_small_arrow 6 | SAVE_DATASET_DIR=/home/lucile/data/tokenization_dataset/tokenization_dataset_v3_small_arrow-dedup 7 | 8 | pushd $DATA_TOOLING_REPO 9 | 10 | export HF_DATASETS_OFFLINE=1 11 | export HF_DATASETS_CACHE=/home/lucile/to_delete 12 | 13 | python tokenizer/python_script/dedup_lines.py \ 14 | --save-dir $SAVE_DATASET_DIR \ 15 | --dataset_dir $DATASET_PATH \ 16 | --batch-size 100 \ 17 | --num-proc 3 \ 18 | --min-chars 0 \ 19 | --n-records 1000 \ 20 | --min-repetition-threshold 0 \ 21 | --preserve_code \ 22 | --with-meta-col 23 | -------------------------------------------------------------------------------- /tokenizer/scripts/03_remove_duplicated_lines_alpha.sh: -------------------------------------------------------------------------------- 1 | conda activate dedup-dataset 2 | 3 | DATA_TOOLING_REPO=/home/lucile/code/data_tooling 
4 | 5 | DATASET_PATH=/home/lucile/data/tokenization_dataset/alpha_arrow 6 | SAVE_DATASET_DIR=/home/lucile/data/tokenization_dataset/alpha_arrow-dedup 7 | 8 | pushd $DATA_TOOLING_REPO 9 | 10 | export HF_DATASETS_OFFLINE=1 11 | export HF_DATASETS_CACHE=/home/lucile/to_delete 12 | 13 | python tokenizer/python_script/dedup_lines.py \ 14 | --save-dir $SAVE_DATASET_DIR \ 15 | --dataset_dir $DATASET_PATH \ 16 | --batch-size 1000 \ 17 | --num-proc 1 \ 18 | --min-chars 0 \ 19 | --n-records 57290988 \ 20 | --min-repetition-threshold 0 \ 21 | --preserve_code \ 22 | --with-meta-col 23 | -------------------------------------------------------------------------------- /tokenizer/scripts/04_remove_duplicated_lines_alpha _memory.sh: -------------------------------------------------------------------------------- 1 | conda activate dedup-dataset 2 | 3 | DATA_TOOLING_REPO=/home/lucile/code/data_tooling 4 | 5 | DATASET_PATH=/home/lucile/data/tokenization_dataset/alpha_arrow 6 | SAVE_DATASET_DIR=/home/lucile/data/tokenization_dataset/alpha_arrow-dedup 7 | 8 | pushd $DATA_TOOLING_REPO 9 | 10 | export HF_DATASETS_OFFLINE=1 11 | export HF_DATASETS_CACHE=/home/lucile/to_delete 12 | 13 | python tokenizer/python_script/ram_dedup_lines.py \ 14 | --save-dir $SAVE_DATASET_DIR \ 15 | --dataset_dir $DATASET_PATH \ 16 | --num-proc 1 \ 17 | --batch-size 6000000 \ 18 | --load-from-disk 19 | -------------------------------------------------------------------------------- /tokenizer/scripts/05_remove_duplicated_lines_alpha __v2_memory.sh: -------------------------------------------------------------------------------- 1 | conda activate dedup-dataset 2 | 3 | DATA_TOOLING_REPO=/home/lucile/code/data_tooling 4 | 5 | DATASET_PATH=/home/lucile/data/tokenization_dataset/alpha_v2 6 | SAVE_DATASET_DIR=/home/lucile/data/tokenization_dataset/alpha_v2_dedup 7 | 8 | pushd $DATA_TOOLING_REPO 9 | 10 | export HF_DATASETS_OFFLINE=1 11 | export HF_DATASETS_CACHE=/home/lucile/to_delete 12 | 13 | python tokenizer/python_script/ram_dedup_lines.py \ 14 | --save-dir $SAVE_DATASET_DIR \ 15 | --dataset_dir $DATASET_PATH \ 16 | --num-proc 1 \ 17 | --batch-size 6000000 18 | -------------------------------------------------------------------------------- /tokenizer/scripts/06_dedup_exact_examples.sh: -------------------------------------------------------------------------------- 1 | conda activate dedup-dataset 2 | 3 | DATA_TOOLING_REPO=/home/lucile/code/data_tooling 4 | 5 | DATASET_PATH=/home/lucile/data/tokenization_dataset/alpha_v2_dedup 6 | SAVE_DATASET_DIR=/home/lucile/data/tokenization_dataset/alpha_v2_dedup_lines_and_article 7 | 8 | pushd $DATA_TOOLING_REPO 9 | 10 | export HF_DATASETS_OFFLINE=1 11 | export HF_DATASETS_CACHE=/home/lucile/to_delete 12 | 13 | python tokenizer/python_script/dedup_exact_article.py \ 14 | --save-dir $SAVE_DATASET_DIR \ 15 | --dataset_dir $DATASET_PATH \ 16 | --num-proc 8 \ 17 | --batch-size 6000000 18 | --------------------------------------------------------------------------------