├── .github
└── workflows
│ ├── add-issue-to-project.yml
│ ├── label-with-contact-neede.yml
│ ├── label-with-help-wanted.yml
│ ├── pii-manager.yml
│ └── self-assign.yaml
├── .gitignore
├── .gitmodules
├── .pre-commit-config.yaml
├── LICENSE
├── Makefile
├── README.md
├── __init__.py
├── ac_dc
├── README.md
├── anonymization.py
├── deduplicate
│ ├── README.md
│ ├── conf
│ │ ├── self_deduplicate_ar.yaml
│ │ ├── self_deduplicate_bn.yaml
│ │ ├── self_deduplicate_ca.yaml
│ │ ├── self_deduplicate_en.yaml
│ │ ├── self_deduplicate_es.yaml
│ │ ├── self_deduplicate_eu.yaml
│ │ ├── self_deduplicate_fr.yaml
│ │ ├── self_deduplicate_gl.yaml
│ │ ├── self_deduplicate_hi.yaml
│ │ ├── self_deduplicate_id.yaml
│ │ ├── self_deduplicate_pt.yaml
│ │ ├── self_deduplicate_ur.yaml
│ │ ├── self_deduplicate_vi.yaml
│ │ └── self_deduplicate_zh.yaml
│ ├── deduplicate
│ │ ├── __init__.py
│ │ └── util.py
│ ├── self_deduplicate.py
│ └── visualize.ipynb
├── download_sentencepiece_kenlm_models.py
├── explanation_filtering_pipeline.pdf
├── filtering.py
├── flagged_words.py
├── languages_id.py
├── main_filtering.py
├── normalization.py
├── parameters_filtering.py
├── person_and_id_anonymization.py
├── stopwords.py
├── test_anonymization.py
└── visualization
│ ├── README.md
│ ├── get_data_for_visualization.py
│ └── visualization.py
├── bertin
├── README.md
├── config.json
├── config.py
├── configs
│ ├── base
│ │ ├── config.json
│ │ └── tokenizer.json
│ └── large
│ │ ├── config.json
│ │ └── tokenizer.json
├── convert.py
├── evaluation
│ ├── paws.yaml
│ ├── run_glue.py
│ ├── run_ner.ipynb
│ ├── run_ner.py
│ ├── token.yaml
│ └── xnli.yaml
├── events.out.tfevents.1625704081.t1v-n-a4d97d44-w-0.212075.3.v2
├── events.out.tfevents.1625704245.t1v-n-a4d97d44-w-0.216676.3.v2
├── events.out.tfevents.1625705283.t1v-n-a4d97d44-w-0.234462.3.v2
├── get_embeddings_and_perplexity.py
├── images
│ ├── bertin-tilt.png
│ ├── bertin.png
│ ├── ccnet.png
│ ├── datasets-perp-20-120.png
│ ├── datasets-perp.png
│ ├── datasets-random-comparison.png
│ ├── datasets-wsize.png
│ ├── perp-p95.png
│ ├── perp-resample-gaussian.png
│ ├── perp-resample-stepwise.png
│ ├── perplexity_colored_embeddings.html
│ └── random_512.jpg
├── mc4
│ ├── README.md
│ ├── dummy
│ │ └── af
│ │ │ └── 0.0.0
│ │ │ └── dummy_data.zip
│ └── mc4.py
├── merges.txt
├── perplexity.py
├── run.sh
├── run_mlm_flax.py
├── run_mlm_flax_stream.py
├── run_stream.sh
├── special_tokens_map.json
├── tokenizer.json
├── tokenizer_config.json
├── tokens.py
├── tokens.py.orig
├── tsne_plot.py
├── utils
│ ├── dataset_perplexity.py
│ ├── download_mc4es_sampled.py
│ └── generate_datasets.py
└── vocab.json
├── cc_pseudo_crawl
├── get_stats.py
├── language_annotation
│ ├── python_scripts
│ │ ├── annotate_langid_crawl.py
│ │ ├── check_wrong_files.py
│ │ ├── compute_stats_langid.py
│ │ └── detect_html_lang_attrib.py
│ └── slurm_scripts
│ │ ├── 02_detect_html_lang_attrib.slurm
│ │ └── job_annotate_langid_crawl.sh
├── processing_notebooks
│ ├── NigerCongoDS.ipynb
│ └── pseudocrawl_nigercongo.ipynb
├── python_scripts
│ ├── cc_lookup_next.py
│ ├── cc_lookup_seed.py
│ ├── check_erros_in_dataset.py
│ ├── deeper.py
│ ├── divide_in_shards.py
│ ├── download_warc.py
│ ├── exact_deduplicates.py
│ ├── extract_text
│ │ ├── extract_text_and_html_metadata.py
│ │ └── requirements.txt
│ ├── finalise.py
│ ├── load_all_seed_ids.py
│ ├── merge_seed_shards.py
│ ├── preprocess_dataset.py
│ ├── process_for_concatenation.py
│ ├── pseudo_crawl_seed_to_lm_dset.py
│ ├── pseudo_crawl_seed_to_lm_dset_v2.py
│ ├── redownload_warc.py
│ ├── requirements.txt
│ ├── shard_and_compress.py
│ └── shard_by_seed_id.py
├── seeds_batch_1
│ ├── .gitignore
│ ├── DEPTH.md
│ ├── README.md
│ ├── slurm_scripts
│ │ ├── check_errors_in_dataset.slurm
│ │ ├── divide_in_subshards.slurm
│ │ ├── divide_in_subshards_1000.slurm
│ │ ├── download_warc.slurm
│ │ ├── download_warc_too_big.slurm
│ │ ├── download_warc_trial_4.slurm
│ │ ├── download_warc_trial_5.slurm
│ │ ├── extract_text_and_html_metadata.slurm
│ │ ├── merge_seed_shards.slurm
│ │ ├── preprocess_warc.slurm
│ │ ├── redownload_warc.slurm
│ │ ├── shard_and_compress.slurm
│ │ └── shard_by_seed_id.slurm
│ └── sourcing_sheet_seeds
│ │ ├── README.md
│ │ ├── candidate_websites_for_crawling.csv
│ │ ├── cc-metrics.csv
│ │ ├── cc-metrics.ipynb
│ │ ├── cleanup-seeds.ipynb
│ │ ├── filtered_catalogue.json
│ │ ├── preprocess_dataset.ipynb
│ │ ├── seeds.csv
│ │ └── test_preprcessing_via_pyarrow_pandas.ipynb
├── seeds_batch_1_2
│ ├── 00_clean_dataset.slurm
│ └── 01_exact_deduplicates.slurm
└── seeds_batch_2
│ ├── .gitignore
│ ├── README.md
│ ├── slurm_scripts
│ ├── 01_download_warc.slurm
│ ├── 02_redownload_warc.slurm
│ ├── 02b_redownload_warc.slurm
│ ├── 03_check_errors_in_dataset.slurm
│ ├── 04_divide_in_subshards.slurm
│ ├── 05_preprocess_warc.slurm
│ ├── 06_extract_text_and_html_metadata.slurm
│ ├── 07_shard_by_seed_id.slurm
│ ├── 08_merge_seed_shards.slurm
│ ├── 09_shard_and_compress.slurm
│ └── 10_push_to_hub.slurm
│ └── sourcing_sheet_seeds
│ ├── cleanup-seeds.ipynb
│ ├── seeds.csv
│ ├── seeds_batch_2.csv
│ └── seeds_batch_2.json
├── index_search
├── README.md
├── datasets_ES_builder.py
├── datasets_ES_index.py
├── datasets_ES_search.py
├── datasets_remote_ES_IBMcloud.py
├── docker-compose.yml
└── requirements.txt
├── kenlm_training
├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── cc_net
│ ├── __init__.py
│ ├── __main__.py
│ ├── data
│ │ ├── cutoff.csv
│ │ └── test_stats.json
│ ├── dedup.py
│ ├── execution.py
│ ├── flat_hash_set.py
│ ├── get_hf_dataset.py
│ ├── get_wiki_cirrus.py
│ ├── jsonql.py
│ ├── mine.py
│ ├── minify.py
│ ├── perplexity.py
│ ├── process_wet_file.py
│ ├── regroup.py
│ ├── split_by_lang.py
│ ├── text_normalizer.py
│ ├── tokenizer.py
│ └── tools
│ │ ├── __init__.py
│ │ ├── dl_cc_100.py
│ │ ├── expand_corpus.py
│ │ └── make_dmoz_corpus.py
├── config
│ ├── lid_exp.json
│ ├── mine_segment.json
│ ├── test_reproduce.json
│ └── test_segment.json
├── pyproject.toml
├── setup.py
├── tests
│ ├── __init__.py
│ ├── conftest.py
│ ├── data
│ │ └── sample.warc.txt
│ ├── test_dedup.py
│ ├── test_flat_hash_set.py
│ ├── test_jsonql.py
│ ├── test_minify.py
│ ├── test_normalizer.py
│ ├── test_parse_wet_file.py
│ ├── test_regroup.py
│ └── test_transformer.py
└── train_all.sh
├── perplexity_lenses
├── README.md
├── app.py
├── cli.py
├── perplexity_lenses
│ ├── __init__.py
│ ├── data.py
│ ├── engine.py
│ ├── perplexity.py
│ └── visualization.py
├── poetry.lock
├── pyproject.toml
├── requirements.txt
└── tests
│ ├── __init__.py
│ └── test_data.py
├── pii-manager
├── .gitignore
├── CHANGES.md
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── doc
│ ├── contributing.md
│ ├── external.md
│ ├── tasks.md
│ └── usage.md
├── requirements.txt
├── setup.py
├── src
│ └── pii_manager
│ │ ├── __init__.py
│ │ ├── api
│ │ ├── __init__.py
│ │ ├── file.py
│ │ └── manager.py
│ │ ├── app
│ │ ├── __init__.py
│ │ ├── manage.py
│ │ └── task_info.py
│ │ ├── helper
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── context.py
│ │ ├── exception.py
│ │ ├── json.py
│ │ ├── normalizer.py
│ │ ├── taskdict.py
│ │ └── types.py
│ │ ├── lang
│ │ ├── __init__.py
│ │ ├── any
│ │ │ ├── __init__.py
│ │ │ ├── bitcoin_address.py
│ │ │ ├── credit_card.py
│ │ │ ├── email.py
│ │ │ └── ip_address.py
│ │ ├── en
│ │ │ ├── __init__.py
│ │ │ ├── any
│ │ │ │ ├── __init__.py
│ │ │ │ └── international_phone_number.py
│ │ │ ├── au
│ │ │ │ ├── __init__.py
│ │ │ │ ├── abn.py
│ │ │ │ └── tfn.py
│ │ │ ├── ca
│ │ │ │ ├── __init__.py
│ │ │ │ └── social_insurance_number.py
│ │ │ ├── in_
│ │ │ │ ├── __init__.py
│ │ │ │ └── aadhaar.py
│ │ │ └── us
│ │ │ │ ├── __init__.py
│ │ │ │ └── social_security_number.py
│ │ ├── es
│ │ │ ├── __init__.py
│ │ │ ├── any
│ │ │ │ ├── __init__.py
│ │ │ │ └── international_phone_number.py
│ │ │ ├── es
│ │ │ │ ├── __init__.py
│ │ │ │ ├── bank_account.py
│ │ │ │ └── govid.py
│ │ │ └── mx
│ │ │ │ ├── __init__.py
│ │ │ │ └── curp.py
│ │ ├── fr
│ │ │ ├── __init__.py
│ │ │ └── ca
│ │ │ │ ├── __init__.py
│ │ │ │ └── social_insurance_number.py
│ │ ├── pt
│ │ │ ├── __init__.py
│ │ │ ├── br
│ │ │ │ ├── __init__.py
│ │ │ │ └── cpf.py
│ │ │ └── pt
│ │ │ │ ├── __init__.py
│ │ │ │ └── govid.py
│ │ └── zh
│ │ │ ├── __init__.py
│ │ │ └── cn
│ │ │ ├── __init__.py
│ │ │ ├── gov_id.py
│ │ │ └── misc.py
│ │ ├── piientity.py
│ │ └── piienum.py
└── test
│ ├── data
│ ├── extract-block.ndjson
│ ├── extract-line.ndjson
│ ├── extract-sentence.ndjson
│ ├── full-block.ndjson
│ ├── full-line.ndjson
│ ├── full-sentence.ndjson
│ ├── orig.txt
│ ├── replace.txt
│ ├── tag.txt
│ ├── taskfile-error.json
│ └── taskfile.json
│ └── unit
│ ├── api
│ ├── test_file.py
│ ├── test_file_taskfile.py
│ ├── test_manager.py
│ ├── test_manager_add.py
│ └── test_manager_ctx.py
│ ├── helper
│ ├── test_base.py
│ ├── test_context.py
│ ├── test_norm.py
│ └── test_taskdict.py
│ └── lang
│ ├── any
│ ├── test_bitcoin_address.py
│ ├── test_credit_card.py
│ ├── test_email.py
│ └── test_ip_address.py
│ ├── en
│ ├── any
│ │ └── test_ipn_en.py
│ ├── au
│ │ ├── test_abn.py
│ │ └── test_tfn.py
│ ├── ca
│ │ └── test_sin.py
│ ├── in_
│ │ └── test_aadhaar.py
│ └── us
│ │ └── test_ssn.py
│ ├── es
│ ├── any
│ │ └── test_ipn_es.py
│ ├── es
│ │ ├── test_bank_account.py
│ │ └── test_govid_es_es.py
│ └── mx
│ │ └── test_govid_es_mx.py
│ ├── pt
│ ├── br
│ │ └── test_govid_pt_br.py
│ └── pt
│ │ └── test_govid_pt_pt.py
│ └── zh
│ └── cn
│ ├── test_govid_zh_cn.py
│ └── test_misc.py
├── poetry.lock
├── pyproject.toml
├── requirements.txt
└── tokenizer
├── python_script
├── dedup_exact_article.py
├── dedup_lines.py
├── ram_dedup_lines.py
└── requirements.txt
└── scripts
├── 01_remove_deplicated_lines.sh
├── 02_remove_duplicated_lines_dataset_with_dataset_source.sh
├── 03_remove_duplicated_lines_alpha.sh
├── 04_remove_duplicated_lines_alpha _memory.sh
├── 05_remove_duplicated_lines_alpha __v2_memory.sh
└── 06_dedup_exact_examples.sh
/.github/workflows/label-with-contact-neede.yml:
--------------------------------------------------------------------------------
1 | name: Label with contact needed
2 | on:
3 | issue_comment:
4 | types: created
5 | jobs:
6 | one:
7 | runs-on: ubuntu-latest
8 | if: >-
9 | (github.event.comment.body == '#contact' ||
10 | github.event.comment.body == '#contact-needed')
11 | steps:
12 | - run: |
13 | echo "Labeling issue ${{ github.event.issue.number }} with 'contact needed'"
14 | curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"labels": ["contact needed"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/labels
15 |
--------------------------------------------------------------------------------
/.github/workflows/label-with-help-wanted.yml:
--------------------------------------------------------------------------------
1 | name: Label with help wanted
2 | on:
3 | issue_comment:
4 | types: created
5 | jobs:
6 | one:
7 | runs-on: ubuntu-latest
8 | if: >-
9 | (github.event.comment.body == '#help' ||
10 | github.event.comment.body == '#help-wanted')
11 | steps:
12 | - run: |
13 | echo "Labeling issue ${{ github.event.issue.number }} with 'help wanted'"
14 | curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"labels": ["help wanted"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/labels
15 |
--------------------------------------------------------------------------------
/.github/workflows/pii-manager.yml:
--------------------------------------------------------------------------------
1 | on:
2 | pull_request:
3 | branches:
4 | - master
5 | paths:
6 | - 'pii-manager/src/**'
7 | - 'pii-manager/test/**'
8 | - 'pii-manager/setup.py'
9 | - 'pii-manager/Makefile'
10 | - 'pii-manager/requirements.txt'
11 |
12 | jobs:
13 | build:
14 | runs-on: ubuntu-latest
15 | strategy:
16 | max-parallel: 4
17 | matrix:
18 | python-version: [3.8]
19 |
20 | steps:
21 | - name: Set up Python ${{ matrix.python-version }}
22 | uses: actions/setup-python@v1
23 | with:
24 | python-version: ${{ matrix.python-version }}
25 | - name: Checkout main repository
26 | uses: actions/checkout@v2
27 | - name: Create venv
28 | run: |
29 | cd pii-manager
30 | VENV="$GITHUB_WORKSPACE/venv" make venv
31 | - name: Install package
32 | run: |
33 | cd pii-manager
34 | VENV="$GITHUB_WORKSPACE/venv" make install
35 | - name: Test with pytest
36 | run: |
37 | cd pii-manager
38 | VENV="$GITHUB_WORKSPACE/venv" make unit-verbose
39 |
--------------------------------------------------------------------------------
/.github/workflows/self-assign.yaml:
--------------------------------------------------------------------------------
1 | name: Self-assign
2 | on:
3 | issue_comment:
4 | types: created
5 | jobs:
6 | one:
7 | runs-on: ubuntu-latest
8 | if: >-
9 | (github.event.comment.body == '#take' ||
10 | github.event.comment.body == '#self-assign')
11 | steps:
12 | - run: |
13 | echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}"
14 | curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees
15 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "ac_dc/muliwai"]
2 | path = ac_dc/muliwai
3 | url = https://github.com/ontocord/muliwai
4 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # To use:
2 | #
3 | # pre-commit run -a
4 | #
5 | # Or:
6 | #
7 | # pre-commit install # (runs every time you commit in git)
8 | #
9 | # To update this file:
10 | #
11 | # pre-commit autoupdate
12 | #
13 | # See https://github.com/pre-commit/pre-commit
14 |
15 | repos:
16 | # Standard hooks
17 | - repo: https://github.com/pre-commit/pre-commit-hooks
18 | rev: v4.2.0
19 | hooks:
20 | - id: check-added-large-files
21 | - id: check-case-conflict
22 | - id: check-docstring-first
23 | exclude: ^pii_processing/
24 | - id: check-merge-conflict
25 | - id: check-symlinks
26 | - id: check-toml
27 | - id: check-yaml
28 | - id: debug-statements
29 | exclude: ^pii_processing/
30 | - id: end-of-file-fixer
31 | exclude: ^pii_processing/
32 | - id: mixed-line-ending
33 | - id: requirements-txt-fixer
34 | - id: trailing-whitespace
35 | exclude: ^pii_processing/
36 |
37 | - repo: https://github.com/asottile/pyupgrade
38 | rev: v2.32.1
39 | hooks:
40 | - id: pyupgrade
41 | exclude: ^pii_processing/
42 |
43 | #- repo: https://github.com/PyCQA/isort
44 | # rev: 5.10.0
45 | # hooks:
46 | # - id: isort
47 |
48 | # Black, the code formatter, natively supports pre-commit
49 | - repo: https://github.com/psf/black
50 | rev: 22.3.0 # Keep in sync with blacken-docs
51 | hooks:
52 | - id: black
53 | exclude: ^pii_processing/
54 |
55 | # Changes tabs to spaces
56 | - repo: https://github.com/Lucas-C/pre-commit-hooks
57 | rev: v1.1.14
58 | hooks:
59 | - id: remove-tabs
60 | exclude: ^(pii_processing|.*Makefile)
61 |
62 | - repo: https://github.com/shellcheck-py/shellcheck-py
63 | rev: v0.8.0.4
64 | hooks:
65 | - id: shellcheck
66 | exclude: ^(pii_processing/|cc_pseudo_crawl)
67 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: init
2 | init:
3 | poetry install --extras "torch"
4 | pre-commit install
5 |
6 | .PHONY: format
7 | format:
8 | pre-commit run -a
9 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/__init__.py
--------------------------------------------------------------------------------
/ac_dc/anonymization.py:
--------------------------------------------------------------------------------
1 | from muliwai.pii_regexes import detect_ner_with_regex_and_context
2 | from muliwai.pii_regexes import regex_rulebase
3 |
4 | trannum = str.maketrans("0123456789", "1111111111")
5 |
6 |
7 | def apply_regex_anonymization(
8 | sentence: str,
9 | lang_id: str,
10 | context_window: int = 20,
11 | anonymize_condition=None,
12 | tag_type={"IP_ADDRESS", "KEY", "ID", "PHONE", "USER", "EMAIL", "LICENSE_PLATE"},
13 | ) -> str:
14 | """
15 | Params:
16 | ==================
17 | sentence: str, the sentence to be anonymized
18 | lang_id: str, the language id of the sentence
19 | context_window: int, the context window size
20 | anonymize_condition: function, the anonymization condition
21 | tag_type: iterable, the tag types of the anonymization. All keys in regex_rulebase is None
22 | """
23 | if tag_type == None:
24 | tag_type = regex_rulebase.keys()
25 | lang_id = lang_id.split("_")[0]
26 | ner = detect_ner_with_regex_and_context(
27 | sentence=sentence,
28 | src_lang=lang_id,
29 | context_window=context_window,
30 | tag_type=tag_type,
31 | )
32 | if anonymize_condition:
33 | for (ent, start, end, tag) in ner:
34 | # we need to actually walk through and replace by start, end span.
35 | sentence = sentence.replace(ent, f" <{tag}> ")
36 | return sentence, ner
37 |
--------------------------------------------------------------------------------
/ac_dc/deduplicate/conf/self_deduplicate_ar.yaml:
--------------------------------------------------------------------------------
1 | tokenization: "character" # character, punctuation or space
2 | window_size: 6 # size of the token window, average arabic word length is 5
3 | hamming_distance: 4 # similarity threshold out of 64 bits
4 | num_blocks: 6 # must be larger than the hamming_distance
5 | ignore_punctuation: true # ignore punctuation when hashing, cannot be true when punctuation is used for tokenization
6 | lowercase: true # lowercase the text when hashing
7 | text_column: "text" # column name for the text to be hashed
8 | index_column: "id" # column name for the index
9 | num_proc: 96 # number of processes to run when hashing
10 | load_dataset:
11 | path: null
12 | name: null
13 | split: null
14 | use_auth_token: false
15 | load_from_disk:
16 | path: "data/oscar_filtered_final/ar"
17 | gcs: null
18 | cache: "outputs/ar_cache"
19 | output: "outputs/ar"
20 |
--------------------------------------------------------------------------------
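These per-language deduplication configs are plain YAML consumed by ac_dc/deduplicate/self_deduplicate.py. A minimal sketch of reading one of them, assuming nothing beyond PyYAML (the actual script may use a different config loader):

```python
# Illustrative only: load a self-deduplication config with PyYAML.
import yaml

with open("ac_dc/deduplicate/conf/self_deduplicate_ar.yaml") as f:
    cfg = yaml.safe_load(f)

# As the inline comments state, num_blocks must exceed hamming_distance
# for the SimHash block-permutation matching to work.
assert cfg["num_blocks"] > cfg["hamming_distance"]
print(cfg["tokenization"], cfg["window_size"], cfg["load_from_disk"]["path"])
```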
/ac_dc/deduplicate/conf/self_deduplicate_bn.yaml:
--------------------------------------------------------------------------------
1 | tokenization: "character" # character, punctuation or space
2 | window_size: 6 # size of the token window
3 | hamming_distance: 4 # similarity threshold out of 64 bits
4 | num_blocks: 6 # must be larger than the hamming_distance
5 | ignore_punctuation: true # ignore punctuation when hashing, cannot be true when punctuation is used for tokenization
6 | lowercase: true # lowercase the text when hashing
7 | text_column: "text" # column name for the text to be hashed
8 | index_column: "id" # column name for the index
9 | num_proc: 96 # number of processes to run when hashing
10 | load_dataset:
11 | path: null
12 | name: null
13 | split: null
14 | use_auth_token: false
15 | load_from_disk:
16 | path: "data/oscar_filtered_final/bn"
17 | gcs: null
18 | cache: "outputs/bn_cache"
19 | output: "outputs/bn"
20 |
--------------------------------------------------------------------------------
/ac_dc/deduplicate/conf/self_deduplicate_ca.yaml:
--------------------------------------------------------------------------------
1 | tokenization: "character" # character, punctuation or space
2 | window_size: 6 # size of the token window
3 | hamming_distance: 4 # similarity threshold out of 64 bits
4 | num_blocks: 6 # must be larger than the hamming_distance
5 | ignore_punctuation: true # ignore punctuation when hashing, cannot be true when punctuation is used for tokenization
6 | lowercase: true # lowercase the text when hashing
7 | text_column: "text" # column name for the text to be hashed
8 | index_column: "id" # column name for the index
9 | num_proc: 96 # number of processes to run when hashing
10 | load_dataset:
11 | path: null
12 | name: null
13 | split: null
14 | use_auth_token: false
15 | load_from_disk:
16 | path: "data/oscar_filtered_final/ca"
17 | gcs: null
18 | cache: "outputs/ca_cache"
19 | output: "outputs/ca"
20 |
--------------------------------------------------------------------------------
/ac_dc/deduplicate/conf/self_deduplicate_en.yaml:
--------------------------------------------------------------------------------
1 | tokenization: "space" # character, punctuation or space
2 | window_size: 6 # size of the token window
3 | hamming_distance: 4 # similarity threshold out of 64 bits
4 | num_blocks: 6 # must be larger than the hamming_distance
5 | ignore_punctuation: true # ignore punctuation when hashing, cannot be true when punctuation is used for tokenization
6 | lowercase: true # lowercase the text when hashing
7 | text_column: "text" # column name for the text to be hashed
8 | index_column: "id" # column name for the index
9 | num_proc: 96 # number of processes to run when hashing
10 | load_dataset:
11 | path: null
12 | name: null
13 | split: null
14 | use_auth_token: false
15 | load_from_disk:
16 | path: "data/oscar_filtered_final/en"
17 | gcs: null
18 | cache: "outputs/en_cache"
19 | output: "outputs/en"
20 |
--------------------------------------------------------------------------------
/ac_dc/deduplicate/conf/self_deduplicate_es.yaml:
--------------------------------------------------------------------------------
1 | tokenization: "character" # character, punctuation or space
2 | window_size: 6 # size of the token window
3 | hamming_distance: 4 # similarity threshold out of 64 bits
4 | num_blocks: 6 # must be larger than the hamming_distance
5 | ignore_punctuation: true # ignore punctuation when hashing, cannot be true when punctuation is used for tokenization
6 | lowercase: true # lowercase the text when hashing
7 | text_column: "text" # column name for the text to be hashed
8 | index_column: "id" # column name for the index
9 | num_proc: 96 # number of processes to run when hashing
10 | load_dataset:
11 | path: null
12 | name: null
13 | split: null
14 | use_auth_token: false
15 | load_from_disk:
16 | path: "data/oscar_filtered_final/es"
17 | gcs: null
18 | cache: "outputs/es_cache"
19 | output: "outputs/es"
20 |
--------------------------------------------------------------------------------
/ac_dc/deduplicate/conf/self_deduplicate_eu.yaml:
--------------------------------------------------------------------------------
1 | tokenization: "character" # character, punctuation or space
2 | window_size: 6 # size of the token window
3 | hamming_distance: 4 # similarity threshold out of 64 bits
4 | num_blocks: 6 # must be larger than the hamming_distance
5 | ignore_punctuation: true # ignore punctuation when hashing, cannot be true when punctuation is used for tokenization
6 | lowercase: true # lowercase the text when hashing
7 | text_column: "text" # column name for the text to be hashed
8 | index_column: "id" # column name for the index
9 | num_proc: 96 # number of processes to run when hashing
10 | load_dataset:
11 | path: null
12 | name: null
13 | split: null
14 | use_auth_token: false
15 | load_from_disk:
16 | path: "data/oscar_filtered_final/eu"
17 | gcs: null
18 | cache: "outputs/eu_cache"
19 | output: "outputs/eu"
20 |
--------------------------------------------------------------------------------
/ac_dc/deduplicate/conf/self_deduplicate_fr.yaml:
--------------------------------------------------------------------------------
1 | tokenization: "character" # character, punctuation or space
2 | window_size: 6 # size of the token window
3 | hamming_distance: 4 # similarity threshold out of 64 bits
4 | num_blocks: 6 # must be larger than the hamming_distance
5 | ignore_punctuation: true # ignore punctuation when hashing, cannot be true when punctuation is used for tokenization
6 | lowercase: true # lowercase the text when hashing
7 | text_column: "text" # column name for the text to be hashed
8 | index_column: "id" # column name for the index
9 | num_proc: 96 # number of processes to run when hashing
10 | load_dataset:
11 | path: null
12 | name: null
13 | split: null
14 | use_auth_token: false
15 | load_from_disk:
16 | path: "data/oscar_filtered_final/fr"
17 | gcs: null
18 | cache: "outputs/fr_cache"
19 | output: "outputs/fr"
20 |
--------------------------------------------------------------------------------
/ac_dc/deduplicate/conf/self_deduplicate_gl.yaml:
--------------------------------------------------------------------------------
1 | tokenization: "character" # character, punctuation or space
2 | window_size: 4 # size of the token window
3 | hamming_distance: 7 # similarity threshold out of 64 bits
4 | num_blocks: 8 # must be larger than the hamming_distance
5 | ignore_punctuation: true # ignore punctuation when hashing, cannot be true when punctuation is used for tokenization
6 | lowercase: true # lowercase the text when hashing
7 | text_column: "text" # column name for the text to be hashed
8 | index_column: "id" # column name for the index
9 | num_proc: 80 # number of processes to run when hashing
10 | load_dataset:
11 | path: "oscar-corpus/OSCAR-2109"
12 | name: "deduplicated_gl"
13 | split: "train"
14 | use_auth_token: true
15 | load_from_disk:
16 | path: null
17 | gcs: null
18 | cache: "outputs/gl_cache"
19 | output: "outputs/gl"
20 |
--------------------------------------------------------------------------------
/ac_dc/deduplicate/conf/self_deduplicate_hi.yaml:
--------------------------------------------------------------------------------
1 | tokenization: "character" # character, punctuation or space
2 | window_size: 6 # size of the token window
3 | hamming_distance: 4 # similarity threshold out of 64 bits
4 | num_blocks: 6 # must be larger than the hamming_distance
5 | ignore_punctuation: true # ignore punctuation when hashing, cannot be true when punctuation is used for tokenization
6 | lowercase: true # lowercase the text when hashing
7 | text_column: "text" # column name for the text to be hashed
8 | index_column: "id" # column name for the index
9 | num_proc: 96 # number of processes to run when hashing
10 | load_dataset:
11 | path: null
12 | name: null
13 | split: null
14 | use_auth_token: false
15 | load_from_disk:
16 | path: "data/oscar_filtered_final/hi"
17 | gcs: null
18 | cache: "outputs/hi_cache"
19 | output: "outputs/hi"
20 |
--------------------------------------------------------------------------------
/ac_dc/deduplicate/conf/self_deduplicate_id.yaml:
--------------------------------------------------------------------------------
1 | tokenization: "character" # character, punctuation or space
2 | window_size: 6 # size of the token window
3 | hamming_distance: 4 # similarity threshold out of 64 bits
4 | num_blocks: 6 # must be larger than the hamming_distance
5 | ignore_punctuation: true # ignore punctuation when hashing, cannot be true when punctuation is used for tokenization
6 | lowercase: true # lowercase the text when hashing
7 | text_column: "text" # column name for the text to be hashed
8 | index_column: "id" # column name for the index
9 | num_proc: 96 # number of processes to run when hashing
10 | load_dataset:
11 | path: null
12 | name: null
13 | split: null
14 | use_auth_token: false
15 | load_from_disk:
16 | path: "data/oscar_filtered_final/id"
17 | gcs: null
18 | cache: "outputs/id_cache"
19 | output: "outputs/id"
20 |
--------------------------------------------------------------------------------
/ac_dc/deduplicate/conf/self_deduplicate_pt.yaml:
--------------------------------------------------------------------------------
1 | tokenization: "character" # character, punctuation or space
2 | window_size: 6 # size of the token window
3 | hamming_distance: 4 # similarity threshold out of 64 bits
4 | num_blocks: 6 # must be larger than the hamming_distance
5 | ignore_punctuation: true # ignore punctuation when hashing, cannot be true when punctuation is used for tokenization
6 | lowercase: true # lowercase the text when hashing
7 | text_column: "text" # column name for the text to be hashed
8 | index_column: "id" # column name for the index
9 | num_proc: 96 # number of processes to run when hashing
10 | load_dataset:
11 | path: null
12 | name: null
13 | split: null
14 | use_auth_token: false
15 | load_from_disk:
16 | path: "data/oscar_filtered_final/pt"
17 | gcs: null
18 | cache: "outputs/pt_cache"
19 | output: "outputs/pt"
20 |
--------------------------------------------------------------------------------
/ac_dc/deduplicate/conf/self_deduplicate_ur.yaml:
--------------------------------------------------------------------------------
1 | tokenization: "character" # character, punctuation or space
2 | window_size: 6 # size of the token window
3 | hamming_distance: 4 # similarity threshold out of 64 bits
4 | num_blocks: 6 # must be larger than the hamming_distance
5 | ignore_punctuation: true # ignore punctuation when hashing, cannot be true when punctuation is used for tokenization
6 | lowercase: true # lowercase the text when hashing
7 | text_column: "text" # column name for the text to be hashed
8 | index_column: "id" # column name for the index
9 | num_proc: 96 # number of processes to run when hashing
10 | load_dataset:
11 | path: null
12 | name: null
13 | split: null
14 | use_auth_token: false
15 | load_from_disk:
16 | path: "data/oscar_filtered_final/ur"
17 | gcs: null
18 | cache: "outputs/ur_cache"
19 | output: "outputs/ur"
20 |
--------------------------------------------------------------------------------
/ac_dc/deduplicate/conf/self_deduplicate_vi.yaml:
--------------------------------------------------------------------------------
1 | tokenization: "character" # character, punctuation or space
2 | window_size: 6 # size of the token window
3 | hamming_distance: 4 # similarity threshold out of 64 bits
4 | num_blocks: 6 # must be larger than the hamming_distance
5 | ignore_punctuation: true # ignore punctuation when hashing, cannot be true when punctuation is used for tokenization
6 | lowercase: true # lowercase the text when hashing
7 | text_column: "text" # column name for the text to be hashed
8 | index_column: "id" # column name for the index
9 | num_proc: 96 # number of processes to run when hashing
10 | load_dataset:
11 | path: null
12 | name: null
13 | split: null
14 | use_auth_token: false
15 | load_from_disk:
16 | path: "data/oscar_filtered_final/vi"
17 | gcs: null
18 | cache: "outputs/vi_cache"
19 | output: "outputs/vi"
20 |
--------------------------------------------------------------------------------
/ac_dc/deduplicate/conf/self_deduplicate_zh.yaml:
--------------------------------------------------------------------------------
1 | tokenization: "character" # character, punctuation or space
2 | window_size: 6 # size of the token window
3 | hamming_distance: 4 # similarity threshold out of 64 bits
4 | num_blocks: 6 # must be larger than the hamming_distance
5 | ignore_punctuation: true # ignore punctuation when hashing, cannot be true when punctuation is used for tokenization
6 | lowercase: true # lowercase the text when hashing
7 | text_column: "text" # column name for the text to be hashed
8 | index_column: "id" # column name for the index
9 | num_proc: 96 # number of processes to run when hashing
10 | load_dataset:
11 | path: null
12 | name: null
13 | split: null
14 | use_auth_token: false
15 | load_from_disk:
16 | path: "data/oscar_filtered_final/zh"
17 | gcs: null
18 | cache: "outputs/zh_cache"
19 | output: "outputs/zh"
20 |
--------------------------------------------------------------------------------
/ac_dc/deduplicate/deduplicate/__init__.py:
--------------------------------------------------------------------------------
1 | import regex as re
2 |
3 | PUNCTUATION_REGEX = re.compile(r"\p{P}")
4 | INTERNAL_HASH = "__dedup_hash__"
5 |
--------------------------------------------------------------------------------
/ac_dc/deduplicate/deduplicate/util.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | import numpy as np
4 | import simhash
5 |
6 | from . import INTERNAL_HASH, PUNCTUATION_REGEX
7 |
8 |
9 | def hashing(
10 | record,
11 | column: str = "text",
12 | tokenization: str = "character",
13 | window_size: int = 4,
14 | ignore_punctuation: bool = True,
15 | lowercase: bool = True,
16 | output: str = INTERNAL_HASH,
17 | ) -> Dict[str, int]:
18 | """Hashing a document with SimHash.
19 |
20 | Parameters
21 | ----------
22 | record : dict
23 | A dict of feature-value pairs
24 | column : str, optional
25 | The column name to use for hashing, by default "text"
26 | tokenization : str, optional
27 | Method to use for tokenization, by default "character"
28 | window_size : int, optional
29 | The size of the token window, by default 4
30 | ignore_punctuation : bool, optional
31 | To ignore punctuation or not, by default True
32 | lowercase : bool, optional
33 | To lowercase the text or not, by default True
34 |
35 | Returns
36 | -------
37 | Dict[str, int]
38 | The new hash feature column
39 |
40 | Raises
41 | ------
42 | Exception
43 | Unrecognized tokenization parameter
44 | """
45 | document = record[column]
46 | if lowercase:
47 | document = document.lower()
48 |
49 | if ignore_punctuation:
50 | document = PUNCTUATION_REGEX.sub("", document)
51 |
52 | if tokenization == "character":
53 | tokens = [
54 | str.encode(document[i : i + window_size])
55 | for i in range(len(document) - window_size)
56 | ]
57 | elif tokenization == "punctuation":
58 | tokens = PUNCTUATION_REGEX.split(document)
59 | tokens = [
60 | str.encode(" ".join(tokens[i : i + window_size]))
61 | for i in range(len(tokens) - window_size)
62 | ]
63 | elif tokenization == "space":
64 | tokens = document.split(" ")
65 | tokens = [
66 | str.encode(" ".join(tokens[i : i + window_size]))
67 | for i in range(len(tokens) - window_size)
68 | ]
69 | else:
70 | raise Exception(f"Unrecognized tokenization parameter {tokenization}")
71 |
72 | return {output: np.uint64(simhash.compute(map(simhash.unsigned_hash, tokens)))}
73 |
--------------------------------------------------------------------------------
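A hypothetical single-record call of the hashing helper above (the real pipeline presumably maps it over a datasets.Dataset using the column, tokenization, and window_size values from the YAML configs; this sketch assumes it is run from ac_dc/deduplicate/ with the simhash package installed):

```python
# Usage sketch for deduplicate.util.hashing; not part of the repository.
from deduplicate import INTERNAL_HASH
from deduplicate.util import hashing

record = {"text": "Some document text to fingerprint with SimHash."}
hashed = hashing(record, column="text", tokenization="character", window_size=4)
print(hashed[INTERNAL_HASH])  # one 64-bit SimHash fingerprint (np.uint64)
```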
/ac_dc/download_sentencepiece_kenlm_models.py:
--------------------------------------------------------------------------------
1 | """Download Sentencepiece and KenLM models for supported languages.
2 |
3 | Usage:
4 | python download_sentencepiece_kenlm_models.py --output_dir_path /tmp/
5 |
6 | All Sentencepiece and KenLM language models will be saved under /tmp.
7 | """
8 |
9 | import argparse
10 | import subprocess
11 |
12 | from languages_id import langs_id
13 |
14 |
15 | def download_sentencepiece_kenlm_models(output_dir_path: str) -> None:
16 | supported_sentencepiece_langs = langs_id["sentencepiece_id"].dropna().unique()
17 | for lang in supported_sentencepiece_langs:
18 | try:
19 | output_sentencepiece = subprocess.check_output(
20 | f"wget https://huggingface.co/edugp/kenlm/resolve/main/wikipedia/{lang}.sp.model -P {output_dir_path}", # http://dl.fbaipublicfiles.com/cc_net/lm/{lang}.sp.model for FB models
21 | shell=True,
22 | )
23 | except subprocess.CalledProcessError:
24 | print(
25 | f"Warning: Download failed for Sentencepiece model for language {lang}."
26 | )
27 |
28 | supported_kenlm_langs = langs_id["kenlm_id"].dropna().unique()
29 | for lang in supported_kenlm_langs:
30 | try:
31 | output_kenlm = subprocess.check_output(
32 | f"wget https://huggingface.co/edugp/kenlm/resolve/main/wikipedia/{lang}.arpa.bin -P {output_dir_path}", # http://dl.fbaipublicfiles.com/cc_net/lm/{lang}.arpa.bin for FB models
33 | shell=True,
34 | )
35 | except subprocess.CalledProcessError:
36 | print(f"Warning: Download failed for KenLM model for language {lang}.")
37 |
38 |
39 | if __name__ == "__main__":
40 | parser = argparse.ArgumentParser(
41 | description="Download Sentencepiece and KenLM models for supported languages."
42 | )
43 | parser.add_argument(
44 | "--output_dir_path",
45 | type=str,
46 | default="/tmp/",
47 | help="Output directory path to save models.",
48 | )
49 | args = parser.parse_args()
50 |
51 | download_sentencepiece_kenlm_models(output_dir_path=args.output_dir_path)
52 |
--------------------------------------------------------------------------------
/ac_dc/explanation_filtering_pipeline.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/ac_dc/explanation_filtering_pipeline.pdf
--------------------------------------------------------------------------------
/ac_dc/normalization.py:
--------------------------------------------------------------------------------
1 | import re
2 | from typing import Dict
3 |
4 |
5 | non_printing_characters_re = re.compile(
6 | f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]"
7 | )
8 |
9 | digits_re: re.Pattern = re.compile(r"\d")
10 |
11 | unicode_punctuation: Dict[str, str] = {
12 | ",": ",",
13 | "。": ".",
14 | "、": ",",
15 | "„": '"',
16 | "”": '"',
17 | "“": '"',
18 | "«": '"',
19 | "»": '"',
20 | "1": '"',
21 | "」": '"',
22 | "「": '"',
23 | "《": '"',
24 | "》": '"',
25 | "´": "'",
26 | "∶": ":",
27 | ":": ":",
28 | "?": "?",
29 | "!": "!",
30 | "(": "(",
31 | ")": ")",
32 | ";": ";",
33 | "–": "-",
34 | "—": " - ",
35 | ".": ". ",
36 | "~": "~",
37 | "’": "'",
38 | "…": "...",
39 | "━": "-",
40 | "〈": "<",
41 | "〉": ">",
42 | "【": "[",
43 | "】": "]",
44 | "%": "%",
45 | "►": "-",
46 | }
47 |
48 | normalization = {
49 | "non_printing_characters_re": non_printing_characters_re,
50 | "digits_re": digits_re,
51 | "unicode_punctuation": unicode_punctuation,
52 | }
53 |
--------------------------------------------------------------------------------
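normalization.py only exposes compiled patterns and a punctuation mapping; a minimal sketch of applying them to a string (illustrative only, the filtering code that actually uses this dict lives elsewhere in ac_dc):

```python
# Illustrative use of the normalization primitives defined above,
# assuming this is run from the ac_dc/ directory.
from normalization import normalization

text = "«Prix:100 €…»"
# Map unicode punctuation to its ASCII counterpart (values may be multi-character).
text = "".join(normalization["unicode_punctuation"].get(ch, ch) for ch in text)
# Optionally mask digits and strip non-printing characters.
text = normalization["digits_re"].sub("0", text)
text = normalization["non_printing_characters_re"].sub("", text)
print(text)  # prints "Prix:000 €..."
```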
/ac_dc/person_and_id_anonymization.py:
--------------------------------------------------------------------------------
1 | from muliwai.regex_manager import detect_ner_with_regex_and_context
2 | from muliwai.pii_regexes_rulebase import regex_rulebase
3 | from muliwai.ner_manager import detect_ner_with_hf_model
4 | from muliwai.faker_manager import augment_anonymize
5 |
6 |
7 | def apply_anonymization(
8 | sentence: str,
9 | lang_id: str,
10 | context_window: int = 20,
11 | anonymize_condition=None,
12 | tag_type={
13 | "IP_ADDRESS",
14 | "KEY",
15 | "ID",
16 | "PHONE",
17 | "USER",
18 | "EMAIL",
19 | "LICENSE_PLATE",
20 | "PERSON",
21 | },
22 | device: str = "cpu",
23 | ) -> str:
24 | """
25 | Params:
26 | ==================
27 | sentence: str, the sentence to be anonymized
28 | lang_id: str, the language id of the sentence
29 | context_window: int, the context window size
30 | anonymize_condition: function, the anonymization condition
31 | tag_type: iterable, the tag types of the anonymization. By default: {'IP_ADDRESS', 'KEY', 'ID', 'PHONE', 'USER', 'EMAIL', 'LICENSE_PLATE', 'PERSON'}
32 | device: cpu or cuda:{device_id}
33 |
34 | """
35 | if tag_type is None:
36 | tag_type = regex_rulebase.keys()
37 | lang_id = lang_id.split("_")[0]
38 | ner_ids = detect_ner_with_regex_and_context(
39 | sentence=sentence,
40 | src_lang=lang_id,
41 | context_window=context_window,
42 | tag_type=tag_type,
43 | )
44 | ner_persons = detect_ner_with_hf_model(
45 | sentence=sentence,
46 | src_lang=lang_id,
47 | device=device,
48 | )
49 | ner = ner_ids + ner_persons
50 | if anonymize_condition:
51 | new_sentence, new_ner, _ = augment_anonymize(
52 | sentence,
53 | lang_id,
54 | ner,
55 | )
56 | doc = {
57 | "text": new_sentence,
58 | "ner": new_ner,
59 | "orig_text": sentence,
60 | "orig_ner": ner,
61 | }
62 | else:
63 | new_sentence = sentence
64 | doc = {"text": new_sentence, "ner": ner}
65 | return new_sentence, doc
66 |
--------------------------------------------------------------------------------
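Unlike apply_regex_anonymization, this combined regex + NER-model variant has no test file in this snapshot. A hedged usage sketch (run from ac_dc/ with the muliwai submodule available; detect_ner_with_hf_model downloads a Hugging Face NER model on first use, and the example sentence is illustrative only):

```python
# Hypothetical usage of apply_anonymization; not part of the repository tests.
from person_and_id_anonymization import apply_anonymization

sentence = "John Smith's phone number is 555-123-4567."
new_sentence, doc = apply_anonymization(
    sentence=sentence,
    lang_id="en",
    anonymize_condition=True,  # triggers augment_anonymize() above
    device="cpu",
)
print(new_sentence)        # anonymized text
print(doc["orig_ner"])     # spans detected before anonymization
```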
/ac_dc/test_anonymization.py:
--------------------------------------------------------------------------------
1 | import random
2 | from anonymization import apply_regex_anonymization
3 | from faker import Faker
4 | from num2words import num2words
5 |
6 | # We may need to include other test scenarios
7 | # Wherever possible, test with faker
8 |
9 |
10 | def main():
11 | test_suite = {"English": test_en, "Chinese": test_zh}
12 | for language, test_func in test_suite.items():
13 | print("Testing {}".format(language))
14 | test_func()
15 | print("==================================================")
16 |
17 |
18 | def test_en():
19 | fake = Faker("en_US")
20 | sentences = [
21 | f"I am {num2words(random.randint(0,120))} years old, and she is {random.randint(0,120)} year-old", # Age
22 | f"Sherry lives at {fake.street_address()}", # Address
23 | f"My dad is a cancer fighter. Her grandma is suffering from alzheimer's", # Disease
24 | f"Let me tell you a secret, Mr. Nguyen's SSN is {fake.ssn() if random.choice([True, False]) else fake.ssn().replace('-', '')}.", # Government ID
25 | f"Dear Ian, the payment through {fake.credit_card_number()} has been successfully executed.", # Credit card
26 | ]
27 | for sentence in sentences:
28 | print(
29 | apply_regex_anonymization(
30 | sentence=sentence, lang_id="en", anonymize_condition=True
31 | )
32 | )
33 |
34 |
35 | def test_zh():
36 | fake = Faker("zh_CN")
37 | sentences = [
38 | f'我今年{num2words(random.randint(0,120), lang="ja")}歲, 而她去年{random.randint(0,120)}岁', # Age
39 | f"我住在{fake.street_address()}", # Address
40 | f"我爸是抗癌戰士。她奶奶有老人癡呆", # Disease
41 | f"李雪妮小姐331125198402010129", # Government ID
42 | f"先生,信用卡号{fake.credit_card_number()}已缴费成功", # Credit card
43 | ]
44 | for sentence in sentences:
45 | print(
46 | apply_regex_anonymization(
47 | sentence=sentence, lang_id="zh", anonymize_condition=True
48 | )
49 | )
50 |
51 |
52 | if __name__ == "__main__":
53 | main()
54 |
--------------------------------------------------------------------------------
/ac_dc/visualization/README.md:
--------------------------------------------------------------------------------
1 | # Visualization tool
2 |
3 | Use this visualization tool online at https://huggingface.co/spaces/huggingface/text-data-filtering.
4 |
5 | However, running the code on your own computer is faster, can handle up to about three times as many documents in practice, and works for every language.
6 |
7 | 1) Run get_data_for_visualization.py to build a JSON file of examples with their computed statistics for the language you chose.
8 | It uses the streaming mode of the Datasets library, so there is no need to download the dataset, but you do have to download the fastText model (for language identification) and the SentencePiece / KenLM models (for tokenization and perplexity).
9 |
10 | 2) Specify the paths to this JSON file and to the fastText / SentencePiece / KenLM models in visualization.py, then run the command "streamlit run ac_dc/visualization/visualization.py".
11 |
--------------------------------------------------------------------------------
/bertin/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "architectures": [
3 | "RobertaForMaskedLM"
4 | ],
5 | "attention_probs_dropout_prob": 0.1,
6 | "bos_token_id": 0,
7 | "eos_token_id": 2,
8 | "gradient_checkpointing": false,
9 | "hidden_act": "gelu",
10 | "hidden_dropout_prob": 0.1,
11 | "hidden_size": 768,
12 | "initializer_range": 0.02,
13 | "intermediate_size": 3072,
14 | "layer_norm_eps": 1e-05,
15 | "max_position_embeddings": 514,
16 | "model_type": "roberta",
17 | "num_attention_heads": 12,
18 | "num_hidden_layers": 12,
19 | "pad_token_id": 1,
20 | "position_embedding_type": "absolute",
21 | "transformers_version": "4.9.0.dev0",
22 | "type_vocab_size": 1,
23 | "use_cache": true,
24 | "vocab_size": 50265
25 | }
26 |
--------------------------------------------------------------------------------
/bertin/config.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from transformers import RobertaConfig
3 |
4 | config = RobertaConfig.from_pretrained("roberta-large")
5 | config.save_pretrained("./configs/large")
6 |
7 | config = RobertaConfig.from_pretrained("roberta-base")
8 | config.save_pretrained("./configs/base")
9 |
--------------------------------------------------------------------------------
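A small sanity check (not in the repository) showing that the configs written by config.py can be reloaded and match the JSON files listed below, e.g. hidden_size 768 for base and 1024 for large:

```python
# Hypothetical check: reload the configs saved by config.py above.
from transformers import RobertaConfig

base = RobertaConfig.from_pretrained("./configs/base")
assert base.hidden_size == 768 and base.num_hidden_layers == 12

large = RobertaConfig.from_pretrained("./configs/large")
assert large.hidden_size == 1024 and large.num_hidden_layers == 24
```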
/bertin/configs/base/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "architectures": [
3 | "RobertaForMaskedLM"
4 | ],
5 | "attention_probs_dropout_prob": 0.1,
6 | "bos_token_id": 0,
7 | "eos_token_id": 2,
8 | "gradient_checkpointing": false,
9 | "hidden_act": "gelu",
10 | "hidden_dropout_prob": 0.1,
11 | "hidden_size": 768,
12 | "initializer_range": 0.02,
13 | "intermediate_size": 3072,
14 | "layer_norm_eps": 1e-05,
15 | "max_position_embeddings": 514,
16 | "model_type": "roberta",
17 | "num_attention_heads": 12,
18 | "num_hidden_layers": 12,
19 | "pad_token_id": 1,
20 | "position_embedding_type": "absolute",
21 | "transformers_version": "4.9.0.dev0",
22 | "type_vocab_size": 1,
23 | "use_cache": true,
24 | "vocab_size": 50265
25 | }
26 |
--------------------------------------------------------------------------------
/bertin/configs/large/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "architectures": [
3 | "RobertaForMaskedLM"
4 | ],
5 | "attention_probs_dropout_prob": 0.1,
6 | "bos_token_id": 0,
7 | "eos_token_id": 2,
8 | "gradient_checkpointing": false,
9 | "hidden_act": "gelu",
10 | "hidden_dropout_prob": 0.1,
11 | "hidden_size": 1024,
12 | "initializer_range": 0.02,
13 | "intermediate_size": 4096,
14 | "layer_norm_eps": 1e-05,
15 | "max_position_embeddings": 514,
16 | "model_type": "roberta",
17 | "num_attention_heads": 16,
18 | "num_hidden_layers": 24,
19 | "pad_token_id": 1,
20 | "position_embedding_type": "absolute",
21 | "transformers_version": "4.9.0.dev0",
22 | "type_vocab_size": 1,
23 | "use_cache": true,
24 | "vocab_size": 50265
25 | }
26 |
--------------------------------------------------------------------------------
/bertin/convert.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import tempfile
3 |
4 | import jax
5 | from jax import numpy as jnp
6 | from transformers import AutoTokenizer, FlaxRobertaForMaskedLM, RobertaForMaskedLM
7 |
8 |
9 | def to_f32(t):
10 | return jax.tree_map(
11 | lambda x: x.astype(jnp.float32) if x.dtype == jnp.bfloat16 else x, t
12 | )
13 |
14 |
15 | def main():
16 | # Saving extra files from config.json and tokenizer.json files
17 | tokenizer = AutoTokenizer.from_pretrained("./")
18 | tokenizer.save_pretrained("./")
19 |
20 | # Temporary saving bfloat16 Flax model into float32
21 | tmp = tempfile.mkdtemp()
22 | flax_model = FlaxRobertaForMaskedLM.from_pretrained("./")
23 | flax_model.params = to_f32(flax_model.params)
24 | flax_model.save_pretrained(tmp)
25 | # Converting float32 Flax to PyTorch
26 | model = RobertaForMaskedLM.from_pretrained(tmp, from_flax=True)
27 | model.save_pretrained("./", save_config=False)
28 |
29 |
30 | if __name__ == "__main__":
31 | main()
32 |
--------------------------------------------------------------------------------
/bertin/evaluation/paws.yaml:
--------------------------------------------------------------------------------
1 | name: BERTIN PAWS-X es
2 | project: bertin-eval
3 | entity: versae
4 | program: run_glue.py
5 | command:
6 | - ${env}
7 | - ${interpreter}
8 | - ${program}
9 | - ${args}
10 | method: grid
11 | metric:
12 | name: eval/accuracy
13 | goal: maximize
14 | parameters:
15 | model_name_or_path:
16 | values:
17 | - bertin-project/bertin-base-gaussian-exp-512seqlen
18 | - bertin-project/bertin-base-stepwise-exp-512seqlen
19 | - bertin-project/bertin-base-random-exp-512seqlen
20 | - bertin-project/bertin-base-gaussian
21 | - bertin-project/bertin-base-stepwise
22 | - bertin-project/bertin-base-random
23 | - bertin-project/bertin-roberta-base-spanish
24 | - flax-community/bertin-roberta-large-spanish
25 | - BSC-TeMU/roberta-base-bne
26 | - dccuchile/bert-base-spanish-wwm-cased
27 | - bert-base-multilingual-cased
28 | num_train_epochs:
29 | values: [5]
30 | task_name:
31 | value: paws-x
32 | dataset_name:
33 | value: paws-x
34 | dataset_config_name:
35 | value: es
36 | output_dir:
37 | value: ./outputs
38 | overwrite_output_dir:
39 | value: true
40 | max_seq_length:
41 | value: 512
42 | pad_to_max_length:
43 | value: true
44 | per_device_train_batch_size:
45 | value: 16
46 | per_device_eval_batch_size:
47 | value: 16
48 | save_total_limit:
49 | value: 1
50 | do_train:
51 | value: true
52 | do_eval:
53 | value: true
54 |
--------------------------------------------------------------------------------
/bertin/evaluation/token.yaml:
--------------------------------------------------------------------------------
1 | name: BERTIN NER and POS es
2 | project: bertin-eval
3 | entity: versae
4 | program: run_ner.py
5 | command:
6 | - ${env}
7 | - ${interpreter}
8 | - ${program}
9 | - ${args}
10 | method: grid
11 | metric:
12 | name: eval/accuracy
13 | goal: maximize
14 | parameters:
15 | model_name_or_path:
16 | values:
17 | - bertin-project/bertin-base-gaussian-exp-512seqlen
18 | - bertin-project/bertin-base-stepwise-exp-512seqlen
19 | - bertin-project/bertin-base-random-exp-512seqlen
20 | - bertin-project/bertin-base-gaussian
21 | - bertin-project/bertin-base-stepwise
22 | - bertin-project/bertin-base-random
23 | - bertin-project/bertin-roberta-base-spanish
24 | - flax-community/bertin-roberta-large-spanish
25 | - BSC-TeMU/roberta-base-bne
26 | - dccuchile/bert-base-spanish-wwm-cased
27 | - bert-base-multilingual-cased
28 | num_train_epochs:
29 | values: [5]
30 | task_name:
31 | values:
32 | - ner
33 | - pos
34 | dataset_name:
35 | value: conll2002
36 | dataset_config_name:
37 | value: es
38 | output_dir:
39 | value: ./outputs
40 | overwrite_output_dir:
41 | value: true
42 | pad_to_max_length:
43 | value: true
44 | per_device_train_batch_size:
45 | value: 16
46 | per_device_eval_batch_size:
47 | value: 16
48 | save_total_limit:
49 | value: 1
50 | do_train:
51 | value: true
52 | do_eval:
53 | value: true
54 |
--------------------------------------------------------------------------------
/bertin/evaluation/xnli.yaml:
--------------------------------------------------------------------------------
1 | name: BERTIN XNLI es
2 | project: bertin-eval
3 | entity: versae
4 | program: run_glue.py
5 | command:
6 | - ${env}
7 | - ${interpreter}
8 | - ${program}
9 | - ${args}
10 | method: grid
11 | metric:
12 | name: eval/accuracy
13 | goal: maximize
14 | parameters:
15 | model_name_or_path:
16 | values:
17 | - bertin-project/bertin-base-gaussian-exp-512seqlen
18 | - bertin-project/bertin-base-stepwise-exp-512seqlen
19 | - bertin-project/bertin-base-random-exp-512seqlen
20 | - bertin-project/bertin-base-gaussian
21 | - bertin-project/bertin-base-stepwise
22 | - bertin-project/bertin-base-random
23 | - bertin-project/bertin-roberta-base-spanish
24 | - flax-community/bertin-roberta-large-spanish
25 | - BSC-TeMU/roberta-base-bne
26 | - dccuchile/bert-base-spanish-wwm-cased
27 | - bert-base-multilingual-cased
28 | num_train_epochs:
29 | values: [5]
30 | task_name:
31 | value: xnli
32 | dataset_name:
33 | value: xnli
34 | dataset_config_name:
35 | value: es
36 | output_dir:
37 | value: ./outputs
38 | overwrite_output_dir:
39 | value: true
40 | max_seq_length:
41 | value: 512
42 | pad_to_max_length:
43 | value: true
44 | per_device_train_batch_size:
45 | value: 16
46 | per_device_eval_batch_size:
47 | value: 16
48 | save_total_limit:
49 | value: 1
50 | do_train:
51 | value: true
52 | do_eval:
53 | value: true
54 |
--------------------------------------------------------------------------------
/bertin/events.out.tfevents.1625704081.t1v-n-a4d97d44-w-0.212075.3.v2:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:6a6ce71bd4a3fdcb18c10bd9d140b27e746c14e9ee70a7a3faf4eedbccde1d6e
3 | size 40
4 |
--------------------------------------------------------------------------------
/bertin/events.out.tfevents.1625704245.t1v-n-a4d97d44-w-0.216676.3.v2:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:2a6926c79cb2c1941fcfe69d7b73797c15dab60e5e6f16cc6c61bd9b79a9063d
3 | size 40
4 |
--------------------------------------------------------------------------------
/bertin/events.out.tfevents.1625705283.t1v-n-a4d97d44-w-0.234462.3.v2:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:737d1e6666fe1c9fd6dd93728666199f1a8b0b213b071bdf7b3ecd77dd58f8c1
3 | size 40
4 |
--------------------------------------------------------------------------------
/bertin/get_embeddings_and_perplexity.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import kenlm
3 | import numpy as np
4 | import pandas as pd
5 | from datasets import load_dataset
6 | from sentence_transformers import SentenceTransformer
7 | from tqdm import tqdm
8 |
9 | TOTAL_SENTENCES = 20000
10 |
11 |
12 | def pp(log_score, length):
13 | return 10.0 ** (-log_score / length)
14 |
15 |
16 | embedder = "distiluse-base-multilingual-cased-v1"
17 | embedder_model = SentenceTransformer(embedder)
18 | embedding_shape = embedder_model.encode(["foo"])[0].shape[0]
19 | # http://dl.fbaipublicfiles.com/cc_net/lm/es.arpa.bin
20 | model = kenlm.Model("es.arpa.bin")
21 | mc4 = load_dataset("mc4", "es", streaming=True)
22 | count = 0
23 | embeddings = []
24 | lengths = []
25 | perplexities = []
26 | sentences = []
27 |
28 | for sample in tqdm(mc4["train"].shuffle(buffer_size=100_000), total=416057992):
29 | lines = sample["text"].split("\n")
30 | for line in lines:
31 | count += 1
32 | log_score = model.score(line)
33 | length = len(line.split()) + 1
34 | embedding = embedder_model.encode([line])[0]
35 | embeddings.append(embedding.tolist())
36 | perplexities.append(pp(log_score, length))
37 | lengths.append(length)
38 | sentences.append(line)
39 | if count == TOTAL_SENTENCES:
40 | break
41 | if count == TOTAL_SENTENCES:
42 | embeddings = np.array(embeddings)
43 | df = pd.DataFrame(
44 | {"sentence": sentences, "length": lenghts, "perplexity": perplexities}
45 | )
46 | for dim in range(embedding_shape):
47 | df[f"dim_{dim}"] = embeddings[:, dim]
48 | df.to_csv("mc4-es-perplexity-sentences.tsv", index=None, sep="\t")
49 | print("DONE!")
50 | break
51 |
--------------------------------------------------------------------------------
/bertin/images/bertin-tilt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/bertin/images/bertin-tilt.png
--------------------------------------------------------------------------------
/bertin/images/bertin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/bertin/images/bertin.png
--------------------------------------------------------------------------------
/bertin/images/ccnet.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/bertin/images/ccnet.png
--------------------------------------------------------------------------------
/bertin/images/datasets-perp-20-120.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/bertin/images/datasets-perp-20-120.png
--------------------------------------------------------------------------------
/bertin/images/datasets-perp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/bertin/images/datasets-perp.png
--------------------------------------------------------------------------------
/bertin/images/datasets-random-comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/bertin/images/datasets-random-comparison.png
--------------------------------------------------------------------------------
/bertin/images/datasets-wsize.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/bertin/images/datasets-wsize.png
--------------------------------------------------------------------------------
/bertin/images/perp-p95.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/bertin/images/perp-p95.png
--------------------------------------------------------------------------------
/bertin/images/perp-resample-gaussian.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/bertin/images/perp-resample-gaussian.png
--------------------------------------------------------------------------------
/bertin/images/perp-resample-stepwise.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/bertin/images/perp-resample-stepwise.png
--------------------------------------------------------------------------------
/bertin/images/random_512.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/bertin/images/random_512.jpg
--------------------------------------------------------------------------------
/bertin/mc4/dummy/af/0.0.0/dummy_data.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/bertin/mc4/dummy/af/0.0.0/dummy_data.zip
--------------------------------------------------------------------------------
/bertin/perplexity.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import kenlm
3 | from datasets import load_dataset
4 | from tqdm import tqdm
5 |
6 |
7 | def pp(log_score, length):
8 | return 10.0 ** (-log_score / length)
9 |
10 |
11 | # http://dl.fbaipublicfiles.com/cc_net/lm/es.arpa.bin
12 | model = kenlm.Model("es.arpa.bin")
13 | mc4 = load_dataset("mc4", "es", streaming=True)
14 | with open("mc4-es-perplexity.txt", "w") as f:
15 | for sample in tqdm(mc4["train"].shuffle(buffer_size=100_000), total=416057992):
16 | lines = sample["text"].split("\n")
17 | doc_log_score, doc_length = 0, 0
18 | for line in lines:
19 | log_score = model.score(line)
20 | length = len(line.split()) + 1
21 | doc_log_score += log_score
22 | doc_length += length
23 | f.write(f"{pp(doc_log_score, doc_length)}\n")
24 |
--------------------------------------------------------------------------------
/bertin/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # From https://arxiv.org/pdf/1907.11692.pdf
3 | python -c "import jax; print('TPUs', jax.device_count())"
4 | ./run_mlm_flax.py \
5 | --output_dir="./outputs" \
6 | --model_type="roberta" \
7 | --config_name="./configs/large" \
8 | --tokenizer_name="./" \
9 | --dataset_name="mc4" \
10 | --dataset_config_name="es" \
11 |     --dataset_streaming \
12 | --max_seq_length="128" \
13 | --pad_to_max_length \
14 | --per_device_train_batch_size="128" \
15 | --per_device_eval_batch_size="128" \
16 | --adam_beta1="0.9" \
17 | --adam_beta2="0.98" \
18 | --adam_epsilon="1e-6" \
19 | --learning_rate="4e-4" \
20 | --weight_decay="0.01" \
21 | --save_strategy="steps" \
22 | --save_steps="10000" \
23 | --save_total_limit="5" \
24 | --warmup_steps="30000" \
25 | --overwrite_output_dir \
26 | --num_train_steps="500000" \
27 | --eval_steps="10000" \
28 | --logging_steps="500" \
29 | --dtype="bfloat16" 2>&1 | tee run.log
30 |
--------------------------------------------------------------------------------
/bertin/run_stream.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | # From https://arxiv.org/pdf/1907.11692.pdf for base model
3 | python -c "import jax; print('TPUs', jax.device_count())"
4 | python ./run_mlm_flax_stream.py \
5 | --output_dir="./outputs" \
6 | --model_type="roberta" \
7 | --config_name="./configs/base" \
8 | --tokenizer_name="./configs/base" \
9 | --dataset_name="./mc4" \
10 | --dataset_config_name="es" \
11 | --train_file="path/to/mc4-es-train-50M-XXX.jsonl" \
12 | --max_seq_length="128" \
13 | --pad_to_max_length \
14 | --per_device_train_batch_size="256" \
15 | --per_device_eval_batch_size="256" \
16 | --adam_beta1="0.9" \
17 | --adam_beta2="0.98" \
18 | --adam_epsilon="1e-6" \
19 | --learning_rate="6e-4" \
20 | --weight_decay="0.01" \
21 | --save_steps="10000" \
22 | --save_total_limit="5" \
23 | --warmup_steps="24000" \
24 | --overwrite_output_dir \
25 | --num_train_steps="250000" \
26 | --eval_steps="10000" \
27 | --dtype="bfloat16" \
28 | --logging_steps="500" 2>&1 | tee run_stream.log
29 |
--------------------------------------------------------------------------------
/bertin/special_tokens_map.json:
--------------------------------------------------------------------------------
1 | {"bos_token": "", "eos_token": "", "unk_token": "", "sep_token": "", "pad_token": "", "cls_token": "", "mask_token": {"content": "", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}}
2 |
--------------------------------------------------------------------------------
/bertin/tokenizer_config.json:
--------------------------------------------------------------------------------
1 | {"unk_token": "", "bos_token": "", "eos_token": "", "add_prefix_space": false, "errors": "replace", "sep_token": "", "cls_token": "", "pad_token": "", "mask_token": "", "special_tokens_map_file": null, "name_or_path": "./", "tokenizer_class": "RobertaTokenizer"}
2 |
--------------------------------------------------------------------------------
/bertin/tokens.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from datasets import load_dataset
3 | from tokenizers import ByteLevelBPETokenizer
4 |
5 | # Load dataset
6 | dataset = load_dataset("oscar", "unshuffled_deduplicated_es", split="train[:5000000]")
7 |
8 | # Instantiate tokenizer
9 | tokenizer = ByteLevelBPETokenizer()
10 |
11 |
12 | def batch_iterator(batch_size=100_000):
13 | for i in range(0, len(dataset), batch_size):
14 | yield dataset["text"][i : i + batch_size]
15 |
16 |
17 | # Customized training
18 | tokenizer.train_from_iterator(
19 | batch_iterator(),
20 | vocab_size=50265,
21 | min_frequency=2,
22 | special_tokens=[
23 | "",
24 | "",
25 | "",
26 | "",
27 | "",
28 | ],
29 | )
30 | # Save files to disk
31 | tokenizer.save("./tokenizer.json")
32 |
--------------------------------------------------------------------------------
/bertin/tokens.py.orig:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from datasets import load_dataset
3 | from tokenizers import ByteLevelBPETokenizer
4 |
5 | # Load dataset
6 | <<<<<<< HEAD
7 | dataset = load_dataset("oscar", "unshuffled_deduplicated_es", split="train[:5000000]")
8 |
9 | # Instantiate tokenizer
10 | tokenizer = ByteLevelBPETokenizer()
11 | def batch_iterator(batch_size=100_000):
12 | =======
13 | dataset = load_dataset("oscar", "unshuffled_deduplicated_es", split="train")
14 |
15 | # Instantiate tokenizer
16 | tokenizer = ByteLevelBPETokenizer()
17 | def batch_iterator(batch_size=1_000_000):
18 | >>>>>>> d5cede47e74aa6ec36f20acf5aba37c6734c6186
19 | for i in range(0, len(dataset), batch_size):
20 | yield dataset["text"][i: i + batch_size]
21 |
22 | # Customized training
23 | tokenizer.train_from_iterator(batch_iterator(), vocab_size=50265, min_frequency=2, special_tokens=[
24 | "",
25 | "",
26 | "",
27 | "",
28 | "",
29 | ])
30 | # Save files to disk
31 | tokenizer.save("./tokenizer.json")
32 |
--------------------------------------------------------------------------------
/bertin/utils/dataset_perplexity.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import kenlm
4 | from tqdm import tqdm
5 |
6 | model = kenlm.Model("../es.arpa.bin")
7 |
8 |
9 | def get_perplexity(doc):
10 | doc_log_score, doc_length = 0, 0
11 | for line in doc.split("\n"):
12 | log_score = model.score(line)
13 | length = len(line.split()) + 1
14 | doc_log_score += log_score
15 | doc_length += length
16 | return 10.0 ** (-doc_log_score / doc_length)
17 |
18 |
19 | with open("mc4-es-train-50M-stats.csv", "w") as csv:
20 | with open("mc4-es-train-50M-steps.jsonl", "r") as data:
21 | for line in tqdm(data):
22 | text = json.loads(line)["text"]
23 | csv.write(f"{len(text.split())},{get_perplexity(text)}\n")
24 |
--------------------------------------------------------------------------------
/bertin/utils/download_mc4es_sampled.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import io
3 | import json
4 | import sys
5 |
6 | import requests
7 | from tqdm import tqdm
8 |
9 | _DATA_URL_TRAIN = "https://huggingface.co/datasets/bertin-project/mc4-es-sampled/resolve/main/mc4-es-train-50M-{config}-shard-{index:04d}-of-{n_shards:04d}.json.gz"
10 |
11 |
12 | def main(config="stepwise"):
13 | data_urls = [
14 | _DATA_URL_TRAIN.format(
15 | config=config,
16 | index=index + 1,
17 | n_shards=1024,
18 | )
19 | for index in range(1024)
20 | ]
21 | with open(f"mc4-es-train-50M-{config}.jsonl", "w") as f:
22 |         for data_url in tqdm(data_urls):
23 |             response = requests.get(data_url)
24 | bio = io.BytesIO(response.content)
25 | with gzip.open(bio, "rt", encoding="utf8") as g:
26 | for line in g:
27 | json_line = json.loads(line.strip())
28 | f.write(json.dumps(json_line) + "\n")
29 |
30 |
31 | if __name__ == "__main__":
32 | main(sys.argv[1])
33 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/get_stats.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from argparse import ArgumentParser
3 | from pathlib import Path
4 |
5 | from datasets import concatenate_datasets, load_dataset, load_from_disk
6 | from datasets.utils.logging import set_verbosity_info
7 |
8 | set_verbosity_info()
9 | logger = logging.getLogger(__name__)
10 |
11 |
12 | def get_args():
13 | parser = ArgumentParser()
14 | parser.add_argument("--dataset-path", type=str, required=True, help="Dataset path.")
15 | args = parser.parse_args()
16 |
17 | args.dataset_path = Path(args.dataset_path)
18 | return args
19 |
20 |
21 | def load_others(dataset_path: Path):
22 | others_path = dataset_path / "others"
23 | shards = [
24 | load_from_disk(str(shard_path.absolute()))
25 | for shard_path in sorted(others_path.iterdir())
26 | ]
27 | return concatenate_datasets(shards)
28 |
29 |
30 | def main():
31 | # Setup logging
32 | logging.basicConfig(
33 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
34 | datefmt="%m/%d/%Y %H:%M:%S",
35 | level=logging.INFO,
36 | )
37 | args = get_args()
38 | logger.info(
39 | f"** The job is runned with the following arguments: **\n{args}\n **** "
40 | )
41 |
42 | others = load_others(args.dataset_path)
43 | features = others.features.copy()
44 | features.pop("compressed_warc")
45 | text_htmls = load_dataset(
46 | str((args.dataset_path / "text__html").absolute()),
47 | data_files="**.jsonl.gz",
48 | features=features,
49 | split="train",
50 | )
51 |
52 | logger.info(f"Text/html: {len(text_htmls)}")
53 | logger.info(f"Others: {len(others)}")
54 |
55 |
56 | if __name__ == "__main__":
57 | main()
58 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/language_annotation/python_scripts/check_wrong_files.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | bug_features = 0
4 | bug_pyarrow = 0
5 | bug_segmentation = 0
6 | bug_oom = 0
7 | bug_other = 0
8 |
9 | directory = "/Users/hugolaurencon/Desktop/HF/Code/clean_crawl/annotate_langid_crawl"
10 | for filename in os.listdir(directory):
11 | f = os.path.join(directory, filename)
12 | if os.path.isfile(f):
13 | with open(f, encoding="utf8", errors="ignore") as file:
14 | # file = open(f, 'rb')
15 | txt = file.read()
16 | # file.close()
17 | if (
18 | "FileNotFoundError: Unable to resolve any data file that matches" in txt
19 | ) or ("Shard successfully saved" in txt):
20 | os.remove(f)
21 | elif (
22 | "ValueError: Please pass `features` or at least one example when writing data"
23 | in txt
24 | ):
25 | bug_features += 1
26 | elif "Segmentation fault (core dumped) python" in txt:
27 | bug_segmentation += 1
28 | elif "slurmstepd: error: Detected 1 oom-kill event(s)" in txt:
29 | bug_oom += 1
30 | elif "pyarrow.lib.ArrowNotImplementedError:" in txt:
31 | bug_pyarrow += 1
32 | else:
33 | bug_other += 1
34 | print(f)
35 |
36 | print("bug_features:", bug_features)
37 | print("bug_pyarrow:", bug_pyarrow)
38 | print("bug_segmentation :", bug_segmentation)
39 | print("bug_oom:", bug_oom)
40 | print("bug_other:", bug_other)
41 | print("Tot bug:", bug_features + bug_pyarrow + bug_segmentation + bug_oom + bug_other)
42 | print("Num files:", len(os.listdir(directory)))
43 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/language_annotation/python_scripts/compute_stats_langid.py:
--------------------------------------------------------------------------------
1 | from datasets import load_from_disk
2 |
3 | from multiprocessing import cpu_count
4 |
5 | import json
6 |
7 | # import random # TO UNCOMMENT
8 |
9 | dataset = load_from_disk(
10 | "/gpfsscratch/rech/six/urd43gx/crawl/annotated_langid_crawl"
11 | ) # "/Users/hugolaurencon/Desktop/HF/Code/dataset_filtered/af/"
12 | dataset = dataset["train"] # A COMMENTER
13 | print("Dataset loaded")
14 |
15 | dataset = dataset.map(
16 | lambda example: {
17 | "pred_lang": example["fasttext_pred"]["lang_pred_fasttext_id"],
18 | "len_text": len(example["text"]),
19 | }, # random.choice(["A", "B", "C"])
20 | remove_columns=dataset.column_names,
21 | num_proc=cpu_count(),
22 | )
23 |
24 | stats_langid = {}
25 | for i in range(dataset.num_rows):
26 | pred_lang = dataset[i]["pred_lang"]
27 | len_text = dataset[i]["len_text"]
28 | stats_langid[pred_lang] = stats_langid.get(pred_lang, 0) + len_text
29 |
30 | f = open(
31 | "/gpfswork/rech/six/urd43gx/code/filtering_crawl/compute_stats_langid/stats_langid.json",
32 | "w",
33 | ) # "myfile.json"
34 | json.dump(stats_langid, f)
35 | f.close()
36 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/language_annotation/slurm_scripts/02_detect_html_lang_attrib.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=pseudo_crawl_extract_lang_tag
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
5 | #SBATCH --cpus-per-task=40 # number of cores per tasks
6 | #SBATCH --hint=nomultithread # we get physical cores not logical
7 | #SBATCH --partition=cpu_p1
8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)
9 | #SBATCH --output=/gpfsscratch/rech/six/commun/pseudo_crawl/seeds_batch_2/logs/extract_lang_tag_V5/%x-%j.out # output file name #TODO change path if necessary
10 | #SBATCH --array=35,341,297
11 | #SBATCH --account=six@cpu
12 |
13 | set -x -e
14 |
15 | source $six_ALL_CCFRWORK/start-prod
16 | conda activate thomas_data_tooling
17 |
18 | DATA_TOOLING_REPO=$WORK/repos/sync_data_tooling/data_tooling
19 |
20 | DATASET_PATH=/gpfsscratch/rech/six/urd43gx/crawl/shards/shard_"$SLURM_ARRAY_TASK_ID"
21 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/fasttext_annotation/seeds_batch_1/datasets-lang-annotation/bigscience-catalogue-data
22 | SAVE_DATASET_PATH=$SAVE_DATASET_DIR/seed_id="$SLURM_ARRAY_TASK_ID"
23 | echo $DATASET_PATH
24 | pushd $DATA_TOOLING_REPO
25 |
26 | mkdir -p $SAVE_DATASET_DIR
27 |
28 | export HF_DATASETS_OFFLINE=1
29 | export HF_DATASETS_CACHE=$SCRATCH/to_delete
30 |
31 | python -m cc_pseudo_crawl.language_annotation.python_scripts.detect_html_lang_attrib \
32 | --dataset-path $DATASET_PATH \
33 | --num-proc 40 \
34 | --save-path $SAVE_DATASET_PATH \
35 | --use-datasets-caching
36 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/language_annotation/slurm_scripts/job_annotate_langid_crawl.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --account=six@cpu
3 | #SBATCH --job-name=annotate_langid_crawl
4 | #SBATCH --partition=cpu_p1
5 | #SBATCH --cpus-per-task=1
6 | #SBATCH --output=res%A_%a
7 | #SBATCH --time=20:00:00
8 |
9 | echo "Running job on $hostname"
10 |
11 | # load conda environment
12 | source $six_ALL_CCFRWORK/start-prod
13 | conda activate hugo
14 |
15 | python /gpfswork/rech/six/urd43gx/code/filtering_crawl/annotate_langid_crawl/annotate_langid_crawl.py ${SLURM_ARRAY_TASK_ID}
16 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/python_scripts/check_erros_in_dataset.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | from argparse import ArgumentParser
4 |
5 | from datasets import load_from_disk
6 | from datasets.utils.logging import set_verbosity_info
7 |
8 | set_verbosity_info()
9 | logger = logging.getLogger(__name__)
10 |
11 |
12 | def get_args():
13 | parser = ArgumentParser()
14 | parser.add_argument("--dataset-dir", type=str, required=True, help="Dataset name.")
15 |
16 | args = parser.parse_args()
17 | return args
18 |
19 |
20 | def main():
21 | # Setup logging
22 | logging.basicConfig(
23 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
24 | datefmt="%m/%d/%Y %H:%M:%S",
25 | level=logging.INFO,
26 | )
27 | args = get_args()
28 | logger.info(
29 | f"** The job is runned with the following arguments: **\n{args}\n **** "
30 | )
31 |
32 | for dataset_name in os.listdir(args.dataset_dir):
33 | dataset_path = os.path.join(args.dataset_dir, dataset_name)
34 | try:
35 | logging.info(f"Processing: {dataset_path}")
36 | ds = load_from_disk(dataset_path)
37 | new_ds = ds.filter(keep_failed_examples)
38 | logging.info(f"Here's the subset of failed downloads: {new_ds}")
39 | except Exception as e:
40 | logging.warning(f"Failed to process {dataset_path} with error '{str(e)}'")
41 |
42 |
43 | def keep_failed_examples(example):
44 | if example["download_exception"] is None:
45 | return False
46 | return True
47 |
48 |
49 | if __name__ == "__main__":
50 | main()
51 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/python_scripts/divide_in_shards.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import subprocess
4 | from argparse import ArgumentParser
5 | from pathlib import Path
6 | import sys
7 |
8 | from datasets import load_from_disk
9 |
10 | logger = logging.getLogger(__name__)
11 |
12 | # For `soup.decode_content` that can hit the limit
13 | sys.setrecursionlimit(10000)
14 |
15 |
16 | def get_args():
17 | parser = ArgumentParser()
18 | parser.add_argument(
19 | "--dataset-path",
20 | type=str,
21 | required=True,
22 | help="path to the parquet dataset folder",
23 | )
24 | parser.add_argument("--save-dir", type=str, help="Where to save the datasets.")
25 | parser.add_argument("--num-shards", type=int, help="Total number of shards.")
26 | args = parser.parse_args()
27 |
28 | return args
29 |
30 |
31 | def main():
32 | # Setup logging
33 | logging.basicConfig(
34 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
35 | datefmt="%m/%d/%Y %H:%M:%S",
36 | level=logging.INFO,
37 | )
38 | args = get_args()
39 | logger.info(
40 | f"** The job is runned with the following arguments: **\n{args}\n **** "
41 | )
42 |
43 | ds = load_from_disk(args.dataset_path)
44 |
45 | dataset_path = Path(args.dataset_path)
46 |
47 | for shard_id in range(args.num_shards):
48 | file_name_init = dataset_path.name
49 | dataset_name, shard_id_init, num_shards_init = file_name_init.split("--")
50 |
51 | shard_id_new = int(shard_id_init) * args.num_shards + shard_id
52 | total_num_shard = int(num_shards_init) * args.num_shards
53 | shard_name = f"{dataset_name}--{shard_id_new}--{total_num_shard}"
54 | save_path = Path(os.path.join(args.save_dir, shard_name))
55 | sub_ds = ds.shard(num_shards=args.num_shards, index=shard_id)
56 |
57 | save_path_tmp = f"{str(save_path.absolute())}.tmp"
58 | logger.info(f"Saving the dataset at {save_path_tmp}")
59 | sub_ds.save_to_disk(save_path_tmp)
60 | logger.info(f"Moving the saved dataset to {str(save_path.absolute())}")
61 | subprocess.run(["mv", save_path_tmp, str(save_path.absolute())])
62 |
63 |
64 | if __name__ == "__main__":
65 | main()
66 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/python_scripts/extract_text/requirements.txt:
--------------------------------------------------------------------------------
1 | git+ssh://git@github.com/bigscience-workshop/metadata.git
2 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/python_scripts/load_all_seed_ids.py:
--------------------------------------------------------------------------------
1 | import csv
2 | from argparse import ArgumentParser
3 |
4 |
5 | def get_args():
6 | parser = ArgumentParser()
7 | parser.add_argument(
8 | "--seed-paths",
9 | type=lambda x: x.split(","),
10 | required=True,
11 | help="Seed full path. e.g. 'xxx/seeds.csv'",
12 | )
13 | parser.add_argument("--seed-index", type=int, required=True, help="Seed index.")
14 | args = parser.parse_args()
15 |
16 | return args
17 |
18 |
19 | def main():
20 | args = get_args()
21 |
22 | seed_ids = []
23 | for seed_path in args.seed_paths:
24 | with open(seed_path, "r") as fi:
25 | data = csv.reader(fi)
26 |             # The first line is the header row, which we skip.
27 | seed_ids += [row[0] for row_id, row in enumerate(data) if row_id > 0]
28 | print(seed_ids[args.seed_index])
29 |
30 |
31 | if __name__ == "__main__":
32 | main()
33 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/python_scripts/redownload_warc.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from argparse import ArgumentParser
3 | from pathlib import Path
4 |
5 | import datasets
6 | from datasets import config, load_from_disk
7 | from datasets.utils.logging import set_verbosity_info
8 |
9 | from .download_warc import download_warcs
10 |
11 | set_verbosity_info()
12 | logger = logging.getLogger(__name__)
13 |
14 |
15 | def get_args():
16 | parser = ArgumentParser()
17 | parser.add_argument("--dataset-path", type=str, required=True, help="Dataset name.")
18 | parser.add_argument("--num-proc", type=int, required=True, help="Dataset name.")
19 | parser.add_argument("--save-path", type=str, help="Where to save the datasets.")
20 | parser.add_argument("--use-datasets-caching", action="store_true")
21 |
22 | args = parser.parse_args()
23 | return args
24 |
25 |
26 | def main():
27 | # Setup logging
28 | logging.basicConfig(
29 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
30 | datefmt="%m/%d/%Y %H:%M:%S",
31 | level=logging.INFO,
32 | )
33 | args = get_args()
34 | logger.info(
35 | f"** The job is runned with the following arguments: **\n{args}\n **** "
36 | )
37 |
38 | if not args.use_datasets_caching:
39 | datasets.set_caching_enabled(False)
40 | else:
41 | logger.info(
42 | f"the datasets results will be cached at {config.HF_DATASETS_CACHE}."
43 | )
44 |
45 | ds = load_from_disk(args.dataset_path)
46 |
47 | if args.save_path:
48 | save_path = Path(args.save_path)
49 | else:
50 | save_path = Path(args.dataset_path)
51 |
52 | download_warcs(ds, save_path, num_proc=args.num_proc)
53 |
54 |
55 | if __name__ == "__main__":
56 | main()
57 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/python_scripts/requirements.txt:
--------------------------------------------------------------------------------
1 | boto3
2 | bs4
3 | datasets
4 | pyathena
5 | surt
6 | tldextract
7 | warcio
8 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_1/.gitignore:
--------------------------------------------------------------------------------
1 | sourcing_sheet_seeds/seeds.gz.parquet
2 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_1/DEPTH.md:
--------------------------------------------------------------------------------
1 | ## Strategy to get depth 1
2 |
3 | ### Context
4 |
5 | Once we've extracted all the seed pages, we plan to make a pseudo crawl. The idea is simple:
6 | - we extract the outgoing urls from those pages.
7 | - we find the most recent record in CC matching that url (if it exists).
8 | - we do the entire processing for all the new records/pages
9 | - we update `outgoing_urls` to obtain `outgoing_ids`
10 |
11 | ### Process
12 |
13 | - 1) Make the Athena query (a sketch of the record lookup is given below)
14 | - 2) Preprocess the dataset to: `load_warc`, obtain `pdf_urls`, extract `external_urls`
15 | - 3) Build a new query with all `external_urls`
16 | - 4) Repeat 1-3 until reaching the depth we want.
17 | - 5) Run `finalise.py` to: generate ids, and generate `external_ids` that map to rows inside the dataset.
18 |
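19 | ### Sketch: looking up the most recent CC record for a URL
20 | 
21 | A minimal sketch of the lookup behind step 1, assuming the Common Crawl columnar index is exposed in Athena as `ccindex.ccindex` and using placeholder connection settings (the staging bucket and region below are hypothetical); in practice the query is also restricted to a handful of recent crawls to keep scan costs down:
22 | 
23 | ```python
24 | from pyathena import connect
25 | 
26 | # Placeholder connection settings -- adapt to the actual Athena setup.
27 | cursor = connect(
28 |     s3_staging_dir="s3://my-athena-staging/",  # hypothetical bucket
29 |     region_name="us-east-1",
30 | ).cursor()
31 | 
32 | 
33 | def most_recent_record(url):
34 |     """Return (warc_filename, offset, length) of the newest capture of `url`, or None."""
35 |     cursor.execute(
36 |         """
37 |         SELECT warc_filename, warc_record_offset, warc_record_length
38 |         FROM ccindex.ccindex
39 |         WHERE subset = 'warc' AND url = %(url)s
40 |         ORDER BY fetch_time DESC
41 |         LIMIT 1
42 |         """,
43 |         {"url": url},
44 |     )
45 |     return cursor.fetchone()
46 | ```
47 | 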
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_1/slurm_scripts/check_errors_in_dataset.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=pseudo_crawl_check_erros_in_dataset
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
5 | #SBATCH --cpus-per-task=4 # number of cores per tasks
6 | #SBATCH --hint=nomultithread # we get physical cores not logical
7 | #SBATCH --partition=prepost
8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)
9 | #SBATCH --output=/gpfswork/rech/six/uty16tp/code/big_science/logs/re_dowload/%x-%j.out # output file name
10 | #SBATCH --account=six@cpu
11 |
12 | set -x -e
13 |
14 | source $six_ALL_CCFRWORK/start-prod
15 | conda activate thomas_data_tooling # Debug deepspeed temporarily
16 |
17 | DATA_TOOLING_REPO=$WORK/repos/sync_data_tooling/data_tooling
18 | DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets-redownload/bigscience-catalogue-data
19 | echo $DATASET_DIR
20 | pushd $DATA_TOOLING_REPO
21 |
22 |
23 | export HF_DATASETS_OFFLINE=1
24 | export HF_DATASETS_CACHE=$SCRATCH/to_delete
25 |
26 | python -m cc_pseudo_crawl.python_scripts.check_erros_in_dataset \
27 | --dataset-dir $DATASET_DIR
28 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_1/slurm_scripts/divide_in_subshards.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=pseudo_crawl_divide_in_subshards
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
5 | #SBATCH --hint=nomultithread # we get physical cores not logical
6 | #SBATCH --partition=cpu_p1
7 | #SBATCH --cpus-per-task=4 # number of cores per tasks
8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)
9 | #SBATCH --output=/gpfswork/rech/six/uty16tp/code/big_science/logs/preprocess/%x-%j.out # output file name
10 | #SBATCH --array=0-9 # TODO: modify according to the number of models you want to evaluate
11 | #SBATCH --account=six@cpu
12 |
13 | set -x -e
14 |
15 | source $six_ALL_CCFRWORK/start-prod
16 | conda activate thomas_data_tooling # Debug deepspeed temporarily
17 |
18 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/cc
19 | DATA_TOOLING_REPO=$WORK/repos/sync_data_tooling/data_tooling
20 | DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets-redownload/bigscience-catalogue-data/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--"$SLURM_ARRAY_TASK_COUNT"
21 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets-divide-in-subshards/bigscience-catalogue-data
22 | echo $DATASET_PATH
23 | pushd $DATA_TOOLING_REPO
24 |
25 | mkdir -p $SAVE_DATASET_DIR
26 |
27 | export HF_DATASETS_OFFLINE=1
28 | export HF_DATASETS_CACHE=$SCRATCH/to_delete
29 |
30 | python -m cc_pseudo_crawl.python_scripts.divide_in_shards \
31 | --dataset-path $DATASET_PATH \
32 | --save-dir $SAVE_DATASET_DIR \
33 | --num-shards 10
34 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_1/slurm_scripts/divide_in_subshards_1000.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=pseudo_crawl_divide_in_subshards
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
5 | #SBATCH --hint=nomultithread # we get physical cores not logical
6 | #SBATCH --partition=compil
7 | #SBATCH --cpus-per-task=4 # number of cores per tasks
8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)
9 | #SBATCH --output=logs/%x-%j.out # output file name
10 | #SBATCH --array=0-99 # TODO: modify according to the number of models you want to evaluate
11 | #SBATCH --account=six@cpu
12 |
13 | set -x -e
14 |
15 | source $six_ALL_CCFRWORK/start-prod
16 | conda activate thomas_data_tooling # Debug deepspeed temporarily
17 |
18 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/cc
19 | DATA_TOOLING_REPO=$WORK/code/big_science/data_tooling
20 | DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets-preprocessed-text-extracted/bigscience-catalogue-data/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--100
21 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets-preprocessed-text-extracted/bigscience-catalogue-data
22 | echo $DATASET_PATH
23 | pushd $DATA_TOOLING_REPO
24 |
25 | mkdir -p $SAVE_DATASET_DIR
26 |
27 | export HF_DATASETS_OFFLINE=1
28 | export HF_DATASETS_CACHE=$SCRATCH/to_delete
29 |
30 | python -m cc_pseudo_crawl.python_scripts.divide_in_shards \
31 | --dataset-path $DATASET_PATH \
32 | --save-dir $SAVE_DATASET_DIR \
33 | --num-shards 10
34 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_1/slurm_scripts/download_warc.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=pseudo_crawl_download
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
5 | #SBATCH --cpus-per-task=4 # number of cores per tasks
6 | #SBATCH --hint=nomultithread # we get physical cores not logical
7 | #SBATCH --partition=prepost
8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)
9 | #SBATCH --output=logs/%x-%j.out # output file name
10 | #SBATCH --array=0-9 # TODO: modify according to the number of models you want to evaluate
11 | #SBATCH --account=six@cpu
12 |
13 | set -x -e
14 |
15 | source $six_ALL_CCFRWORK/start-prod
16 | conda activate thomas_data_tooling # Debug deepspeed temporarily
17 |
18 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/cc
19 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets
20 | DATA_TOOLING_REPO=$WORK/code/big_science/data_tooling
21 | pushd $DATA_TOOLING_REPO
22 |
23 | # TODO run this offline
24 | # aws s3 sync s3://commoncrawl-dev/big-science-workshop/data-sourcing-sheet/cc-{FLAVOR}/ $CC_INDEX_FOLDER/cc-{FLAVOR}/
25 |
26 | python cc_pseudo_crawl/python_scripts/download_warc.py \
27 | --dataset bigscience-catalogue-data/pseudo_crawl_seed \
28 | --cc-index-folder $CC_INDEX_FOLDER \
29 | --save-dir $SAVE_DATASET_DIR \
30 | --num-proc 4 \
31 | --shard-id $SLURM_ARRAY_TASK_ID \
32 | --num-shards $SLURM_ARRAY_TASK_COUNT
33 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_1/slurm_scripts/download_warc_too_big.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=pseudo_crawl_download_failed_shards
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
5 | #SBATCH --cpus-per-task=4 # number of cores per tasks
6 | #SBATCH --hint=nomultithread # we get physical cores not logical
7 | #SBATCH --partition=compil
8 | #SBATCH --time 14:00:00 # maximum execution time (HH:MM:SS)
9 | #SBATCH --output=/gpfswork/rech/six/uty16tp/code/big_science/logs/download_failed_shards/%x-%j.out # output file name
10 | #SBATCH --array=3,9 # TODO: modify according to the number of models you want to evaluate
11 | #SBATCH --account=six@cpu
12 |
13 | set -x -e
14 |
15 | source $six_ALL_CCFRWORK/start-prod
16 | conda activate thomas_data_tooling # Debug deepspeed temporarily
17 |
18 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/cc
19 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets
20 | DATA_TOOLING_REPO=$WORK/repos/sync_data_tooling/data_tooling
21 | pushd $DATA_TOOLING_REPO
22 |
23 | # TODO run this offline
24 | # aws s3 sync s3://commoncrawl-dev/big-science-workshop/data-sourcing-sheet/cc-{FLAVOR}/ $CC_INDEX_FOLDER/cc-{FLAVOR}/
25 |
26 | export HF_DATASETS_OFFLINE=1
27 | export HF_DATASETS_CACHE=$SCRATCH/to_delete
28 |
29 | python cc_pseudo_crawl/python_scripts/download_warc.py \
30 | --dataset bigscience-catalogue-data/pseudo_crawl_seed \
31 | --cc-index-folder $CC_INDEX_FOLDER \
32 | --save-dir $SAVE_DATASET_DIR \
33 | --num-proc 4 \
34 | --shard-id $SLURM_ARRAY_TASK_ID \
35 | --num-shards 10 \
36 | --use-datasets-caching
37 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_1/slurm_scripts/download_warc_trial_4.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=pseudo_crawl_download_trial_4
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
5 | #SBATCH --cpus-per-task=4 # number of cores per tasks
6 | #SBATCH --hint=nomultithread # we get physical cores not logical
7 | #SBATCH --partition=prepost
8 | #SBATCH --time 15:00:00 # maximum execution time (HH:MM:SS)
9 | #SBATCH --output=/gpfswork/rech/six/uty16tp/code/big_science/logs/download_trial_4/%x-%j.out # output file name
10 | #SBATCH --array=0-9 # TODO: modify according to the number of models you want to evaluate
11 | #SBATCH --account=six@cpu
12 |
13 | set -x -e
14 |
15 | source $six_ALL_CCFRWORK/start-prod
16 | conda activate thomas_data_tooling # Debug deepspeed temporarily
17 |
18 | export DATASETS_VERBOSITY=info
19 |
20 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/cc
21 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets
22 | DATA_TOOLING_REPO=$WORK/repos/sync_data_tooling/data_tooling
23 | pushd $DATA_TOOLING_REPO
24 |
25 | # TODO run this offline
26 | # aws s3 sync s3://commoncrawl-dev/big-science-workshop/data-sourcing-sheet/cc-{FLAVOR}/ $CC_INDEX_FOLDER/cc-{FLAVOR}/
27 |
28 | python cc_pseudo_crawl/python_scripts/download_warc.py \
29 | --dataset bigscience-catalogue-data/pseudo_crawl_seed \
30 | --cc-index-folder $CC_INDEX_FOLDER \
31 | --save-dir $SAVE_DATASET_DIR \
32 | --num-proc 4 \
33 | --shard-id $SLURM_ARRAY_TASK_ID \
34 | --num-shards $SLURM_ARRAY_TASK_COUNT
35 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_1/slurm_scripts/download_warc_trial_5.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=pseudo_crawl_download_trial_7
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
5 | #SBATCH --cpus-per-task=4 # number of cores per tasks
6 | #SBATCH --hint=nomultithread # we get physical cores not logical
7 | #SBATCH --partition=compil
8 | #SBATCH --time 15:00:00 # maximum execution time (HH:MM:SS)
9 | #SBATCH --output=/gpfswork/rech/six/uty16tp/code/big_science/logs/download_trial_5/%x-%j.out # output file name
10 | #SBATCH --array=0-9 # TODO: modify according to the number of models you want to evaluate
11 | #SBATCH --account=six@cpu
12 |
13 | set -x -e
14 |
15 | source $six_ALL_CCFRWORK/start-prod
16 | conda activate thomas_data_tooling # Debug deepspeed temporarily
17 |
18 | export DATASETS_VERBOSITY=info
19 |
20 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/cc
21 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets
22 | DATA_TOOLING_REPO=$WORK/repos/sync_data_tooling/data_tooling
23 | pushd $DATA_TOOLING_REPO
24 |
25 | # TODO run this offline
26 | # aws s3 sync s3://commoncrawl-dev/big-science-workshop/data-sourcing-sheet/cc-{FLAVOR}/ $CC_INDEX_FOLDER/cc-{FLAVOR}/
27 |
28 | python cc_pseudo_crawl/python_scripts/download_warc.py \
29 | --dataset bigscience-catalogue-data/pseudo_crawl_seed \
30 | --cc-index-folder $CC_INDEX_FOLDER \
31 | --save-dir $SAVE_DATASET_DIR \
32 | --num-proc 4 \
33 | --shard-id $SLURM_ARRAY_TASK_ID \
34 | --num-shards $SLURM_ARRAY_TASK_COUNT
35 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_1/slurm_scripts/merge_seed_shards.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=pseudo_crawl_merge_seed_shards
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
5 | #SBATCH --cpus-per-task=4 # number of cores per tasks
6 | #SBATCH --hint=nomultithread # we get physical cores not logical
7 | #SBATCH --partition=cpu_p1
8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)
9 | #SBATCH --output=logs/merge_seed_shards/%x-%j.out # output file name
10 | #SBATCH --array=0-604 # TODO: modify according to the number of models you want to evaluate
11 | #SBATCH --mail-type=ALL
12 | #SBATCH --mail-user=thomas.wang@huggingface.co
13 | #SBATCH --account=six@cpu
14 |
15 |
16 | set -x -e
17 |
18 | source $six_ALL_CCFRWORK/start-prod
19 | conda activate thomas_data_tooling # Debug deepspeed temporarily
20 |
21 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/cc
22 | DATA_TOOLING_REPO=$WORK/code/big_science/data_tooling
23 | DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets-shard-by-seed-id/bigscience-catalogue-data
24 | pushd $DATA_TOOLING_REPO
25 |
26 | SEED_ID=$(python cc_pseudo_crawl/seeds_batch_1/load_all_seed_ids.py --seed-path "$DATA_TOOLING_REPO"/cc_pseudo_crawl/seeds_batch_1/sourcing_sheet_seeds/ --seed-index $SLURM_ARRAY_TASK_ID)
27 | echo "Merging all shards of seed id ${SEED_ID}"
28 | SAVE_DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets-seeds/bigscience-catalogue-data/pseudo_crawl_seed--seed-id--"$SEED_ID"
29 |
30 | export HF_DATASETS_OFFLINE=1
31 | export HF_DATASETS_CACHE=$SCRATCH/to_delete
32 |
33 | python cc_pseudo_crawl/python_scripts/merge_seed_shards.py \
34 | --dataset-dir $DATASET_DIR \
35 | --seed-id $SEED_ID \
36 | --save-path $SAVE_DATASET_PATH
37 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_1/slurm_scripts/preprocess_warc.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=pseudo_crawl_preprocess_v4
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
5 | #SBATCH --cpus-per-task=40 # number of cores per tasks
6 | #SBATCH --hint=nomultithread # we get physical cores not logical
7 | #SBATCH --partition=cpu_p1
8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)
9 | #SBATCH --output=/gpfswork/rech/six/uty16tp/code/big_science/logs/preprocess-on-subshards/%x-%j.out # output file name
10 | #SBATCH --array=0-99 # TODO: modify according to the number of models you want to evaluate
11 | #SBATCH --account=six@cpu
12 |
13 | set -x -e
14 |
15 | source $six_ALL_CCFRWORK/start-prod
16 | conda activate thomas_data_tooling # Debug deepspeed temporarily
17 |
18 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/cc
19 | DATA_TOOLING_REPO=$WORK/repos/sync_data_tooling/data_tooling
20 | DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets-divide-in-subshards/bigscience-catalogue-data/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--"$SLURM_ARRAY_TASK_COUNT"
21 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets-preprocessed/bigscience-catalogue-data
22 | SAVE_DATASET_PATH=$SAVE_DATASET_DIR/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--"$SLURM_ARRAY_TASK_COUNT"
23 | echo $DATASET_PATH
24 | pushd $DATA_TOOLING_REPO
25 |
26 | mkdir -p $SAVE_DATASET_DIR
27 |
28 | export HF_DATASETS_OFFLINE=1
29 | export HF_DATASETS_CACHE=$SCRATCH/to_delete
30 |
31 | python -m cc_pseudo_crawl.python_scripts.preprocess_dataset \
32 | --dataset-path $DATASET_PATH \
33 | --num-proc 80 \
34 | --save-path $SAVE_DATASET_PATH \
35 | --use-datasets-caching \
36 | --flavor seed
37 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_1/slurm_scripts/redownload_warc.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=pseudo_crawl_redownload
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
5 | #SBATCH --cpus-per-task=4 # number of cores per tasks
6 | #SBATCH --hint=nomultithread # we get physical cores not logical
7 | #SBATCH --partition=prepost
8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)
9 | #SBATCH --output=/gpfswork/rech/six/uty16tp/code/big_science/logs/re_dowload/%x-%j.out # output file name
10 | #SBATCH --array=0-9 # TODO: modify according to the number of models you want to evaluate
11 | #SBATCH --account=six@cpu
12 |
13 | set -x -e
14 |
15 | source $six_ALL_CCFRWORK/start-prod
16 | conda activate thomas_data_tooling # Debug deepspeed temporarily
17 |
18 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/cc
19 | DATA_TOOLING_REPO=$WORK/repos/sync_data_tooling/data_tooling
20 | DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets/bigscience-catalogue-data/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--"$SLURM_ARRAY_TASK_COUNT"
21 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets-redownload/bigscience-catalogue-data
22 | SAVE_DATASET_PATH=$SAVE_DATASET_DIR/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--"$SLURM_ARRAY_TASK_COUNT"
23 | echo $DATASET_PATH
24 | pushd $DATA_TOOLING_REPO
25 |
26 | mkdir -p $SAVE_DATASET_DIR
27 |
28 | export HF_DATASETS_OFFLINE=1
29 | export HF_DATASETS_CACHE=$SCRATCH/to_delete
30 |
31 | python -m cc_pseudo_crawl.python_scripts.redownload_warc \
32 | --dataset-path $DATASET_PATH \
33 | --num-proc 4 \
34 | --save-path $SAVE_DATASET_PATH \
35 | --use-datasets-caching
36 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_1/slurm_scripts/shard_and_compress.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=pseudo_crawl_shard_and_compress
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
5 | #SBATCH --cpus-per-task=40 # number of cores per tasks
6 | #SBATCH --hint=nomultithread # we get physical cores not logical
7 | #SBATCH --partition=cpu_p1
8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)
9 | #SBATCH --output=logs/fix_compress_cpu_p1_2/%x-%j.out # output file name
10 | #SBATCH --array=0-604 # TODO: modify according to the number of models you want to evaluate
11 | #SBATCH --mail-type=ALL
12 | #SBATCH --account=six@cpu
13 |
14 | set -x -e
15 |
16 | source $six_ALL_CCFRWORK/start-prod
17 | conda activate thomas_data_tooling # Debug deepspeed temporarily
18 |
19 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/cc
20 | DATA_TOOLING_REPO=$WORK/code/big_science/data_tooling
21 |
22 | pushd $DATA_TOOLING_REPO
23 |
24 | SEED_ID=$(python cc_pseudo_crawl/seeds_batch_1/load_all_seed_ids.py --seed-path "$DATA_TOOLING_REPO"/cc_pseudo_crawl/seeds_batch_1/sourcing_sheet_seeds/ --seed-index $SLURM_ARRAY_TASK_ID)
25 | # SEED_IDS=(
26 | # 689
27 | # 510
28 | # 550
29 | # )
30 | # SEED_ID=${SEED_IDS[$SLURM_ARRAY_TASK_ID]}
31 |
32 | # NODES_PER_SEED=15
33 |
34 | # INDEX_SLICE=$(python -c "print($SLURM_ARRAY_TASK_ID % $NODES_PER_SEED)")
35 | # SEED_ID=${SEED_IDS[$(python -c "print($SLURM_ARRAY_TASK_ID // $NODES_PER_SEED)")]}
36 |
37 | echo "Sharding and compressing seed id ${SEED_ID}"
38 |
39 | DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets-seeds/bigscience-catalogue-data/pseudo_crawl_seed--seed-id--"$SEED_ID"
40 | SAVE_DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets-compressed-shards/bigscience-catalogue-data/seed_id="$SEED_ID"
41 |
42 | mkdir -p $SAVE_DATASET_PATH
43 |
44 | export HF_DATASETS_OFFLINE=1
45 | export HF_DATASETS_CACHE=$SCRATCH/to_delete
46 |
47 | python cc_pseudo_crawl/python_scripts/shard_and_compress.py \
48 | --dataset-path $DATASET_PATH \
49 | --max-size 10_000_000_000 \
50 | --num-proc 4 \
51 | --save-num-proc 10 \
52 | --save-path $SAVE_DATASET_PATH \
53 | --save-batch-size 10
54 | # --index-slice $INDEX_SLICE
55 | # --total-number-slice $NODES_PER_SEED
56 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_1/slurm_scripts/shard_by_seed_id.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=pseudo_crawl_shard_by_id
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
5 | #SBATCH --cpus-per-task=4 # number of cores per tasks
6 | #SBATCH --hint=nomultithread # we get physical cores not logical
7 | #SBATCH --partition=cpu_p1
8 | #SBATCH --time 04:00:00 # maximum execution time (HH:MM:SS)
9 | #SBATCH --output=logs/%x-%j.out # output file name
10 | #SBATCH --array=0-999 # TODO: modify according to the number of models you want to evaluate
11 | #SBATCH --mail-type=ALL
12 | #SBATCH --mail-user=thomas.wang@huggingface.co
13 | #SBATCH --account=six@cpu
14 |
15 | set -x -e
16 |
17 | source $six_ALL_CCFRWORK/start-prod
18 | conda activate thomas_data_tooling # Debug deepspeed temporarily
19 |
20 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/cc
21 | DATA_TOOLING_REPO=$WORK/code/big_science/data_tooling
22 | DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets-preprocessed-text-extracted/bigscience-catalogue-data/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--1000
23 | SAVE_DATASET_PREFIX_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1/datasets-shard-by-seed-id/bigscience-catalogue-data/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--1000
24 | pushd $DATA_TOOLING_REPO
25 |
26 | mkdir -p $(dirname $SAVE_DATASET_PREFIX_PATH)
27 |
28 | export HF_DATASETS_OFFLINE=1
29 | export HF_DATASETS_CACHE=$SCRATCH/to_delete
30 |
31 | python cc_pseudo_crawl/python_scripts/shard_by_seed_id.py \
32 | --dataset-path $DATASET_PATH \
33 | --num-proc 4 \
34 | --save-prefix-path $SAVE_DATASET_PREFIX_PATH
35 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_1/sourcing_sheet_seeds/README.md:
--------------------------------------------------------------------------------
1 | # Pseudo-Crawl Data Sourcing Candidate Seeds Spreadsheet
2 |
3 | Source: https://docs.google.com/spreadsheets/d/1DNLAGz--qvLh-0qQ7pMPGiNeUMgp-fRgn-8mbLagC7U/edit#gid=513216703 (timestamp 2021-11-28, reverted edits by anonymous user on record 16 - diariovasco.com), exported as [candidate_websites_for_crawling.csv](./candidate_websites_for_crawling.csv)
4 |
5 | Steps:
6 |
7 | 1. run [cleanup-seeds](./cleanup-seeds.ipynb) to prepare a clean seed list
8 |
9 | 2. do the lookups / table join (see the [general instructions](../README.md)), using the crawl selector `CC-MAIN-202[01]` to restrict the join to the last 2 years (illustrated by the snippet below)
10 |
11 | 3. prepare [coverage metrics](./cc-metrics.ipynb)
12 |
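13 | As a rough illustration (a hypothetical snippet, not part of the pipeline), the selector is a character class over the standard `CC-MAIN-YYYY-WW` crawl names, so it keeps only the 2020 and 2021 crawls:
14 | 
15 | ```python
16 | import re
17 | 
18 | crawls = ["CC-MAIN-2019-51", "CC-MAIN-2020-05", "CC-MAIN-2021-43", "CC-MAIN-2022-05"]
19 | print([c for c in crawls if re.match(r"CC-MAIN-202[01]", c)])
20 | # ['CC-MAIN-2020-05', 'CC-MAIN-2021-43']
21 | ```
22 | 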
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_1_2/00_clean_dataset.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=pseudo_crawl_clean_dataset
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
5 | #SBATCH --cpus-per-task=40 # number of cores per tasks
6 | #SBATCH --hint=nomultithread # we get physical cores not logical
7 | #SBATCH --partition=cpu_p1
8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)
9 | #SBATCH --output=/gpfsscratch/rech/six/commun/pseudo_crawl/seeds_batch_1_2/logs/clean_dataset-v2/%x-%j.out # output file name #TODO change path if necessary
10 | #SBATCH --array=0-613
11 | #SBATCH --account=six@cpu
12 |
13 | set -x -e
14 |
15 | source $six_ALL_CCFRWORK/start-prod
16 | conda activate thomas_data_tooling
17 |
18 | DATA_TOOLING_REPO=$WORK/code/big_science/data_tooling
19 |
20 | DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/hub/pseudo_crawl
21 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1_2/datasets-clean/bigscience-catalogue-data
22 | echo $DATASET_PATH
23 | pushd $DATA_TOOLING_REPO
24 |
25 | SEED_ID=$(python cc_pseudo_crawl/python_scripts/load_all_seed_ids.py \
26 | --seed-paths "$DATA_TOOLING_REPO"/cc_pseudo_crawl/seeds_batch_1/sourcing_sheet_seeds/seeds.csv,"$DATA_TOOLING_REPO"/cc_pseudo_crawl/seeds_batch_2/sourcing_sheet_seeds/seeds.csv \
27 | --seed-index $SLURM_ARRAY_TASK_ID \
28 | )
29 |
30 | mkdir -p $SAVE_DATASET_DIR
31 |
32 | export HF_DATASETS_OFFLINE=1
33 | export HF_DATASETS_CACHE=$SCRATCH/to_delete
34 |
35 | python cc_pseudo_crawl/python_scripts/pseudo_crawl_seed_to_lm_dset_v2.py \
36 | --seed-id $SEED_ID \
37 | --save-dir $SAVE_DATASET_DIR \
38 | --pseudo_crawl_path $DATASET_PATH \
39 | --batch-size 10 \
40 | --save-batch-size 10 \
41 | --num-proc 10 \
42 | --min-chars 32 \
43 | --n-records 10000 \
44 | --pourcentage-threshold 0.01 \
45 | --min-repetition-threshold 10
46 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_1_2/01_exact_deduplicates.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=pseudo_crawl_deduplicate
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
5 | #SBATCH --cpus-per-task=4 # number of cores per tasks
6 | #SBATCH --hint=nomultithread # we get physical cores not logical
7 | #SBATCH --partition=cpu_p1
8 | #SBATCH --time 2:00:00 # maximum execution time (HH:MM:SS)
9 | #SBATCH --output=/gpfsscratch/rech/six/commun/pseudo_crawl/seeds_batch_1_2/logs/deduplicate-on-clean-v2/%x-%j.out # output file name #TODO change path if necessary
10 | #SBATCH --array=0-613
11 | #SBATCH --account=six@cpu
12 |
13 | set -x -e
14 |
15 | source $six_ALL_CCFRWORK/start-prod
16 | conda activate thomas_data_tooling
17 |
18 | DATA_TOOLING_REPO=$WORK/repos/sync_data_tooling/data_tooling
19 |
20 | DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1_2/datasets-clean/bigscience-catalogue-data
21 |
22 | SEED_ID=$(python cc_pseudo_crawl/python_scripts/load_all_seed_ids.py \
23 | --seed-paths "$DATA_TOOLING_REPO"/cc_pseudo_crawl/seeds_batch_1/sourcing_sheet_seeds/seeds.csv,"$DATA_TOOLING_REPO"/cc_pseudo_crawl/seeds_batch_2/sourcing_sheet_seeds/seeds.csv \
24 | --seed-index $SLURM_ARRAY_TASK_ID \
25 | )
26 |
27 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_1_2/datasets-deduplicate-on-clean-v2/bigscience-catalogue-data/lm_change_lang_id_seed_id_${SEED_ID}_pseudocrawl_change_name
28 | echo $DATASET_PATH
29 | pushd $DATA_TOOLING_REPO
30 |
31 | mkdir -p $SAVE_DATASET_DIR
32 |
33 | export HF_DATASETS_OFFLINE=1
34 | export HF_DATASETS_CACHE=$SCRATCH/to_delete
35 |
36 | python cc_pseudo_crawl/python_scripts/exact_deduplicates.py \
37 | --seed-id $SEED_ID \
38 | --save-dir $SAVE_DATASET_DIR \
39 | --pseudo_crawl_path $DATASET_PATH \
40 | --batch-size 1000 \
41 | --save-batch-size 1000 \
42 | --num-proc 8
43 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_2/.gitignore:
--------------------------------------------------------------------------------
1 | sourcing_sheet_seeds/seeds_batch_2.gz.parquet
2 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_2/README.md:
--------------------------------------------------------------------------------
1 | # Extracting Content from Common Crawl for 2nd Curated List of Sites
2 |
3 | This folder gathers the scripts necessary to create the extension of the pseudo crawl dataset with the new seeds:
4 | - https://www.bbc.com/swahili
5 | - https://www.bbc.com/gahuza
6 | - https://www.bbc.com/igbo
7 | - https://www.bbc.com/yoruba
8 | - https://yo.globalvoices.org
9 | - https://ig.globalvoices.org
10 | - https://www.dw.com/sw
11 | - https://www.mwananchi.co.tz
12 | - https://www.voaswahili.com
13 | - https://www.voaswahili.com/
14 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_2/slurm_scripts/01_download_warc.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=pseudo_crawl_download_v1
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
5 | #SBATCH --cpus-per-task=4 # number of cores per tasks
6 | #SBATCH --hint=nomultithread # we get physical cores not logical
7 | #SBATCH --partition=prepost
8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)
9 | #SBATCH --output=/gpfsscratch/rech/six/commun/pseudo_crawl/seeds_batch_2/logs/%x-%j.out # output file name #TODO change path if necessary
10 | #SBATCH --array=0-99 #TODO set correct number
11 | #SBATCH --account=six@cpu
12 |
13 | set -x -e
14 |
15 | source $six_ALL_CCFRWORK/start-prod
16 | conda activate thomas_data_tooling
17 |
18 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/cc
19 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets
20 | DATA_TOOLING_REPO=$WORK/repos/sync_data_tooling/data_tooling
21 | pushd $DATA_TOOLING_REPO
22 |
23 | mkdir -p $SAVE_DATASET_DIR
24 |
25 | # TODO run this offline
26 | # aws s3 sync s3://commoncrawl-dev/big-science-workshop/data-sourcing-sheet/cc-{FLAVOR}/ $CC_INDEX_FOLDER/cc-{FLAVOR}/
27 |
28 | export HF_DATASETS_OFFLINE=1
29 | export HF_DATASETS_CACHE=$SCRATCH/to_delete
30 |
31 | python cc_pseudo_crawl/python_scripts/download_warc.py \
32 | --dataset bigscience-catalogue-data/pseudo_crawl_seed \
33 | --cc-index-folder $CC_INDEX_FOLDER \
34 | --save-dir $SAVE_DATASET_DIR \
35 | --num-proc 4 \
36 | --shard-id $SLURM_ARRAY_TASK_ID \
37 | --num-shards $SLURM_ARRAY_TASK_COUNT \
38 | --flavor "seeds_batch_2" \
39 | --use-datasets-caching
40 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_2/slurm_scripts/02_redownload_warc.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=pseudo_crawl_redownload
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
5 | #SBATCH --cpus-per-task=4 # number of cores per tasks
6 | #SBATCH --hint=nomultithread # we get physical cores not logical
7 | #SBATCH --partition=compil
8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)
9 | #SBATCH --output=/gpfsscratch/rech/six/commun/pseudo_crawl/seeds_batch_2/logs/%x-%j.out # output file name #TODO change path if necessary
10 | #SBATCH --array=81-99%5
11 | #SBATCH --account=six@cpu
12 |
13 | set -x -e
14 |
15 | source $six_ALL_CCFRWORK/start-prod
16 | conda activate thomas_data_tooling
17 |
18 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/cc
19 | DATA_TOOLING_REPO=$WORK/repos/sync_data_tooling/data_tooling
20 |
21 | DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets/bigscience-catalogue-data/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--100
22 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-redownload/bigscience-catalogue-data
23 | SAVE_DATASET_PATH=$SAVE_DATASET_DIR/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--100
24 | echo $DATASET_PATH
25 | pushd $DATA_TOOLING_REPO
26 |
27 | mkdir -p $SAVE_DATASET_DIR
28 |
29 | export HF_DATASETS_OFFLINE=1
30 | export HF_DATASETS_CACHE=$SCRATCH/to_delete
31 |
32 | python -m cc_pseudo_crawl.python_scripts.redownload_warc \
33 | --dataset-path $DATASET_PATH \
34 | --num-proc 1 \
35 | --save-path $SAVE_DATASET_PATH \
36 | --use-datasets-caching
37 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_2/slurm_scripts/02b_redownload_warc.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=pseudo_crawl_redownload
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
5 | #SBATCH --cpus-per-task=4 # number of cores per tasks
6 | #SBATCH --hint=nomultithread # we get physical cores not logical
7 | #SBATCH --partition=compil
8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)
9 | #SBATCH --output=/gpfsscratch/rech/six/commun/pseudo_crawl/seeds_batch_2/logs/02b/%x-%j.out # output file name #TODO change path if necessary
10 | #SBATCH --array=0-99%5
11 | #SBATCH --account=six@cpu
12 |
13 | set -x -e
14 |
15 | source $six_ALL_CCFRWORK/start-prod
16 | conda activate thomas_data_tooling
17 |
18 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/cc
19 | DATA_TOOLING_REPO=$WORK/code/big_science/data_tooling
20 |
21 | DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-redownload/bigscience-catalogue-data/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--100
22 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-redownload/bigscience-catalogue-data
23 | SAVE_DATASET_PATH=$SAVE_DATASET_DIR/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--100
24 | echo $DATASET_PATH
25 | pushd $DATA_TOOLING_REPO
26 |
27 | mkdir -p $SAVE_DATASET_DIR
28 |
29 | export HF_DATASETS_OFFLINE=1
30 | export HF_DATASETS_CACHE=$SCRATCH/to_delete
31 |
32 | python -m cc_pseudo_crawl.python_scripts.redownload_warc \
33 | --dataset-path $DATASET_PATH \
34 | --num-proc 1 \
35 | --save-path $SAVE_DATASET_PATH \
36 | --use-datasets-caching
37 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_2/slurm_scripts/03_check_errors_in_dataset.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=pseudo_crawl_check_erros_in_dataset
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
5 | #SBATCH --cpus-per-task=4 # number of cores per tasks
6 | #SBATCH --hint=nomultithread # we get physical cores not logical
7 | #SBATCH --partition=cpu_p1
8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)
9 | #SBATCH --output=/gpfsscratch/rech/six/commun/pseudo_crawl/seeds_batch_2/logs/%x-%j.out # output file name #TODO change path if necessary
10 | #SBATCH --account=six@cpu
11 |
12 | set -x -e
13 |
14 | source $six_ALL_CCFRWORK/start-prod
15 | conda activate thomas_data_tooling
16 |
17 | DATA_TOOLING_REPO=$WORK/repos/sync_data_tooling/data_tooling #TODO change path if necessary
18 |
19 | DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-redownload/bigscience-catalogue-data
20 | echo $DATASET_DIR
21 | pushd $DATA_TOOLING_REPO
22 |
23 |
24 | export HF_DATASETS_OFFLINE=1
25 | export HF_DATASETS_CACHE=$SCRATCH/to_delete
26 |
27 | python -m cc_pseudo_crawl.python_scripts.check_erros_in_dataset \
28 | --dataset-dir $DATASET_DIR
29 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_2/slurm_scripts/04_divide_in_subshards.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=pseudo_crawl_divide_in_subshards
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
5 | #SBATCH --hint=nomultithread # we get physical cores not logical
6 | #SBATCH --partition=cpu_p1
7 | #SBATCH --cpus-per-task=4 # number of cores per tasks
8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)
9 | #SBATCH --output=/gpfsscratch/rech/six/commun/pseudo_crawl/seeds_batch_2/logs/divide_in_subshards/%x-%j.out           # output file name #TODO change path if necessary
10 | #SBATCH --array=0-99 #TODO set correct number
11 | #SBATCH --account=six@cpu
12 |
13 | set -x -e
14 |
15 | source $six_ALL_CCFRWORK/start-prod
16 | conda activate thomas_data_tooling
17 |
18 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/cc #TODO change path if necessary
19 | DATA_TOOLING_REPO=$WORK/code/big_science/data_tooling #TODO change path if necessary
20 |
21 | DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-redownload/bigscience-catalogue-data/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--"$SLURM_ARRAY_TASK_COUNT"
22 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-divide-in-subshards/bigscience-catalogue-data
23 | echo $DATASET_PATH
24 | pushd $DATA_TOOLING_REPO
25 |
26 | mkdir -p $SAVE_DATASET_DIR
27 |
28 | export HF_DATASETS_OFFLINE=1
29 | export HF_DATASETS_CACHE=$SCRATCH/to_delete
30 |
31 | python -m cc_pseudo_crawl.python_scripts.divide_in_shards \
32 | --dataset-path $DATASET_PATH \
33 | --save-dir $SAVE_DATASET_DIR \
34 | --num-shards 10
35 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_2/slurm_scripts/05_preprocess_warc.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=pseudo_crawl_preprocess
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
5 | #SBATCH --cpus-per-task=40 # number of cores per tasks
6 | #SBATCH --hint=nomultithread # we get physical cores not logical
7 | #SBATCH --partition=cpu_p1
8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)
9 | #SBATCH --output=/gpfsscratch/rech/six/commun/pseudo_crawl/seeds_batch_2/logs/preprocess_warc/%x-%j.out # output file name #TODO change path if necessary
10 | #SBATCH --array=0-99 #TODO set correct number
11 | #SBATCH --account=six@cpu
12 |
13 | set -x -e
14 |
15 | source $six_ALL_CCFRWORK/start-prod
16 | conda activate thomas_data_tooling
17 |
18 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/cc #TODO change path if necessary
19 | DATA_TOOLING_REPO=$WORK/code/big_science/data_tooling #TODO change path if necessary
20 |
21 | DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-divide-in-subshards/bigscience-catalogue-data/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--"$SLURM_ARRAY_TASK_COUNT"
22 | SAVE_DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-preprocessed/bigscience-catalogue-data
23 | SAVE_DATASET_PATH=$SAVE_DATASET_DIR/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--"$SLURM_ARRAY_TASK_COUNT"
24 | echo $DATASET_PATH
25 | pushd $DATA_TOOLING_REPO
26 |
27 | mkdir -p $SAVE_DATASET_DIR
28 |
29 | export HF_DATASETS_OFFLINE=1
30 | export HF_DATASETS_CACHE=$SCRATCH/to_delete
31 |
32 | python -m cc_pseudo_crawl.python_scripts.preprocess_dataset \
33 | --dataset-path $DATASET_PATH \
34 | --num-proc 80 \
35 | --save-path $SAVE_DATASET_PATH \
36 | --use-datasets-caching \
37 | --flavor seed
38 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_2/slurm_scripts/07_shard_by_seed_id.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=pseudo_crawl_shard_by_id
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
5 | #SBATCH --cpus-per-task=4 # number of cores per tasks
6 | #SBATCH --hint=nomultithread # we get physical cores not logical
7 | #SBATCH --partition=cpu_p1
8 | #SBATCH --time 04:00:00 # maximum execution time (HH:MM:SS)
9 | #SBATCH --output=/gpfsscratch/rech/six/commun/pseudo_crawl/seeds_batch_2/logs/shard_by_seed_id/%x-%j.out # output file name #TODO change path if necessary
10 | #SBATCH --array=0-99 #TODO set correct number
11 | #SBATCH --account=six@cpu
12 |
13 | set -x -e
14 |
15 | source $six_ALL_CCFRWORK/start-prod
16 | conda activate thomas_data_tooling
17 |
18 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/cc
19 | DATA_TOOLING_REPO=$WORK/repos/sync_data_tooling/data_tooling
20 |
21 | DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-preprocessed-text-extracted/bigscience-catalogue-data/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--100
22 | SAVE_DATASET_PREFIX_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-shard-by-seed-id/bigscience-catalogue-data/pseudo_crawl_seed--"$SLURM_ARRAY_TASK_ID"--100
23 | pushd $DATA_TOOLING_REPO
24 |
25 | mkdir -p $(dirname $SAVE_DATASET_PREFIX_PATH)
26 |
27 | export HF_DATASETS_OFFLINE=1
28 | export HF_DATASETS_CACHE=$SCRATCH/to_delete
29 |
30 | python cc_pseudo_crawl/python_scripts/shard_by_seed_id.py \
31 | --dataset-path $DATASET_PATH \
32 | --num-proc 4 \
33 | --save-prefix-path $SAVE_DATASET_PREFIX_PATH
34 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_2/slurm_scripts/08_merge_seed_shards.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=pseudo_crawl_merge_seed_shards
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
5 | #SBATCH --cpus-per-task=4 # number of cores per tasks
6 | #SBATCH --hint=nomultithread # we get physical cores not logical
7 | #SBATCH --partition=cpu_p1
8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)
9 | #SBATCH --output=/gpfsscratch/rech/six/commun/pseudo_crawl/seeds_batch_2/logs/merge_seed_shards/%x-%j.out # output file name #TODO change path if necessary
10 | #SBATCH --array=0-99 #TODO set correct number
11 | #SBATCH --account=six@cpu
12 |
13 |
14 | set -x -e
15 |
16 | source $six_ALL_CCFRWORK/start-prod
17 | conda activate thomas_data_tooling
18 |
19 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/cc
20 | DATA_TOOLING_REPO=$WORK/repos/sync_data_tooling/data_tooling
21 |
22 | DATASET_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-shard-by-seed-id/bigscience-catalogue-data
23 | pushd $DATA_TOOLING_REPO
24 |
25 | SEED_ID=$(python cc_pseudo_crawl/python_scripts/load_all_seed_ids.py --seed-path "$DATA_TOOLING_REPO"/cc_pseudo_crawl/seeds_batch_2/sourcing_sheet_seeds/seeds.csv --seed-index $SLURM_ARRAY_TASK_ID)
26 | echo "Merging all shards of seed id ${SEED_ID}"
27 | SAVE_DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-seeds/bigscience-catalogue-data/pseudo_crawl_seed--seed-id--"$SEED_ID"
28 |
29 | export HF_DATASETS_OFFLINE=1
30 | export HF_DATASETS_CACHE=$SCRATCH/to_delete
31 |
32 | python cc_pseudo_crawl/python_scripts/merge_seed_shards.py \
33 | --dataset-dir $DATASET_DIR \
34 | --seed-id $SEED_ID \
35 | --save-path $SAVE_DATASET_PATH
36 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_2/slurm_scripts/09_shard_and_compress.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=pseudo_crawl_shard_and_compress
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
5 | #SBATCH --cpus-per-task=40 # number of cores per tasks
6 | #SBATCH --hint=nomultithread # we get physical cores not logical
7 | #SBATCH --partition=cpu_p1
8 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)
9 | #SBATCH --output=/gpfsscratch/rech/six/commun/pseudo_crawl/seeds_batch_2/logs/shard_and_compress/%x-%j.out # output file name #TODO change path if necessary
10 | #SBATCH --array=0-8
11 | #SBATCH --account=six@cpu
12 |
13 | set -x -e
14 |
15 | source $six_ALL_CCFRWORK/start-prod
16 | conda activate thomas_data_tooling
17 |
18 | CC_INDEX_FOLDER=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/cc #TODO change path if necessary
19 | DATA_TOOLING_REPO=$WORK/code/big_science/data_tooling #TODO change path if necessary
20 |
21 | pushd $DATA_TOOLING_REPO
22 |
23 | SEED_ID=$(python cc_pseudo_crawl/python_scripts/load_all_seed_ids.py --seed-path "$DATA_TOOLING_REPO"/cc_pseudo_crawl/seeds_batch_2/sourcing_sheet_seeds/seeds.csv --seed-index $SLURM_ARRAY_TASK_ID)
24 | # SEED_IDS=(
25 | # 689
26 | # 510
27 | # 550
28 | # )
29 | # SEED_ID=${SEED_IDS[$SLURM_ARRAY_TASK_ID]}
30 |
31 | # NODES_PER_SEED=15
32 |
33 | # INDEX_SLICE=$(python -c "print($SLURM_ARRAY_TASK_ID % $NODES_PER_SEED)")
34 | # SEED_ID=${SEED_IDS[$(python -c "print($SLURM_ARRAY_TASK_ID // $NODES_PER_SEED)")]}
35 |
36 | echo "Sharding and compressing seed id ${SEED_ID}"
37 |
38 | DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-seeds/bigscience-catalogue-data/pseudo_crawl_seed--seed-id--"$SEED_ID"
39 | SAVE_DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-compressed-shards/bigscience-catalogue-data/seed_id="$SEED_ID"
40 |
41 | mkdir -p $SAVE_DATASET_PATH
42 |
43 | export HF_DATASETS_OFFLINE=1
44 | export HF_DATASETS_CACHE=$SCRATCH/to_delete
45 |
46 | python cc_pseudo_crawl/python_scripts/shard_and_compress.py \
47 | --dataset-path $DATASET_PATH \
48 | --max-size 10_000_000_000 \
49 | --num-proc 4 \
50 | --save-num-proc 10 \
51 | --save-path $SAVE_DATASET_PATH \
52 | --save-batch-size 10
53 | # --index-slice $INDEX_SLICE
54 | # --total-number-slice $NODES_PER_SEED
55 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_2/slurm_scripts/10_push_to_hub.slurm:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=pseudo-crawl-push-to-hub # (change me!) job name
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
5 | #SBATCH --cpus-per-task=4 # (change me! between 0 and 48) number of cores per tasks
6 | #SBATCH --hint=nomultithread # we get physical cores not logical
7 | #SBATCH --time 10:00:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS)
8 | #SBATCH --gres=gpu:0 # (change me! between 0 and 1) number of gpus
9 | #SBATCH --output=logs/%x-%j.out # output file name #TODO change path if necessary
10 | #SBATCH --account=six@cpu # account
11 | #SBATCH -p compil # partition with internet
12 |
13 | set -x -e
14 |
15 | source $HOME/start-modelling-metadata-user
16 |
17 | # mv $six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-compressed-shards/bigscience-catalogue-data/* $six_ALL_CCFRSCRATCH/pseudo_crawl/hub/pseudo_crawl/ not used
18 |
19 | rsync -vam -f'+ *.jsonl.gz' -f'+ */' -f'- *' $six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-compressed-shards/bigscience-catalogue-data/* $six_ALL_CCFRSCRATCH/pseudo_crawl/hub/pseudo_crawl/
20 |
21 | # ls $six_ALL_CCFRSCRATCH/pseudo_crawl/seeds_batch_2/datasets-compressed-shards/bigscience-catalogue-data/ -f'+ *.jsonl.gz' | xargs -n1 -P8 -I% rsync -Pa % $six_ALL_CCFRSCRATCH/pseudo_crawl/hub/pseudo_crawl/ not used
22 |
23 |
24 | cd $six_ALL_CCFRSCRATCH/pseudo_crawl/hub/pseudo_crawl/
25 |
26 | git status
27 |
28 | for seed_id in {698..708}
29 | do
30 | echo "Add seed id n°$seed_id"
31 | git add -v *seed_id="$seed_id"/*.gz
32 | done
33 |
34 | git commit -v -m "add depth 0 dataset with html content extracted"
35 | git push -v
36 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_2/sourcing_sheet_seeds/seeds.csv:
--------------------------------------------------------------------------------
1 | id
2 | 698
3 | 699
4 | 700
5 | 701
6 | 702
7 | 703
8 | 704
9 | 705
10 | 706
11 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_2/sourcing_sheet_seeds/seeds_batch_2.csv:
--------------------------------------------------------------------------------
1 | uid,type,description,link,url_path_prefix,url_host_name,url_host_registered_domain,url_surtkey,id
2 | bbc_swahili,primary,{'homepage': 'https://www.bbc.com/swahili'},https://www.bbc.com/swahili,/swahili,www.bbc.com,bbc.com,"com,bbc)/swahili",698
3 | bbc_gahuza,primary,{'homepage': 'https://www.bbc.com/gahuza'},https://www.bbc.com/gahuza,/gahuza,www.bbc.com,bbc.com,"com,bbc)/gahuza",699
4 | bbc_igbo,primary,{'homepage': 'https://www.bbc.com/igbo'},https://www.bbc.com/igbo,/igbo,www.bbc.com,bbc.com,"com,bbc)/igbo",700
5 | bbc_yoruba,primary,{'homepage': 'https://www.bbc.com/yoruba'},https://www.bbc.com/yoruba,/yoruba,www.bbc.com,bbc.com,"com,bbc)/yoruba",701
6 | global_voices_yoruba,primary,{'homepage': 'https://yo.globalvoices.org'},https://yo.globalvoices.org/,/,yo.globalvoices.org,globalvoices.org,"org,globalvoices,yo)/",702
7 | global_voices_igbo,primary,{'homepage': 'https://ig.globalvoices.org'},https://ig.globalvoices.org/,/,ig.globalvoices.org,globalvoices.org,"org,globalvoices,ig)/",703
8 | dw_swahili,primary,{'homepage': 'https://www.dw.com/sw'},https://www.dw.com/sw,/sw,www.dw.com,dw.com,"com,dw)/sw",704
9 | mwananchi_,primary,{'homepage': 'https://www.mwananchi.co.tz'},https://www.mwananchi.co.tz/,/,www.mwananchi.co.tz,mwananchi.co.tz,"tz,co,mwananchi)/",705
10 | voa_swahili,primary,{'homepage': 'https://www.voaswahili.com'},https://www.voaswahili.com/,/,www.voaswahili.com,voaswahili.com,"com,voaswahili)/",706
11 |
--------------------------------------------------------------------------------
/cc_pseudo_crawl/seeds_batch_2/sourcing_sheet_seeds/seeds_batch_2.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "uid": "bbc_swahili",
4 | "type": "primary",
5 | "description": {
6 | "homepage": "https://www.bbc.com/swahili"
7 | }
8 | },
9 | {
10 | "uid": "bbc_gahuza",
11 | "type": "primary",
12 | "description": {
13 | "homepage": "https://www.bbc.com/gahuza"
14 | }
15 | },
16 | {
17 | "uid": "bbc_igbo",
18 | "type": "primary",
19 | "description": {
20 | "homepage": "https://www.bbc.com/igbo"
21 | }
22 | },
23 | {
24 | "uid": "bbc_yoruba",
25 | "type": "primary",
26 | "description": {
27 | "homepage": "https://www.bbc.com/yoruba"
28 | }
29 | },
30 | {
31 | "uid": "global_voices_yoruba",
32 | "type": "primary",
33 | "description": {
34 | "homepage": "https://yo.globalvoices.org"
35 | }
36 | },
37 | {
38 | "uid": "global_voices_igbo",
39 | "type": "primary",
40 | "description": {
41 | "homepage": "https://ig.globalvoices.org"
42 | }
43 | },
44 | {
45 | "uid": "dw_swahili",
46 | "type": "primary",
47 | "description": {
48 | "homepage": "https://www.dw.com/sw"
49 | }
50 | },
51 | {
52 | "uid": "mwananchi_",
53 | "type": "primary",
54 | "description": {
55 | "homepage": "https://www.mwananchi.co.tz"
56 | }
57 | },
58 | {
59 | "uid": "voa_swahili",
60 | "type": "primary",
61 | "description": {
62 | "homepage": "https://www.voaswahili.com"
63 | }
64 | }
65 |
66 | ]
67 |
--------------------------------------------------------------------------------
/index_search/README.md:
--------------------------------------------------------------------------------
1 | # Elasticsearch index search experiments
2 |
3 | Early tests building upon HuggingFace `datasets` to improve indexing/search capabilities.
4 |
5 | ## Pre-requisites
6 |
7 | Elasticsearch is launched as a cluster through Docker, so install Docker first if it is not already available: https://docs.docker.com/get-docker/
8 |
9 | The example is based on a forked version of `datasets` and some additional dependencies. Use `requirements.txt` to install everything needed; a conda environment can be used for the setup.
10 |
11 | ## Run
12 |
13 | * Go into the `index_search` folder and start the Elasticsearch cluster
14 |
15 | ```
16 | cd ./index_search
17 | docker compose up
18 | ```
19 |
20 | * Run the python script
21 |
22 | ```
23 | python datasets_index_search.py
24 | ```
25 |
26 | Note that it will start a Ray instance, which might require some ports to be open for local communication.
27 |
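Once the cluster is up and the script has built an index, a quick way to sanity-check it from Python is the standard `datasets` Elasticsearch integration (a minimal sketch; it assumes the default `localhost:9200` endpoint, an OSCAR split, and illustrative index/column names rather than the exact ones used by `datasets_index_search.py`):

```python
from datasets import load_dataset

# Load the corpus that was indexed (Norwegian Nynorsk OSCAR as an example).
dataset = load_dataset("oscar", "unshuffled_deduplicated_nn", split="train")

# Attach an Elasticsearch index over the "text" column of the local cluster.
dataset.add_elasticsearch_index(
    "text",
    host="localhost",
    port="9200",
    es_index_name="oscar_unshuffled_deduplicated",
)

# Retrieve the 5 documents closest to a query string.
scores, examples = dataset.get_nearest_examples("text", "mykje arbeid og slit", k=5)
for score, text in zip(scores, examples["text"]):
    print(f"{score:.2f}\t{text[:80]}")
```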
28 | ## TODO list
29 |
30 | Improve datasets indexing capabilities
31 | - [x] test switch to ngram indexing
32 | - [x] add a hash for each row
33 | - [x] parallel processing using ray and dataset shards
34 | - [x] enable re-connection to existing index in ES
35 | - [x] enable continuing indexing process
36 | - [x] ensure no duplicates with mmh3 hash
37 | - [x] instantiate datasets from elasticsearch query
38 | - [x] clear cache when instantiating with new query
39 | - [ ] validate that dataset info is propagated
40 | - [ ] check scalability
41 | - ~~allow export of search results in arrow for datasets or jsonl for export => specialized filter operation?~~
42 | - [ ] secure elasticsearch cluster: free read, protected write
43 | - [x] allow update on the dataset to be reflected with index update
44 |
--------------------------------------------------------------------------------
/index_search/datasets_ES_builder.py:
--------------------------------------------------------------------------------
1 | import simplejson as json
2 | from datasets.packaged_modules.elasticsearch.elasticsearch import ElasticsearchBuilder
3 |
4 | ca_file = "/Users/gdupont/src/github.com/bigscience-workshop/data-tooling/index_search/ca.cert"
5 | with open(
6 | "/Users/gdupont/src/github.com/bigscience-workshop/data-tooling/index_search/credentials.json"
7 | ) as f:
8 | credentials = json.load(f)
9 |
10 | the_host = credentials["connection"]["https"]["hosts"][0]["hostname"]
11 | the_port = credentials["connection"]["https"]["hosts"][0]["port"]
12 |
13 | username = credentials["connection"]["https"]["authentication"]["username"]
14 | psw = credentials["connection"]["https"]["authentication"]["password"]
15 |
16 | index_name = "oscar_unshuffled_deduplicated"
17 | oscar_lang_code = "nn"
18 |
19 | elasticsearch_builder = ElasticsearchBuilder(
20 | host=the_host,
21 | port=the_port,
22 | es_username=username,
23 | es_psw=psw,
24 | ca_file=ca_file,
25 | es_index_name=index_name,
26 | es_index_config=None,
27 | query="mykje arbeid og slit",
28 | )
29 |
30 | # elasticsearch_builder = ElasticsearchBuilder(
31 | # host="localhost",
32 | # port="9200",
33 | # es_index_name="oscar_unshuffled_deduplicated",
34 | # es_index_config=es_index_config,
35 | # query='"mykje arbeid og slit"'
36 | # )
37 |
38 | elasticsearch_builder.download_and_prepare()
39 |
40 | oscar_dataset_filtered = elasticsearch_builder.as_dataset()
41 | print(oscar_dataset_filtered.keys())
42 |
43 | first_split = next(iter(oscar_dataset_filtered))
44 |
45 | for i in range(0, 5):
46 | print(
47 | f"- [#{oscar_dataset_filtered[first_split]['id'][i]}] {oscar_dataset_filtered[first_split]['text'][i]}"
48 | )
49 |
--------------------------------------------------------------------------------
/index_search/datasets_ES_search.py:
--------------------------------------------------------------------------------
1 | import simplejson as json
2 | from datasets import load_dataset
3 |
4 | ca_file = "./ca.cert"
5 |
6 | with open("./credentials.json") as f:
7 | credentials = json.load(f)
8 |
9 | the_host = credentials["connection"]["https"]["hosts"][0]["hostname"]
10 | the_port = credentials["connection"]["https"]["hosts"][0]["port"]
11 |
12 | username = credentials["connection"]["https"]["authentication"]["username"]
13 | psw = credentials["connection"]["https"]["authentication"]["password"]
14 |
15 | index_name = "oscar_unshuffled_deduplicated"
16 | oscar_lang_code = "nn"
17 |
18 | my_dataset = load_dataset(
19 | "oscar", f"unshuffled_deduplicated_{oscar_lang_code}", split="train"
20 | )
21 |
22 | my_dataset.load_elasticsearch_index(
23 | index_name=index_name,
24 | host=the_host,
25 | port=the_port,
26 | es_username=username,
27 | es_psw=psw,
28 | ca_file=ca_file,
29 | es_index_name=index_name,
30 | es_index_config=None,
31 | )
32 |
33 | print(my_dataset)
34 |
35 | K = 10
36 | scores, retrieved = my_dataset.get_nearest_examples(
37 | index_name, "mykje arbeid og slit", k=K
38 | )
39 |
40 | for i in range(0, min(K, len(retrieved))):
41 | print(f"({i + 1})")
42 | print(f'\t@{scores[i]:.2f} - {retrieved["id"][i]} => {retrieved["text"][i]} \n')
43 |
--------------------------------------------------------------------------------
/index_search/datasets_remote_ES_IBMcloud.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import ssl
3 |
4 | import simplejson as json
5 | from elasticsearch import Elasticsearch
6 |
7 | with open("./credentials.json") as f:
8 | credentials = json.load(f)
9 |
10 | host = credentials["connection"]["https"]["hosts"][0]["hostname"]
11 | port = credentials["connection"]["https"]["hosts"][0]["port"]
12 |
13 | es_username = credentials["connection"]["https"]["authentication"]["username"]
14 | es_psw = credentials["connection"]["https"]["authentication"]["password"]
15 |
16 | ca_cert = base64.b64decode(
17 | credentials["connection"]["https"]["certificate"]["certificate_base64"]
18 | )
19 | # context = ssl.create_default_context()
20 | # context.verify_mode = ssl.CERT_REQUIRED
21 | # context.load_verify_locations(cadata=ca_cert)
22 |
23 | context = ssl.create_default_context(cafile="./ca.cert")
24 |
25 | server_url = (
26 | ("https" if context is not None else "http") + "://" + host + ":" + str(port)
27 | )
28 |
29 | es = Elasticsearch([server_url], http_auth=(es_username, es_psw), ssl_context=context)
30 |
31 | print(f"ES info {json.dumps(es.info(), indent=4 * ' ')}")
32 |
33 | # index_get_response = es.indices.get(index='oscar_unshuffled_deduplicated')
34 | # print(json.dumps(index_get_response, indent=4 * ' '))
35 |
36 | delete_response = es.indices.delete(index="oscar_unshuffled_deduplicated")
37 | print(json.dumps(delete_response, indent=4 * " "))
38 |
--------------------------------------------------------------------------------
/index_search/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '2.2'
2 | services:
3 | es01:
4 | image: docker.elastic.co/elasticsearch/elasticsearch:7.13.2
5 | container_name: es01
6 | environment:
7 | - node.name=es01
8 | - cluster.name=es-docker-cluster
9 | - discovery.seed_hosts=es02,es03
10 | - cluster.initial_master_nodes=es01,es02,es03
11 | - bootstrap.memory_lock=true
12 | - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
13 | ulimits:
14 | memlock:
15 | soft: -1
16 | hard: -1
17 | volumes:
18 | - data01:/usr/share/elasticsearch/data
19 | ports:
20 | - 9200:9200
21 | networks:
22 | - elastic
23 |
24 | es02:
25 | image: docker.elastic.co/elasticsearch/elasticsearch:7.13.2
26 | container_name: es02
27 | environment:
28 | - node.name=es02
29 | - cluster.name=es-docker-cluster
30 | - discovery.seed_hosts=es01,es03
31 | - cluster.initial_master_nodes=es01,es02,es03
32 | - bootstrap.memory_lock=true
33 | - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
34 | ulimits:
35 | memlock:
36 | soft: -1
37 | hard: -1
38 | volumes:
39 | - data02:/usr/share/elasticsearch/data
40 | networks:
41 | - elastic
42 |
43 | es03:
44 | image: docker.elastic.co/elasticsearch/elasticsearch:7.13.2
45 | container_name: es03
46 | environment:
47 | - node.name=es03
48 | - cluster.name=es-docker-cluster
49 | - discovery.seed_hosts=es01,es02
50 | - cluster.initial_master_nodes=es01,es02,es03
51 | - bootstrap.memory_lock=true
52 | - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
53 | ulimits:
54 | memlock:
55 | soft: -1
56 | hard: -1
57 | volumes:
58 | - data03:/usr/share/elasticsearch/data
59 | networks:
60 | - elastic
61 |
62 | kib01:
63 | image: docker.elastic.co/kibana/kibana:7.13.2
64 | container_name: kib01
65 | ports:
66 | - 5601:5601
67 | environment:
68 | ELASTICSEARCH_URL: http://es01:9200
69 | ELASTICSEARCH_HOSTS: '["http://es01:9200","http://es02:9200","http://es03:9200"]'
70 | networks:
71 | - elastic
72 |
73 | volumes:
74 | data01:
75 | driver: local
76 | data02:
77 | driver: local
78 | data03:
79 | driver: local
80 |
81 | networks:
82 | elastic:
83 | driver: bridge
84 |
--------------------------------------------------------------------------------
/index_search/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/ggdupont/datasets@bigscience_datatooling#egg=datasets
2 | elasticsearch==7.10.1
3 | iso-639==0.4.5
4 | ray~=1.4.1
5 | simplejson
6 |
--------------------------------------------------------------------------------
/kenlm_training/.gitignore:
--------------------------------------------------------------------------------
1 | # Dataset
2 | /data
3 | /test_data/
4 | /test_data2/
5 | /output/
6 |
7 | # Binary files
8 | /bin/
9 |
10 | # Third party code
11 | /third_party/
12 |
13 | # Generic to python
14 | __pycache__/
15 | *.pyc
16 | .mypy_cache/
17 |
18 | /scratch/
19 | /notebooks/
20 |
21 | /build/
22 | /cc_net.egg-info/
23 | /config/
24 | /dist/
25 | /pip-wheel-metadata/
26 |
27 | /.DS_Store
28 |
--------------------------------------------------------------------------------
/kenlm_training/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) Facebook, Inc. and its affiliates.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/kenlm_training/cc_net/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | #
3 | # This source code is licensed under the MIT license found in the
4 | # LICENSE file in the root directory of this source tree.
5 | #
6 |
--------------------------------------------------------------------------------
/kenlm_training/cc_net/__main__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | #
3 | # This source code is licensed under the MIT license found in the
4 | # LICENSE file in the root directory of this source tree.
5 | #
6 |
7 |
8 | import func_argparse
9 |
10 | import cc_net.mine
11 |
12 |
13 | def main():
14 | func_argparse.parse_and_call(cc_net.mine.get_main_parser())
15 |
16 |
17 | if __name__ == "__main__":
18 | main()
19 |
--------------------------------------------------------------------------------
/kenlm_training/cc_net/data/test_stats.json:
--------------------------------------------------------------------------------
1 | {
2 | "2019-09/de_head_0000.json.gz": {
3 | "size": 5264993,
4 | "checksum": "fc12ba3dc982ef06e7e44a916f298e1c16f9a806"
5 | },
6 | "2019-09/de_middle_0000.json.gz": {
7 | "size": 9195535,
8 | "checksum": "2369ff0296ab1d924c81083f17ce41f22a10ad69"
9 | },
10 | "2019-09/de_tail_0000.json.gz": {
11 | "size": 33029074,
12 | "checksum": "18865040a7263242d298958f358f7cb5511114d4"
13 | },
14 | "2019-09/fr_head_0000.json.gz": {
15 | "size": 4076580,
16 | "checksum": "4eef4017bbbe042fc01c45b5fbcf94de49f5138e"
17 | },
18 | "2019-09/fr_middle_0000.json.gz": {
19 | "size": 8075095,
20 | "checksum": "fd251a5b924c4aa66a63c375ca3a8fae23b3273b"
21 | },
22 | "2019-09/fr_tail_0000.json.gz": {
23 | "size": 27248949,
24 | "checksum": "4a8aed38abc6b9d04459e8d424bd47426f063638"
25 | },
26 | "2019-09/it_head_0000.json.gz": {
27 | "size": 1760696,
28 | "checksum": "e5e50e49b4a5147ea82b385babd5c83f74d2a4ed"
29 | },
30 | "2019-09/it_middle_0000.json.gz": {
31 | "size": 4461832,
32 | "checksum": "7daab7b7acb93d81e50534196ada4e94947b8224"
33 | },
34 | "2019-09/it_tail_0000.json.gz": {
35 | "size": 14754298,
36 | "checksum": "1adc018519a598ff162261d7e480ea41d3458768"
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/kenlm_training/cc_net/get_hf_dataset.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | import func_argparse
4 | from datasets import load_dataset
5 | from tqdm import tqdm
6 |
7 | from cc_net import text_normalizer
8 |
9 |
10 | def dl(
11 | dataset: str,
12 | output_file: str,
13 | name: Optional[str] = None,
14 | data_dir: Optional[str] = None,
15 | data_files: Optional[str] = None,
16 | split: Optional[str] = None,
17 | streaming: bool = True,
18 | accent: bool = False,
19 | case: bool = False,
20 | numbers: bool = True,
21 | punct: int = 1,
22 | max_docs: Optional[int] = None,
23 | seed: int = 0,
24 | buffer_size: int = 10000,
25 | ):
26 | """Download dataset from the Hugging Face hub."""
27 | dataset = load_dataset(
28 | dataset,
29 | name=name,
30 | data_dir=data_dir,
31 | data_files=data_files,
32 | split=split,
33 | streaming=streaming,
34 | )
35 | dataset_norm = dataset.map(
36 | lambda x: text_normalizer.normalize(
37 | x["text"], accent=accent, case=case, numbers=numbers, punct=punct
38 | )
39 | )
40 | dataset_norm = dataset_norm.shuffle(buffer_size=buffer_size, seed=seed)
41 | count = 0
42 | with open(output_file, "w") as o:
43 | with tqdm(total=max_docs) as pbar:
44 | for doc in dataset_norm:
45 | count += 1
46 | doc = doc.rstrip("\n")
47 | print(doc, file=o)
48 | if max_docs and count == max_docs:
49 | break
50 | pbar.update(1)
51 |
52 |
53 | if __name__ == "__main__":
54 | func_argparse.main(dl)
55 |
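# Illustrative usage only (the dataset name, config and output path below are
# placeholders, not values taken from this repository): the `dl` helper above
# can also be called directly from Python with the keyword arguments defined
# in its signature, instead of going through the func_argparse CLI.
#
#   from cc_net.get_hf_dataset import dl
#
#   dl(
#       dataset="oscar",
#       name="unshuffled_deduplicated_nn",
#       output_file="oscar_nn.txt",
#       split="train",
#       max_docs=1000,
#   )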
--------------------------------------------------------------------------------
/kenlm_training/cc_net/tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/kenlm_training/cc_net/tools/__init__.py
--------------------------------------------------------------------------------
/kenlm_training/config/lid_exp.json:
--------------------------------------------------------------------------------
1 | {
2 | "output_dir": "/checkpoint/guw/cc_clean2/",
3 | "dump": "2019-09",
4 | "num_shards": 1600,
5 | "pipeline": [
6 | "lid_before_dedup",
7 | "dedup",
8 | "lid_after_dedup"
9 | ],
10 | "hash_in_mem": 50,
11 | "execution": "slurm"
12 | }
13 |
--------------------------------------------------------------------------------
/kenlm_training/config/mine_segment.json:
--------------------------------------------------------------------------------
1 | {
2 | "dump": "2019-09",
3 | "mined_dir": "mini_by_segment",
4 | "pipeline": [
5 | "dedup",
6 | "lid",
7 | "keep_lang",
8 | "sp",
9 | "lm",
10 | "pp_bucket",
11 | "minify",
12 | "split_by_segment"
13 | ],
14 | "execution": "slurm"
15 | }
16 |
--------------------------------------------------------------------------------
/kenlm_training/config/test_reproduce.json:
--------------------------------------------------------------------------------
1 | {
2 | "hash_in_mem": 2,
3 | "dump": "2019-09",
4 | "num_shards": 4,
5 | "num_segments_per_shard": 1,
6 | "pipeline": [
7 | "fetch_metadata",
8 | "split_by_lang"
9 | ],
10 | "metadata": "test_data2/mined_by_segment",
11 | "execution": "debug",
12 | "output_dir": "test_data2",
13 | "mined_dir": "reproduce",
14 | "target_size": "32M",
15 | "cache_dir": "test_data/wet_cache"
16 | }
17 |
--------------------------------------------------------------------------------
/kenlm_training/config/test_segment.json:
--------------------------------------------------------------------------------
1 | {
2 | "hash_in_mem": 2,
3 | "dump": "2019-09",
4 | "num_shards": 4,
5 | "num_segments_per_shard": 1,
6 | "mine_num_processes": 0,
7 | "lang_whitelist": ["de", "it", "fr"],
8 | "pipeline": [
9 | "dedup",
10 | "lid",
11 | "keep_lang",
12 | "sp",
13 | "lm",
14 | "pp_bucket",
15 | "minify",
16 | "split_by_segment"
17 | ],
18 | "execution": "debug",
19 | "output_dir": "test_data2",
20 | "mined_dir": "mined_by_segment",
21 | "target_size": "32M",
22 | "cache_dir": "test_data/wet_cache"
23 | }
24 |
--------------------------------------------------------------------------------
/kenlm_training/pyproject.toml:
--------------------------------------------------------------------------------
1 | [pytest]
2 | testpaths = "tests"
3 |
4 | [tool.black]
5 | line-length = 88
6 | target_version = ["py37"]
7 |
8 | [tool.isort]
9 | multi_line_output = 3
10 | include_trailing_comma = true
11 | force_grid_wrap = 0
12 | use_parentheses = true
13 | line_length = 88
14 | known_third_party = ["func_argparse"]
15 | skip = ["third_party", "data"]
16 |
17 | [mypy]
18 | python_version = 3.7
19 | check_untyped_defs = true
20 |
21 | [mypy-numpy]
22 | ignore_missing_imports = true
23 | [mypy-pytest]
24 | ignore_missing_imports = true
25 |
--------------------------------------------------------------------------------
/kenlm_training/setup.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # All rights reserved.
3 | # This source code is licensed under the license found in the
4 | # LICENSE file in the root directory of this source tree.
5 |
6 | from pathlib import Path
7 |
8 | from setuptools import setup # type: ignore
9 |
10 | setup(
11 | name="cc_net",
12 | version="1.0.0",
13 | packages=["cc_net"],
14 | # metadata to display on PyPI
15 | author="Guillaume Wenzek",
16 | author_email="guw@fb.com",
17 | description="Tools to download and clean Common Crawl",
18 | keywords="common crawl dataset",
19 | url="https://github.com/facebookresearch/cc_net",
20 | license="CC-BY-NC-4.0",
21 | long_description=Path("README.md").read_text(),
22 | long_description_content_type="text/markdown",
23 | project_urls={
24 | "Bug Tracker": "https://github.com/facebookresearch/cc_net/issues",
25 | "Source Code": "https://github.com/facebookresearch/cc_net",
26 | },
27 | classifiers=[
28 | "Development Status :: 4 - Beta",
29 | "Programming Language :: Python :: 3.7",
30 | ],
31 | python_requires=">=3.7",
32 | install_requires=[
33 | "beautifulsoup4>=4.7.1",
34 | "pandas>=0.23.4",
35 | "requests>=2.22.0",
36 | "fasttext>=0.9.1",
37 | "sentencepiece>=0.1.82",
38 | "kenlm @ git+https://github.com/kpu/kenlm.git@master",
39 | "func_argparse>=1.1.1",
40 | "psutil>=5.6.3",
41 | "sacremoses",
42 | "submitit>=1.0.0",
43 | "typing_extensions",
44 | "datasets==1.16.1",
45 | ],
46 | extras_require={
47 | "dev": ["mypy==0.790", "pytest", "black==19.3b0", "isort==5.6.4"],
48 | # To use scripts inside cc_net/tools
49 | "tools": ["lxml", "sentence_splitter"],
50 | # Memory-efficient hashset.
51 | # This fork only compiles the kind of dict used by cc_net.
52 | # Full version is at https://github.com/atom-moyer/getpy
53 | "getpy": ["getpy @ git+https://github.com/gwenzek/getpy.git@v0.9.10-subset"],
54 | },
55 | package_data={"cc_net": ["data/*"]},
56 | )
57 |
--------------------------------------------------------------------------------
/kenlm_training/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | #
3 | # This source code is licensed under the MIT license found in the
4 | # LICENSE file in the root directory of this source tree.
5 | #
6 | #
7 |
--------------------------------------------------------------------------------
/kenlm_training/tests/conftest.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | #
3 | # This source code is licensed under the MIT license found in the
4 | # LICENSE file in the root directory of this source tree.
5 | #
6 |
7 | import pytest
8 |
9 |
10 | def _request_is_disabled(self, *args, **kwargs):
11 | raise Exception(
12 |         f"Your code tried to call 'request' with: {args}, {kwargs}. Unit tests aren't allowed to reach the internet."
13 | )
14 |
15 |
16 | @pytest.fixture(autouse=True)
17 | def no_requests(monkeypatch):
18 | """Remove requests.sessions.Session.request for all tests."""
19 | monkeypatch.setattr("requests.sessions.Session.request", _request_is_disabled)
20 |
--------------------------------------------------------------------------------
/kenlm_training/tests/test_normalizer.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | #
3 | # This source code is licensed under the MIT license found in the
4 | # LICENSE file in the root directory of this source tree.
5 | #
6 |
7 | import cc_net.text_normalizer as txt
8 |
9 |
10 | def test_unicode_punct():
11 | weird = ",。、„”“«»1」「《》´∶:?!();–—.~’…━〈〉【】%"
12 | replaced = ',.,""""""""""\'::?!();- - . ~\'...-<>[]%'
13 | assert txt.replace_unicode_punct(weird) == replaced
14 |
15 | assert txt.remove_unicode_punct(weird) == ""
16 |
17 |
18 | def test_numbers():
19 | weird = "023456789 | 0123456789"
20 | normalized = "000000000 | 0000000000"
21 | assert txt.normalize(weird, numbers=True) == normalized
22 | assert txt.normalize(weird, numbers=False) == weird
23 |
24 |
25 | def test_normalize_for_dedup():
26 | weird = "023´∶:\x10 | ;012 hèllo"
27 | normalized = "000 | ;000 hèllo"
28 | assert normalized == txt.slow_normalize_for_dedup(weird)
29 | assert normalized == txt.normalize_for_dedup(weird)
30 |
--------------------------------------------------------------------------------
/kenlm_training/tests/test_parse_wet_file.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | #
3 | # This source code is licensed under the MIT license found in the
4 | # LICENSE file in the root directory of this source tree.
5 | #
6 |
7 | from pathlib import Path
8 |
9 | from cc_net import process_wet_file
10 |
11 |
12 | def test_parsing():
13 | sample = Path(__file__).parent / "data" / "sample.warc.txt"
14 | with open(sample) as f:
15 | documents = list(process_wet_file.parse_warc_file(f))
16 |
17 | expected_urls = [
18 | "http://sample_english.com",
19 | "http://sample_chinese.zh",
20 | "http://sample_russian.ru",
21 | ]
22 | assert expected_urls == [d["url"] for d in documents]
23 | expected_domains = ["sample_english.com", "sample_chinese.zh", "sample_russian.ru"]
24 | assert expected_domains == [d["source_domain"] for d in documents]
25 |
26 | expected_date = [
27 | "2019-03-18T00:00:00Z",
28 | "2019-03-18T00:00:01Z",
29 | "2019-03-18T00:00:02Z",
30 | ]
31 | assert expected_date == [d["date_download"] for d in documents]
32 |
33 | expected_title = [
34 | "Famous Mark Twain Quotes",
35 | "馬克·吐溫名言",
36 | "Цитаты знаменитого Марка Твена",
37 | ]
38 | assert expected_title == [d["title"] for d in documents]
39 |
40 | expected_quotes = """Don't part with your illusions. When they are gone you may still exist, but you have ceased to live.
41 | Education: that which reveals to the wise, and conceals from the stupid, the vast limits of their knowledge.
42 |
43 | Facts are stubborn things, but statistics are more pliable.
44 | Fiction is obliged to stick to possibilities. Truth isn't.
45 | """
46 |
47 | assert expected_quotes == documents[0]["raw_content"]
48 |
--------------------------------------------------------------------------------
/kenlm_training/tests/test_regroup.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | #
3 | # This source code is licensed under the MIT license found in the
4 | # LICENSE file in the root directory of this source tree.
5 | #
6 |
7 | import time
8 |
9 | from cc_net import jsonql, regroup
10 |
11 |
12 | def check_regroup(tmp_path, regroup_fn, check_blocks_boundaries=False):
13 | n_shards = 4
14 | n_docs = 20
15 | shards = [
16 | [dict(id=i, shard=s, raw_content="hello world") for i in range(n_docs)]
17 | for s in range(n_shards)
18 | ]
19 | shards_files = [tmp_path / f"{s:04d}.json.gz" for s in range(n_shards)]
20 | for shard, shard_file in zip(shards, shards_files):
21 | jsonql.run_pipes(inputs=shard, output=shard_file)
22 | regroup_file = tmp_path / "regroup.json.gz"
23 | start = time.time()
24 | regroup_fn(shards_files, regroup_file)
25 | duration = time.time() - start
26 | print(f"{regroup_fn.__module__}.{regroup_fn.__name__} took {duration}s")
27 |
28 | regrouped = list(jsonql.read_jsons(regroup_file))
29 | assert [doc for shard in shards for doc in shard] == regrouped
30 |
31 | readers = jsonql.get_block_readers(regroup_file, n_shards)
32 | if not check_blocks_boundaries:
33 | assert [doc for shard in shards for doc in shard] == [
34 | doc for reader in readers for doc in jsonql.read_jsons(reader)
35 | ]
36 | return
37 |
38 | for shard, reader in zip(shards, readers):
39 | block = [doc for doc in jsonql.read_jsons(reader)]
40 | assert shard == block
41 |
42 |
43 | def test_regroup(tmp_path):
44 | # With regroup boundaries will be every 256Mb.
45 | check_regroup(tmp_path, regroup.reshard, check_blocks_boundaries=False)
46 |
47 |
48 | def test_fast_regroup(tmp_path):
49 | # With fast regroup boundaries should match the shards.
50 | check_regroup(tmp_path, regroup.fast_reshard, check_blocks_boundaries=True)
51 |
--------------------------------------------------------------------------------
/perplexity_lenses/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Perplexity Lenses
3 | emoji: 🌸
4 | colorFrom: pink
5 | colorTo: blue
6 | sdk: streamlit
7 | app_file: app.py
8 | pinned: false
9 | ---
10 |
11 | # Installation:
12 | Requires Python >= 3.7 and < 3.10
13 | ```
14 | pip install .
15 | ```
16 | Or with [poetry](https://python-poetry.org/)
17 | ```
18 | poetry install
19 | ```
20 |
21 | # Web App:
22 | The app is hosted [here](https://huggingface.co/spaces/edugp/perplexity-lenses). To run it locally:
23 | ```
24 | python -m streamlit run app.py
25 | ```
26 |
27 | # CLI:
28 | The CLI with no arguments defaults to running mc4 in Spanish.
29 | For full usage:
30 | ```
31 | python cli.py --help
32 | ```
33 | Example: running on 1000 sentences extracted from Spanish OSCAR docs, specifying all arguments:
34 | ```
35 | python cli.py \
36 | --dataset oscar \
37 | --dataset-config unshuffled_deduplicated_es \
38 | --dataset-split train \
39 | --text-column text \
40 | --language es \
41 | --doc-type sentence \
42 | --sample 1000 \
43 | --dimensionality-reduction umap \
44 | --model-name distiluse-base-multilingual-cased-v1 \
45 | --output-file perplexity.html
46 | ```
47 | # Tests:
48 | ```
49 | python -m unittest discover -s ./tests/ -p "test_*.py"
50 | ```
51 |
--------------------------------------------------------------------------------
/perplexity_lenses/perplexity_lenses/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.1.0"
2 | REGISTRY_DATASET = "mhtoin/register_oscar"
3 |
--------------------------------------------------------------------------------
/perplexity_lenses/perplexity_lenses/visualization.py:
--------------------------------------------------------------------------------
1 | import matplotlib.figure
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 |
5 |
6 | def draw_histogram(
7 | values: np.ndarray,
8 | cutoff_x_axis: float = 2000.0,
9 | title: str = "Perplexity histogram",
10 | xlabel: str = "Perplexity",
11 | ) -> matplotlib.figure.Figure:
12 | hist_values = values[values < cutoff_x_axis]
13 | fig, ax = plt.subplots(figsize=(12, 9))
14 | ax.hist(hist_values, bins=50)
15 | ax.set_title(title)
16 | ax.set_xlabel(xlabel)
17 | ax.set_ylabel("Counts")
18 | return fig
19 |
--------------------------------------------------------------------------------
/perplexity_lenses/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "perplexity-lenses"
3 | version = "0.1.0"
4 | description = ""
5 | authors = ["edugp "]
6 |
7 | [tool.poetry.dependencies]
8 | python = ">=3.7,<3.10"
9 | huggingface-hub = "0.0.19"
10 | streamlit = "1.1.0"
11 | transformers = "4.11.3"
12 | watchdog = "2.1.3"
13 | sentence-transformers = "2.0.0"
14 | bokeh = "2.2.2"
15 | numpy = "1.20.0"
16 | numba = "^0.54.1"
17 | umap-learn = "^0.5.2"
18 | datasets = "1.14.0"
19 | black = "^21.10b0"
20 | flake8 = "^4.0.1"
21 | scikit-learn = "0.24.2"
22 | kenlm = {url = "https://github.com/kpu/kenlm/archive/master.zip"}
23 | embedding-lenses = "0.9.0"
24 | typer = "^0.4.0"
25 |
26 | [tool.poetry.dev-dependencies]
27 | pytest = "^5.2"
28 |
29 | [tool.poetry.scripts]
30 | cli = "cli:app"
31 |
32 | [tool.black]
33 | target-version = ["py38"]
34 |
35 | [tool.isort]
36 | profile = "black"
37 | line_length = 160
38 | multi_line_output = 3
39 | include_trailing_comma = true
40 |
41 | [build-system]
42 | requires = ["poetry-core>=1.0.0"]
43 | build-backend = "poetry.core.masonry.api"
44 |
--------------------------------------------------------------------------------
/perplexity_lenses/requirements.txt:
--------------------------------------------------------------------------------
1 | bokeh==2.2.2
2 | embedding-lenses==0.9.0
3 | https://github.com/kpu/kenlm/archive/master.zip
4 | huggingface-hub==0.0.19
5 | numpy==1.20.0
6 | sentence-transformers==2.0.0
7 | streamlit==1.1.0
8 | transformers==4.11.3
9 | typer==0.4.0
10 | umap-learn==0.5.2
11 | watchdog==2.1.3
12 |
--------------------------------------------------------------------------------
/perplexity_lenses/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/perplexity_lenses/tests/__init__.py
--------------------------------------------------------------------------------
/perplexity_lenses/tests/test_data.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import pandas as pd
4 |
5 | from perplexity_lenses.data import documents_df_to_sentences_df
6 |
7 |
8 | class TestData(unittest.TestCase):
9 | def test_documents_df_to_sentences_df(self):
10 | input_df = pd.DataFrame({"text": ["foo\nbar"]})
11 | expected_output_df = pd.DataFrame({"text": ["foo", "bar"]})
12 | output_df = documents_df_to_sentences_df(input_df, "text", 100)
13 | pd.testing.assert_frame_equal(
14 | output_df, expected_output_df, check_like=True, check_exact=True
15 | )
16 |
--------------------------------------------------------------------------------
/pii-manager/CHANGES.md:
--------------------------------------------------------------------------------
1 | v. 0.5.0
2 | * new task list parsing code, adding a "full" format based on dicts, in
3 | addition to the previous "simplified" format based on tuples
4 | * refactored to allow more than one task for a given PII and country
5 | * added the capability to add task descriptors programmatically
6 | * added reading task descriptors from a JSON file
7 | * context validation spec, for all three task implementation types
8 | * TASK_ANY split into LANG_ANY & COUNTRY_ANY
9 | * PII detectors for international phone numbers, for en-any & es-any
10 | * PII detector for IP addresses, language independent
11 | * PII detectors for GOV_ID
12 | - lang pt, countries PT & BR
13 | - lang es, country MX
14 |
15 | v. 0.4.0
16 | * PII GOV_ID task for es-ES and en-AU
17 | * PII EMAIL_ADDRESS task
18 | * PyPi Makefile targets; fixed setup.py
19 |
20 | v. 0.3.0
21 | * new processing mode: `full`
22 | * PII detectors for zh-CN
23 | * added `regex` as dependency
24 | * `regex` used for regular expression tasks instead of `re`
25 |
26 | v. 0.2.0
27 | * Added PII tasks:
28 | - en: GOV_ID for US, CA, IN
29 | - fr: GOV_ID for CA
30 | * fix paths for languages/countries that are reserved Python words (is, in)
31 | * added country information to PiiEntity
32 | * added an _asdict() function for PiiEntities
33 | * added PII country to task_info
34 | * miscellaneous fixes
35 |
--------------------------------------------------------------------------------
/pii-manager/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
2 |
--------------------------------------------------------------------------------
/pii-manager/README.md:
--------------------------------------------------------------------------------
1 | # Pii Manager
2 |
3 | This repository builds a Python package that performs PII processing for text
4 | data, i.e. replacement/tagging/extraction of PII (Personally Identifiable
5 | Information, aka [Personal Data]) items existing in the text.
6 |
7 | The PII Tasks in the package are structured by language & country, since many
8 | of the PII elements are language- and/or -country dependent.
9 |
10 | ## Requirements
11 |
12 | The package needs at least Python 3.8, and uses the [python-stdnum] package to
13 | validate identifiers.
14 |
15 | ## Usage
16 |
17 | The package can be used:
18 | * As an API, in two flavors: function-based API and object-based API
19 | * As a command-line tool
20 |
21 | For details, see the [usage document].
22 |
23 |
24 | ## Building
25 |
26 | The provided [Makefile] can be used to process the package:
27 | * `make pkg` will build the Python package, creating a file that can be
28 | installed with `pip`
29 | * `make unit` will launch all unit tests (using [pytest], so pytest must be
30 | available)
31 | * `make install` will install the package in a Python virtualenv. The
32 |     virtualenv will be chosen, in this order, as:
33 | - the one defined in the `VENV` environment variable, if it is defined
34 | - if there is a virtualenv activated in the shell, it will be used
35 | - otherwise, a default is chosen as `/opt/venv/bigscience` (it will be
36 | created if it does not exist)
37 |
38 |
39 | ## Contributing
40 |
41 | To add a new PII processing task, please see the [contributing instructions].
42 |
43 |
44 | [python-stdnum]: https://github.com/arthurdejong/python-stdnum
45 | [Makefile]: Makefile
46 | [pytest]: https://docs.pytest.org
47 | [contributing instructions]: doc/contributing.md
48 | [usage document]: doc/usage.md
49 | [Personal Data]: https://en.wikipedia.org/wiki/Personal_data
50 |
--------------------------------------------------------------------------------
/pii-manager/doc/external.md:
--------------------------------------------------------------------------------
1 | # Adding external task processors to a processing object
2 |
3 | In addition to the task processors contained inside the [lang] subfolders in
4 | the package, it is also possible to add _external_ task processors defined
5 | outside the package, as long as they comply with the [task specification].
6 | This can be done for both the object-based API and the file-based API.
7 |
8 |
9 | ## Object-based API
10 |
11 | An instantiated `PiiManager` object contains the `add_tasks` method. This
12 | method will accept a list of [task descriptors] with the same syntax as the
13 | internal `PII_TASKS` descriptors, and will add the tasks defined in them to
14 | the existing ones in the object.
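For instance, a minimal sketch (modeled on the package's unit tests) that adds an external regex task to a `PiiManager` object working in extract mode:

```python
from pii_manager import PiiEnum
from pii_manager.api import PiiManager

# An external task descriptor, using the same fields as the internal PII_TASKS entries
TOY_TASK = {
    "pii": PiiEnum.GOV_ID,
    "type": "regex",
    "task": r"\b\d{4}-\w\b",
    "lang": "en",
    "name": "toy government id",
    "doc": "a toy example that matches a made-up id format",
}

proc = PiiManager("en", mode="extract")
proc.add_tasks([TOY_TASK])
print(list(proc("my id is 3451-K")))
```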
15 |
16 |
17 | ## File-based API
18 |
19 | The file-based `process_file` function accepts a `taskfile` argument. This
20 | argument will contain the name of a JSON file that contains an array of task
21 | descriptors. Each task descriptor in the array is a JSON object following the
22 | specification for [task descriptors], with these differences:
23 |
24 | * The `pii` field is not a `PiiEnum` object, but a string with the _name_ of
25 | a `PiiEnum` object. It will be converted to the object itself.
26 | * The `task` field contains:
27 | - for `regex` types, the string with the regular expression pattern to be
28 | compiled (beware of escaping all backslashes in the string)
29 | - for `callable` and `PiiTask` types, a string with the **fully
30 | qualified** name of the function to be used or class to be instantiated.
31 | As long as that name can be located in the running Python space (i.e.
32 | it is in the load path), it will be imported and used.
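As an illustration, such a task file can be generated from Python (a sketch; the descriptors mirror the ones used in the package's unit tests, and the file name is arbitrary):

```python
import json

# Two external task descriptors, as they would appear in the JSON task file
external_tasks = [
    {
        "pii": "IP_ADDRESS",     # the *name* of a PiiEnum member
        "lang": "any",
        "type": "regex",
        # json.dump takes care of escaping the backslashes mentioned above
        "task": r"\b (?: \d{1,3} \. ){3} \d{1,3} \b",
        "doc": "naive ip address detection via regex",
    },
    {
        "pii": "BITCOIN_ADDRESS",
        "lang": "any",
        "type": "callable",
        "task": "pii_manager.lang.any.bitcoin_address.bitcoin_address",
        "doc": "bitcoin address detection",
    },
]

with open("my-taskfile.json", "w", encoding="utf-8") as f:
    json.dump(external_tasks, f, indent=2)
```

The resulting file can then be passed as the `taskfile` argument of `process_file`.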
33 |
34 |
35 | [lang]: ../src/pii_manager/lang
36 | [task specification]: tasks.md
37 | [task descriptors]: contributing.md#task-descriptor
38 |
--------------------------------------------------------------------------------
/pii-manager/requirements.txt:
--------------------------------------------------------------------------------
1 | python-stdnum >=1.17,<2.0
2 | regex >= 2021.11.10
3 |
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/__init__.py:
--------------------------------------------------------------------------------
1 | VERSION = "0.5.0"
2 |
3 | from .piienum import PiiEnum
4 | from .piientity import PiiEntity
5 |
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/api/__init__.py:
--------------------------------------------------------------------------------
1 | from .manager import PiiManager
2 | from .file import process_file
3 |
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/app/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/app/__init__.py
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/helper/__init__.py:
--------------------------------------------------------------------------------
1 | from .taskdict import get_taskdict, country_list
2 | from .base import BasePiiTask
3 |
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/helper/exception.py:
--------------------------------------------------------------------------------
1 | class PiiManagerException(Exception):
2 | def __init__(self, msg, *args):
3 | super().__init__(msg.format(*args))
4 |
5 |
6 | class InvArgException(PiiManagerException):
7 | pass
8 |
9 |
10 | class PiiUnimplemented(PiiManagerException):
11 | pass
12 |
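A usage sketch for these exceptions: the base class formats its message with the extra positional arguments, so they can be raised and reported like this:

```python
from pii_manager.helper.exception import InvArgException

try:
    # the message is passed through str.format() with the extra arguments
    raise InvArgException("unsupported language: {}", "tlh")
except InvArgException as e:
    print(e)   # prints: unsupported language: tlh
```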
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/helper/json.py:
--------------------------------------------------------------------------------
1 | """
2 | Provide a custom JSON encoder that can serialize additional objects,
3 | in particular PiiEntity objects
4 | """
5 |
6 |
7 | from collections.abc import Iterator
8 | import datetime
9 | import json
10 |
11 |
12 | def keygetter_set(v):
13 | return str(v).lower()
14 |
15 |
16 | class CustomJSONEncoder(json.JSONEncoder):
17 | """
18 | A custom JSON encoder that can serialize additional objects:
19 | - datetime objects (into ISO 8601 strings)
20 | - sets (as sorted lists)
21 | - iterators (as lists)
22 | - any object having a to_json() method that produces a string or
23 | a serializable object
24 |
25 | Non-serializable objects are converted to plain strings.
26 | """
27 |
28 | def default(self, obj):
29 | """
30 | Serialize some special types
31 | """
32 | if hasattr(obj, "to_json"):
33 | return obj.to_json()
34 | elif isinstance(obj, datetime.datetime):
35 | t = obj.strftime("%Y-%m-%dT%H:%M:%S.%f%z")
36 | if obj.tzinfo is not None:
37 | t = t[:-2] + ":" + t[-2:]
38 | return t
39 | elif isinstance(obj, set):
40 | return sorted(obj, key=keygetter_set)
41 | elif isinstance(obj, Iterator):
42 | return list(obj)
43 |
44 | try:
45 | return super().default(obj)
46 | except TypeError:
47 | return str(obj)
48 |
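A usage sketch for this encoder, assuming a list of `PiiEntity` objects (e.g. as produced by a `PiiManager` in extract mode):

```python
import json

from pii_manager import PiiEnum, PiiEntity
from pii_manager.helper.json import CustomJSONEncoder

entities = [PiiEntity(PiiEnum.CREDIT_CARD, 25, "4273 9666 4581 5642", name="credit card")]

# PiiEntity objects are serialized through their to_json() method
print(json.dumps(entities, cls=CustomJSONEncoder, ensure_ascii=False, indent=2))
```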
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/helper/normalizer.py:
--------------------------------------------------------------------------------
1 | def normalize(
2 | text: str, lang: str, whitespace: bool = True, lowercase: bool = False
3 | ) -> str:
4 | """
5 | Perform some normalization steps on a text string
6 | """
7 | if whitespace:
8 | text = " ".join(text.split())
9 |
10 | if lowercase:
11 | text = text.lower()
12 |
13 | return text
14 |
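For example, mirroring the unit test for this helper:

```python
from pii_manager.helper.normalizer import normalize

text = "the Social Security\nNumber is 34512"
# collapse whitespace and lowercase the text
print(normalize(text, "en", whitespace=True, lowercase=True))
# -> "the social security number is 34512"
```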
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/helper/types.py:
--------------------------------------------------------------------------------
1 | from typing import Union, List
2 |
3 | TYPE_STR_LIST = Union[str, List[str]]
4 |
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/__init__.py:
--------------------------------------------------------------------------------
1 | # Folder for language-independent tasks
2 | LANG_ANY = "any"
3 |
4 | # Country-independent tasks
5 | COUNTRY_ANY = "any"
6 |
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/any/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/any/__init__.py
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/any/bitcoin_address.py:
--------------------------------------------------------------------------------
1 | """
2 | Find valid bitcoin addresses
3 | 1. Obtain candidates by using a generic regular expression
4 | 2. Validate candidates with the python-stdnum `bitcoin` module, which
5 | checks the address checksum (Base58Check for legacy addresses,
6 | Bech32 for bc1 addresses)
7 | """
8 |
9 | import re
10 |
11 | from typing import Iterable
12 |
13 | from stdnum import bitcoin
14 |
15 | from pii_manager import PiiEnum
16 |
17 | # ----------------------------------------------------------------------------
18 |
19 | # regex for the three types of bitcoin addresses
20 | _BITCOIN_PATTERN = (
21 | r"( [13] ["
22 | + bitcoin._base58_alphabet
23 | + "]{25,34}"
24 | + "| bc1 ["
25 | + bitcoin._bech32_alphabet
26 | + "]{8,87})"
27 | )
28 |
29 | _REGEX_BITCOIN = re.compile(_BITCOIN_PATTERN, flags=re.X)
30 |
31 |
32 | def bitcoin_address(text: str) -> Iterable[str]:
33 | """
34 | Bitcoin addresses (P2PKH, P2SH and Bech32), recognize & validate
35 | """
36 | # Find and validate candidates
37 | for ba in _REGEX_BITCOIN.findall(text):
38 | if bitcoin.is_valid(ba):
39 | yield ba
40 |
41 |
42 | # ---------------------------------------------------------------------
43 |
44 | PII_TASKS = [(PiiEnum.BITCOIN_ADDRESS, bitcoin_address)]
45 |
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/any/email.py:
--------------------------------------------------------------------------------
1 | """
2 | Detection of email addresses
3 | """
4 |
5 | from pii_manager import PiiEnum
6 |
7 |
8 | _EMAIL_PATTERN = r"[\w\.=-]+ @ [\w\.-]+ \. [\w]{2,3}"
9 |
10 |
11 | PII_TASKS = [(PiiEnum.EMAIL_ADDRESS, _EMAIL_PATTERN, "Email address")]
12 |
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/any/ip_address.py:
--------------------------------------------------------------------------------
1 | """
2 | Detection of IP addresses
3 | """
4 |
5 | from pii_manager import PiiEnum
6 |
7 |
8 | _IP_PATTERN = r"""
9 | \b
10 | (?: (?: 25[0-5] | 2[0-4][0-9] | [01]?[0-9][0-9]? ) \. ){3}
11 | (?: 25[0-5] | 2[0-4][0-9] | [01]?[0-9][0-9]?)
12 | \b
13 | """
14 |
15 |
16 | PII_TASKS = [
17 | {
18 | "pii": PiiEnum.IP_ADDRESS,
19 | "type": "regex",
20 | "task": _IP_PATTERN,
21 | "name": "ip address",
22 | "doc": "match IP addresses, with context",
23 | "context": {"value": "ip", "type": "word", "width": 16},
24 | }
25 | ]
26 |
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/en/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/en/__init__.py
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/en/any/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/en/any/__init__.py
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/en/any/international_phone_number.py:
--------------------------------------------------------------------------------
1 | """
2 | Detection of phone numbers written with international notation (i.e. with
3 | prefix and country code)
4 | """
5 |
6 |
7 | from pii_manager import PiiEnum
8 |
9 | PATTERN_INT_PHONE = r"""
10 | (?:\+ | 00)
11 | (?: 9[976]\d | 8[987530]\d | 6[987]\d | 5[90]\d | 42\d |
12 | 3[875]\d | 2[98654321]\d | 9[8543210] | 8[6421] |
13 | 6[6543210] | 5[87654321] | 4[987654310] | 3[9643210] |
14 | 2[70] | 7 | 1)
15 | [-\x20\.]?
16 | (?: \d{2,3} [-\x20]? ){3,4}
17 | """
18 |
19 | PII_TASKS = [
20 | {
21 | "pii": PiiEnum.PHONE_NUMBER,
22 | "type": "regex",
23 | "task": PATTERN_INT_PHONE,
24 | "name": "international phone number",
25 | "doc": "detect phone numbers that use international notation. Uses context",
26 | "context": {"value": ["ph", "phone", "fax"], "width": [16, 0], "type": "word"},
27 | }
28 | ]
29 |
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/en/au/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/en/au/__init__.py
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/en/au/abn.py:
--------------------------------------------------------------------------------
1 | """
2 | Detection and validation of Australian business number (ABN).
3 |
4 | """
5 | import re
6 |
7 | from stdnum.au import abn
8 |
9 | from typing import Iterable
10 |
11 | from pii_manager import PiiEnum
12 |
13 |
14 | _ABN_PATTERN = r"\b (?: \d{2} \s \d{3} \s \d{3} \s \d{3} | \d{11} ) \b"
15 | _ABN_REGEX = re.compile(_ABN_PATTERN, flags=re.X)
16 |
17 |
18 | def australian_business_number(doc: str) -> Iterable[str]:
19 | """
20 | Australian Business Number (detect and validate)
21 | """
22 | for candidate in _ABN_REGEX.findall(doc):
23 | if abn.is_valid(candidate):
24 | yield candidate
25 |
26 |
27 | PII_TASKS = [(PiiEnum.GOV_ID, australian_business_number)]
28 |
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/en/au/tfn.py:
--------------------------------------------------------------------------------
1 | """
2 | Detection and validation of Australian Tax File Number (TFN).
3 |
4 | """
5 | import re
6 |
7 | from stdnum.au import tfn
8 |
9 | from typing import Iterable
10 |
11 | from pii_manager import PiiEnum
12 |
13 |
14 | _TFN_PATTERN = r"\b (?: \d{3} \s \d{3} \s \d{3} | \d{8,9} ) \b"
15 | _TFN_REGEX = re.compile(_TFN_PATTERN, flags=re.X)
16 |
17 |
18 | def tax_file_number(doc: str) -> Iterable[str]:
19 | """
20 | Australian Tax File Number (detect and validate)
21 | """
22 | for candidate in _TFN_REGEX.findall(doc):
23 | if tfn.is_valid(candidate):
24 | yield candidate
25 |
26 |
27 | PII_TASKS = [(PiiEnum.GOV_ID, tax_file_number)]
28 |
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/en/ca/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/en/ca/__init__.py
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/en/ca/social_insurance_number.py:
--------------------------------------------------------------------------------
1 | """
2 | Detection and validation of Canadian Social Insurance Number
3 |
4 | Since it contains a check digit, it can be validated.
5 | """
6 |
7 | import re
8 |
9 | from stdnum.ca import sin
10 |
11 | from typing import Iterable
12 |
13 | from pii_manager import PiiEnum
14 |
15 |
16 | _SIN_REGEX = re.compile(r"\d{3}[-\ ]\d{3}[-\ ]\d{3}", flags=re.X)
17 |
18 |
19 | def social_insurance_number(doc: str) -> Iterable[str]:
20 | """
21 | Canadian Social Insurance Number (detect and validate)
22 | """
23 | for candidate in _SIN_REGEX.findall(doc):
24 | if sin.is_valid(candidate):
25 | yield candidate
26 |
27 |
28 | PII_TASKS = [(PiiEnum.GOV_ID, social_insurance_number)]
29 |
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/en/in_/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/en/in_/__init__.py
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/en/in_/aadhaar.py:
--------------------------------------------------------------------------------
1 | """
2 | Detection and validation of Indian Aadhaar identity number
3 |
4 | Since it contains a check digit, it can be validated.
5 | """
6 |
7 | import re
8 |
9 | from stdnum.in_ import aadhaar
10 |
11 | from typing import Iterable
12 |
13 | from pii_manager import PiiEnum
14 |
15 |
16 | _AADHAAR_REGEX = re.compile(r"[2-9]\d{3}\ ?\d{4}\ ?\d{4}", flags=re.X)
17 |
18 |
19 | def aadhaar_number(doc: str) -> Iterable[str]:
20 | """
21 | Aadhaar identity number from India (detect and validate)
22 | """
23 | for candidate in _AADHAAR_REGEX.findall(doc):
24 | if aadhaar.is_valid(candidate):
25 | yield candidate
26 |
27 |
28 | PII_TASKS = [(PiiEnum.GOV_ID, aadhaar_number)]
29 |
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/en/us/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/en/us/__init__.py
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/en/us/social_security_number.py:
--------------------------------------------------------------------------------
1 | """
2 | Detection of U.S. Social Security Number.
3 |
4 | We just match the number pattern; it cannot be
5 | validated on its own, since it does not carry a checksum
6 | """
7 |
8 | from pii_manager import PiiEnum
9 |
10 |
11 | _SSN_PATTERN = r"(?!000|666|333)0*(?:[0-6][0-9][0-9]|[0-7][0-6][0-9]|[0-7][0-7][0-2])[-\ ](?!00)[0-9]{2}[-\ ](?!0000)[0-9]{4}"
12 |
13 |
14 | PII_TASKS = [
15 | (PiiEnum.GOV_ID, _SSN_PATTERN, "U.S. Social Security Number (detect only)")
16 | ]
17 |
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/es/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/es/__init__.py
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/es/any/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/es/any/__init__.py
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/es/any/international_phone_number.py:
--------------------------------------------------------------------------------
1 | """
2 | Detection of phone numbers written with international notation (i.e. with
3 | prefix and country code), for ES
4 | """
5 |
6 |
7 | from pii_manager import PiiEnum
8 |
9 | # The pattern for the regex is the same as for English
10 | from ...en.any.international_phone_number import PATTERN_INT_PHONE
11 |
12 |
13 | PII_TASKS = [
14 | {
15 | "pii": PiiEnum.PHONE_NUMBER,
16 | "type": "regex",
17 | "task": PATTERN_INT_PHONE,
18 | "name": "international phone number",
19 | "doc": "detect phone numbers that use international notation. Uses language context",
20 | "context": {
21 | "value": ["tf", "teléfono", "telefono"],
22 | "width": [16, 0],
23 | "type": "word",
24 | },
25 | }
26 | ]
27 |
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/es/es/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/es/es/__init__.py
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/es/es/bank_account.py:
--------------------------------------------------------------------------------
1 | """
2 | Spanish bank account numbers (CCC - código cuenta cliente)
3 |
4 | Note: **NOT** IBAN numbers, those are country (& language) independent
5 | """
6 |
7 | import re
8 |
9 | from typing import Iterable
10 |
11 | from stdnum.es import ccc
12 |
13 | from pii_manager import PiiEnum
14 |
15 | # ----------------------------------------------------------------------------
16 |
17 | # regex for a Código Cuenta Cliente, with optional spaces separating the pieces
18 | _CCC_PATTERN = r"\d{4}\s?\d{4}\s?\d{2}\s?\d{10}"
19 |
20 | # compiled regex
21 | _REGEX_CCC = None
22 |
23 |
24 | def spanish_bank_ccc(text: str) -> Iterable[str]:
25 | """
26 | Spanish Bank Accounts (código cuenta cliente, 20-digit code, pre-IBAN), recognize & validate
27 | """
28 | # Compile regex if needed
29 | global _REGEX_CCC
30 | if _REGEX_CCC is None:
31 | _REGEX_CCC = re.compile(_CCC_PATTERN, flags=re.X)
32 | # Find all CCCs
33 | for item in _REGEX_CCC.findall(text):
34 | if ccc.is_valid(item):
35 | yield item
36 |
37 |
38 | # ---------------------------------------------------------------------
39 |
40 | PII_TASKS = [(PiiEnum.BANK_ACCOUNT, spanish_bank_ccc)]
41 |
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/es/es/govid.py:
--------------------------------------------------------------------------------
1 | """
2 | Spanish Government-issued IDs:
3 | - DNI (Documento Nacional de Identidad)
4 | - NIE (Número de Identificación de Extranjero)
5 | """
6 |
7 | import re
8 |
9 | from typing import Iterable
10 |
11 | from stdnum.es import dni, nie
12 |
13 | from pii_manager import PiiEnum, PiiEntity
14 | from pii_manager.helper import BasePiiTask
15 |
16 | # regex for DNI & NIE
17 | _DNI_PATTERN = r"\d{6,8} -? [A-KJ-NP-TV-Z]"
18 | _NIE_PATTERN = r"[X-Z] \d{7} -? [A-KJ-NP-TV-Z]"
19 |
20 |
21 | class SpanishDniNie(BasePiiTask):
22 | """
23 | Spanish Government-issued DNI & NIE numbers, recognize & validate
24 | """
25 |
26 | pii_name = "Spanish DNI and NIE numbers"
27 |
28 | def __init__(self, **kwargs):
29 | super().__init__(**kwargs)
30 | # Compile the regexes
31 | self.dni = re.compile(_DNI_PATTERN, flags=re.X)
32 | self.nie = re.compile(_NIE_PATTERN, flags=re.X)
33 |
34 | def find(self, doc: str) -> Iterable[PiiEntity]:
35 | # DNI
36 | for item in self.dni.finditer(doc):
37 | item_value = item.group()
38 | if dni.is_valid(item_value):
39 | yield PiiEntity(
40 | PiiEnum.GOV_ID,
41 | item.start(),
42 | item_value,
43 | country=self.country,
44 | name="Spanish DNI",
45 | )
46 | # NIE
47 | for item in self.nie.finditer(doc):
48 | item_value = item.group()
49 | if nie.is_valid(item_value):
50 | yield PiiEntity(
51 | PiiEnum.GOV_ID,
52 | item.start(),
53 | item_value,
54 | country=self.country,
55 | name="Spanish NIE",
56 | )
57 |
58 |
59 | # Task descriptor
60 | PII_TASKS = [(PiiEnum.GOV_ID, SpanishDniNie)]
61 |
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/es/mx/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/es/mx/__init__.py
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/es/mx/curp.py:
--------------------------------------------------------------------------------
1 | """
2 | Detection and validation of Clave Única de Registro de Población for Mexico
3 |
4 | It contains two check digits, so it can be validated.
5 | """
6 |
7 | import re
8 |
9 | from stdnum.mx import curp as stdnum_curp
10 |
11 | from typing import Iterable
12 |
13 | from pii_manager import PiiEnum
14 |
15 |
16 | _CURP_PATTERN = r"[A-Z] [AEIOU] [A-Z]{2} \d{6} [HM] [A-Z]{5} [0-9A-Z] \d"
17 | _CURP_REGEX = re.compile(_CURP_PATTERN, flags=re.X)
18 |
19 |
20 | def curp(doc: str) -> Iterable[str]:
21 | """
22 | Mexican Clave Única de Registro de Población (detect and validate)
23 | """
24 | for candidate in _CURP_REGEX.findall(doc):
25 | if stdnum_curp.is_valid(candidate):
26 | yield candidate
27 |
28 |
29 | PII_TASKS = [(PiiEnum.GOV_ID, curp)]
30 |
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/fr/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/fr/__init__.py
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/fr/ca/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/fr/ca/__init__.py
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/fr/ca/social_insurance_number.py:
--------------------------------------------------------------------------------
1 | """
2 | Reuse the SIN code implemented for en
3 | """
4 | from pii_manager.lang.en.ca.social_insurance_number import PII_TASKS
5 |
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/pt/__init__.py:
--------------------------------------------------------------------------------
1 | # Folder for language-independent tasks
2 | LANG_ANY = "any"
3 |
4 | # Country-independent tasks
5 | COUNTRY_ANY = "any"
6 |
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/pt/br/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/pt/br/__init__.py
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/pt/br/cpf.py:
--------------------------------------------------------------------------------
1 | """
2 | Detection and validation of the identifier for Brazilian Cadastro de Pessoa
3 | Física
4 |
5 | It contains two check digits, so it can be validated.
6 | """
7 |
8 | import re
9 |
10 | from stdnum.br import cpf
11 |
12 | from typing import Iterable
13 |
14 | from pii_manager import PiiEnum
15 |
16 |
17 | _CPF_REGEX = re.compile(r"\d{3} \. \d{3} \. \d{3} - \d{2}", flags=re.X)
18 |
19 |
20 | def cadastro_pessoa_fisica(doc: str) -> Iterable[str]:
21 | """
22 | Brazilian número de inscrição no Cadastro de Pessoas Físicas (detect and validate)
23 | """
24 | for candidate in _CPF_REGEX.findall(doc):
25 | if cpf.is_valid(candidate):
26 | yield candidate
27 |
28 |
29 | PII_TASKS = [(PiiEnum.GOV_ID, cadastro_pessoa_fisica)]
30 |
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/pt/pt/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/pt/pt/__init__.py
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/pt/pt/govid.py:
--------------------------------------------------------------------------------
1 | """
2 | Portuguese Government-issued IDs:
3 | - NIF (Número de identificação fiscal)
4 | - CC (Número de Cartão de Cidadão)
5 | """
6 |
7 | import re
8 |
9 | from typing import Iterable
10 |
11 | from stdnum.pt import nif, cc
12 |
13 | from pii_manager import PiiEnum, PiiEntity
14 | from pii_manager.helper import BasePiiTask
15 |
16 |
17 | # regex for NIF & CC
18 | _NIF_PATTERN = r"(?: PT \x20?)? (?: \d{3} \x20 \d{3} \x20 \d{3} | \d{9} )"
19 | _CC_PATTERN = r"\d{8} \x20? \d \x20? [A-Z0-9]{2}\d"
20 |
21 |
22 | class PortugueseNifCc(BasePiiTask):
23 | """
24 | Portuguese Government-issued NIF & CC numbers, recognize & validate
25 | """
26 |
27 | pii_name = "Portuguese NIF and CC numbers"
28 |
29 | def __init__(self, **kwargs):
30 | super().__init__(**kwargs)
31 | # Compile the regexes
32 | self.nif = re.compile(_NIF_PATTERN, flags=re.X)
33 | self.cc = re.compile(_CC_PATTERN, flags=re.X)
34 |
35 | def find(self, doc: str) -> Iterable[PiiEntity]:
36 | # NIF
37 | for item in self.nif.finditer(doc):
38 | item_value = item.group()
39 | if nif.is_valid(item_value):
40 | yield PiiEntity(
41 | PiiEnum.GOV_ID,
42 | item.start(),
43 | item_value,
44 | country=self.country,
45 | name="Portuguese NIF",
46 | )
47 | # CC
48 | for item in self.cc.finditer(doc):
49 | item_value = item.group()
50 | if cc.is_valid(item_value):
51 | yield PiiEntity(
52 | PiiEnum.GOV_ID,
53 | item.start(),
54 | item_value,
55 | country=self.country,
56 | name="Portuguese CC",
57 | )
58 |
59 |
60 | # Task descriptor
61 | PII_TASKS = [(PiiEnum.GOV_ID, PortugueseNifCc)]
62 |
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/zh/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/zh/__init__.py
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/zh/cn/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/data_tooling/388499d47750f65ea11dce6465bb858858f30abc/pii-manager/src/pii_manager/lang/zh/cn/__init__.py
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/zh/cn/gov_id.py:
--------------------------------------------------------------------------------
1 | """
2 | Detection of various government-issued IDs for China:
3 | - Resident Identification Card number (this can be validated)
4 | - Passport number (this cannot)
5 | """
6 |
7 | import re
8 | from typing import Iterable
9 |
10 | from pii_manager import PiiEnum
11 |
12 | from stdnum.cn import ric
13 |
14 |
15 | # Detect candidates (separately) for RIC and passport-like numbers
16 | _GOV_ID_PATTERN = r"""(?<! [\dX]) (\d{17} [\dX]) (?! [\dX]) |
17 |                       (?<! [A-Z\d]) ([DEGPS] \d{8}) (?! [A-Z\d])"""
18 |
19 | _GOV_ID_REGEX = re.compile(_GOV_ID_PATTERN, flags=re.X)
20 |
21 |
22 | def ric_or_passport(doc: str) -> Iterable[str]:
23 | """
24 | Chinese government-issued identifiers:
25 | - RIC (Resident Identification Card number), detect and validate
26 | - Passport number, detect only
27 | """
28 | for g in _GOV_ID_REGEX.finditer(doc):
29 | if g.group(1) and ric.is_valid(g.group(1)):
30 | yield g.group(1)
31 | elif g.group(2):
32 | yield g.group(2)
33 |
34 |
35 | PII_TASKS = [(PiiEnum.GOV_ID, ric_or_passport)]
36 |
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/lang/zh/cn/misc.py:
--------------------------------------------------------------------------------
1 | """
2 | Detection of various Chinese PII elements
3 | """
4 |
5 |
6 | from pii_manager import PiiEnum
7 |
8 |
9 | _PATTERNS = {
10 | "STREET_ADDRESS": r"""(\p{Han}{1,4} (自治区|省))?
11 | \p{Han}{1,4}
12 | ((?
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/piientity.py:
--------------------------------------------------------------------------------
32 | def __eq__(self, other):
33 | def __eq__(self, other):
34 | return (
35 | self.elem == other.elem
36 | and self.pos == other.pos
37 | and self.value == other.value
38 | and self.country == other.country
39 | and self.name == other.name
40 | )
41 |
42 | def to_json(self) -> Dict:
43 | """
44 | Return the object data as a dict that can then be serialised as JSON
45 | """
46 | return piientity_asdict(self)
47 |
48 |
49 | def piientity_asdict(pii: PiiEntity, name: bool = None, country: bool = None) -> Dict:
50 | """
51 | Create a dictionary from a PiiEntity object
52 | :param name: add the entity name: always (True), never (False), only if defined (None)
53 | :param country: add country information: always (True), never (False), only if defined (None)
54 | """
55 | n = {"name": pii.name} if name or name is None and pii.name else {}
56 | d = {"type": pii.elem.name, **n, "value": pii.value, "pos": pii.pos}
57 | if country or country is None and pii.country:
58 | d["country"] = pii.country
59 | return d
60 |
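A small sketch of the dictionary form, with values taken from the unit tests:

```python
from pii_manager import PiiEnum, PiiEntity
from pii_manager.piientity import piientity_asdict

pii = PiiEntity(PiiEnum.GOV_ID, 10, "34657934-Q", country="es", name="Spanish DNI")

# Both calls produce the same dictionary, e.g.:
# {"type": "GOV_ID", "name": "Spanish DNI", "value": "34657934-Q", "pos": 10, "country": "es"}
print(piientity_asdict(pii))
print(pii.to_json())
```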
--------------------------------------------------------------------------------
/pii-manager/src/pii_manager/piienum.py:
--------------------------------------------------------------------------------
1 | """
2 | Enumeration that contains all defined PII elements
3 |
4 | Order is significant, in the sense that, in a processing job, tasks coming
5 | earlier in the enum will be tried first. Hence the more generic tasks (tasks
6 | that might collide with more specific ones) should come last
7 | """
8 |
9 | from enum import Enum, auto
10 |
11 |
12 | class PiiEnum(str, Enum):
13 | CREDIT_CARD = auto()
14 | BITCOIN_ADDRESS = auto()
15 | IP_ADDRESS = auto()
16 | EMAIL_ADDRESS = auto()
17 | AGE = auto()
18 | BIRTH_DATE = auto()
19 | DEATH_DATE = auto()
20 | NORP = auto()
21 | DISEASE = auto()
22 | BANK_ACCOUNT = auto()
23 | GOV_ID = auto()
24 | PHONE_NUMBER = auto()
25 | LICENSE_PLATE = auto()
26 | STREET_ADDRESS = auto()
27 |
--------------------------------------------------------------------------------
/pii-manager/test/data/extract-block.ndjson:
--------------------------------------------------------------------------------
1 | {"type": "CREDIT_CARD", "name": "credit card", "value": "4273 9666 4581 5642", "pos": 25}
2 | {"type": "BITCOIN_ADDRESS", "name": "bitcoin address", "value": "1AGNa15ZQXAZUgFiqJ2i7Z2DPU2J6hW62i", "pos": 86}
3 |
--------------------------------------------------------------------------------
/pii-manager/test/data/extract-line.ndjson:
--------------------------------------------------------------------------------
1 | {"type": "CREDIT_CARD", "name": "credit card", "value": "4273 9666 4581 5642", "pos": 25, "line": 1}
2 | {"type": "BITCOIN_ADDRESS", "name": "bitcoin address", "value": "1AGNa15ZQXAZUgFiqJ2i7Z2DPU2J6hW62i", "pos": 10, "line": 2}
3 |
--------------------------------------------------------------------------------
/pii-manager/test/data/extract-sentence.ndjson:
--------------------------------------------------------------------------------
1 | {"type": "CREDIT_CARD", "name": "credit card", "value": "4273 9666 4581 5642", "pos": 25, "sentence": 1}
2 | {"type": "BITCOIN_ADDRESS", "name": "bitcoin address", "value": "1AGNa15ZQXAZUgFiqJ2i7Z2DPU2J6hW62i", "pos": 86, "sentence": 1}
3 |
--------------------------------------------------------------------------------
/pii-manager/test/data/full-block.ndjson:
--------------------------------------------------------------------------------
1 | {"text": "My credit card number is 4273 9666 4581 5642 and I have used it to buy BTCs\nstored at 1AGNa15ZQXAZUgFiqJ2i7Z2DPU2J6hW62i. This one, however, is not a\ncredit card number: 4273 9666 4581 5643\n", "entities": [{"type": "CREDIT_CARD", "name": "credit card", "value": "4273 9666 4581 5642", "pos": 25}, {"type": "BITCOIN_ADDRESS", "name": "bitcoin address", "value": "1AGNa15ZQXAZUgFiqJ2i7Z2DPU2J6hW62i", "pos": 86}]}
2 |
--------------------------------------------------------------------------------
/pii-manager/test/data/full-line.ndjson:
--------------------------------------------------------------------------------
1 | {"text": "My credit card number is 4273 9666 4581 5642 and I have used it to buy BTCs\n", "entities": [{"type": "CREDIT_CARD", "name": "credit card", "value": "4273 9666 4581 5642", "pos": 25}]}{"text": "stored at 1AGNa15ZQXAZUgFiqJ2i7Z2DPU2J6hW62i. This one, however, is not a\n", "entities": [{"type": "BITCOIN_ADDRESS", "name": "bitcoin address", "value": "1AGNa15ZQXAZUgFiqJ2i7Z2DPU2J6hW62i", "pos": 10}]}{"text": "credit card number: 4273 9666 4581 5643\n", "entities": []}
2 |
--------------------------------------------------------------------------------
/pii-manager/test/data/full-sentence.ndjson:
--------------------------------------------------------------------------------
1 | {"text": "My credit card number is 4273 9666 4581 5642 and I have used it to buy BTCs\nstored at 1AGNa15ZQXAZUgFiqJ2i7Z2DPU2J6hW62i. ", "entities": [{"type": "CREDIT_CARD", "name": "credit card", "value": "4273 9666 4581 5642", "pos": 25}, {"type": "BITCOIN_ADDRESS", "name": "bitcoin address", "value": "1AGNa15ZQXAZUgFiqJ2i7Z2DPU2J6hW62i", "pos": 86}]}{"text": "This one, however, is not a\ncredit card number: 4273 9666 4581 5643\n", "entities": []}
2 |
--------------------------------------------------------------------------------
/pii-manager/test/data/orig.txt:
--------------------------------------------------------------------------------
1 | My credit card number is 4273 9666 4581 5642 and I have used it to buy BTCs
2 | stored at 1AGNa15ZQXAZUgFiqJ2i7Z2DPU2J6hW62i. This one, however, is not a
3 | credit card number: 4273 9666 4581 5643
4 |
--------------------------------------------------------------------------------
/pii-manager/test/data/replace.txt:
--------------------------------------------------------------------------------
1 | My credit card number is and I have used it to buy BTCs
2 | stored at . This one, however, is not a
3 | credit card number: 4273 9666 4581 5643
4 |
--------------------------------------------------------------------------------
/pii-manager/test/data/tag.txt:
--------------------------------------------------------------------------------
1 | My credit card number is and I have used it to buy BTCs
2 | stored at . This one, however, is not a
3 | credit card number: 4273 9666 4581 5643
4 |
--------------------------------------------------------------------------------
/pii-manager/test/data/taskfile-error.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "pii": "IP_ADDRESS",
4 | "type": "regex",
5 | "task": "\\b (?: (?: 25[0-5] | 2[0-4][0-9] | [01]?[0-9][0-9]? ) \\. ){3} (?: 25[0-5] | 2[0-4][0-9] | [01]?[0-9][0-9]?) \\b"
6 | },
7 | {
8 | "pii": "NOT_A_VALID_PII_TASK_CLASS",
9 | "type": "call",
10 | "task": "pii_manager.lang.any.bitcoin_address.bitcoin_address"
11 | },
12 | {
13 | "pii": "CREDIT_CARD",
14 | "type": "PiiClass",
15 | "task": "pii_manager.lang.any.credit_card.CreditCard"
16 | }
17 | ]
18 |
--------------------------------------------------------------------------------
/pii-manager/test/data/taskfile.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "pii": "IP_ADDRESS",
4 | "lang": "any",
5 | "type": "regex",
6 | "task": "\\b (?: (?: 25[0-5] | 2[0-4][0-9] | [01]?[0-9][0-9]? ) \\. ){3} (?: 25[0-5] | 2[0-4][0-9] | [01]?[0-9][0-9]?) \\b",
7 | "doc": "ip address detection via regex"
8 | },
9 | {
10 | "pii": "BITCOIN_ADDRESS",
11 | "lang": "any",
12 | "type": "callable",
13 | "task": "pii_manager.lang.any.bitcoin_address.bitcoin_address",
14 | "doc": "bitcoin address detection"
15 | },
16 | {
17 | "pii": "CREDIT_CARD",
18 | "lang": "en",
19 | "type": "PiiTask",
20 | "task": "pii_manager.lang.any.credit_card.CreditCard",
21 | "doc": "credit card number detection"
22 | }
23 | ]
24 |
--------------------------------------------------------------------------------
/pii-manager/test/unit/api/test_manager.py:
--------------------------------------------------------------------------------
1 | from io import StringIO
2 |
3 | from pii_manager import PiiEnum
4 | from pii_manager.api import PiiManager
5 |
6 |
7 | TEST = (
8 | "El número de la tarjeta de crédito es 4273 9666 4581 5642",
9 | "El número de la tarjeta de crédito es ",
10 | )
11 |
12 |
13 | def test10_constructor():
14 | obj = PiiManager("es", None, PiiEnum.CREDIT_CARD)
15 | assert obj.tasks[0].pii == PiiEnum.CREDIT_CARD
16 | assert str(obj) == ""
17 |
18 |
19 | def test20_info():
20 | obj = PiiManager("es", None, PiiEnum.CREDIT_CARD)
21 | info = obj.task_info()
22 |
23 | exp = {
24 | (PiiEnum.CREDIT_CARD, None,): [
25 | (
26 | "credit card",
27 | "Credit card numbers for most international credit cards (detect & validate)",
28 | )
29 | ]
30 | }
31 | assert info == exp
32 |
33 |
34 | def test20_call():
35 | obj = PiiManager("es", None, PiiEnum.CREDIT_CARD)
36 | anon = obj(TEST[0])
37 | assert anon == TEST[1]
38 |
--------------------------------------------------------------------------------
/pii-manager/test/unit/api/test_manager_ctx.py:
--------------------------------------------------------------------------------
1 | """
2 | Test base objects with context
3 | """
4 |
5 | from pii_manager import PiiEnum, PiiEntity
6 | from pii_manager.api import PiiManager
7 |
8 |
9 | def _pii(pos):
10 | return PiiEntity(PiiEnum.GOV_ID, pos, "3451-K", country="vo", name="vogonian ID")
11 |
12 |
13 | TEST = [
14 | ("my Vogon ID is 3451-K", [_pii(15)]),
15 | ("the number 3451-K is my Vogonian ID", [_pii(11)]),
16 | ("the Vogon ID are 3451-K", []), # context outside window
17 | ("my Betelgeuse ID is 3451-K", []), # context does not match
18 | ]
19 |
20 |
21 | # ------------------------------------------------------------------------
22 |
23 | DUMMY_REGEX = {
24 | "pii": PiiEnum.GOV_ID,
25 | "type": "regex",
26 | "task": r"""\b\d{4}-\w\b""",
27 | "lang": "en",
28 | "name": "vogonian ID",
29 | "country": "vo",
30 | "doc": "a toy example to match a government id",
31 | "context": {"value": ["Vogon ID", "vogonian id"], "width": [12, 20]},
32 | }
33 |
34 |
35 | def test10_context_regex():
36 | """
37 | Check a PII task with contexts, regex variant
38 | """
39 | obj = PiiManager("en", mode="extract")
40 | obj.add_tasks([DUMMY_REGEX])
41 | for (text, exp) in TEST:
42 | got = obj(text)
43 | assert list(got) == exp
44 |
45 |
46 | # ------------------------------------------------------------------------
47 |
48 |
49 | DUMMY_CLASS = {
50 | "pii": PiiEnum.GOV_ID,
51 | "type": "PiiTask",
52 | "task": "unit.api.test_manager_add.DummyPii",
53 | "lang": "en",
54 | "country": "vo",
55 | "name": "vogonian ID",
56 | "doc": "a toy example to match a government id",
57 | "context": {"value": ["Vogon ID", "vogonian id"], "width": [12, 20]},
58 | }
59 |
60 |
61 | def test20_context_class():
62 | """
63 | Check a PII task with contexts, class variant
64 | """
65 | obj = PiiManager("en", mode="extract")
66 | obj.add_tasks([DUMMY_CLASS])
67 | for (text, exp) in TEST:
68 | got = obj(text)
69 | assert list(got) == exp
70 |
--------------------------------------------------------------------------------
/pii-manager/test/unit/helper/test_base.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from pii_manager import PiiEnum, PiiEntity
4 | from pii_manager.helper.base import BasePiiTask
5 | from pii_manager.helper.exception import PiiUnimplemented, InvArgException
6 |
7 | import pii_manager.helper.base as mod
8 |
9 |
10 | def test10_base():
11 | """
12 | Create base object
13 | """
14 | task_spec = {"pii": PiiEnum.BITCOIN_ADDRESS, "lang": "es", "name": "example"}
15 | task = mod.BasePiiTask(**task_spec)
16 | assert task.pii == PiiEnum.BITCOIN_ADDRESS
17 | assert task.lang == "es"
18 | assert task.name == "example"
19 |
20 | with pytest.raises(PiiUnimplemented):
21 | task("blah")
22 |
23 |
24 | def test20_regex():
25 | """
26 | Test regex object
27 | """
28 | task_spec = {"pii": PiiEnum.CREDIT_CARD, "lang": "es", "name": "example"}
29 | task = mod.RegexPiiTask(r"\d{4}", **task_spec)
30 |
31 | got = list(task("number 1234 and number 3451"))
32 | exp = [
33 | PiiEntity(PiiEnum.CREDIT_CARD, 7, "1234", name="example"),
34 | PiiEntity(PiiEnum.CREDIT_CARD, 23, "3451", name="example"),
35 | ]
36 | assert exp == got
37 |
38 |
39 | def test30_callable():
40 | """
41 | Test callable object
42 | """
43 |
44 | def example(i: str):
45 | return ["1234", "3451"]
46 |
47 | task_spec = {"pii": PiiEnum.CREDIT_CARD, "lang": "es", "name": "example"}
48 | task = mod.CallablePiiTask(example, **task_spec)
49 |
50 | got = list(task("number 1234 and number 3451"))
51 | exp = [
52 | PiiEntity(PiiEnum.CREDIT_CARD, 7, "1234", name="example"),
53 | PiiEntity(PiiEnum.CREDIT_CARD, 23, "3451", name="example"),
54 | ]
55 | assert exp == got
56 |
--------------------------------------------------------------------------------
/pii-manager/test/unit/helper/test_norm.py:
--------------------------------------------------------------------------------
1 | import pii_manager.helper.normalizer as mod
2 |
3 |
4 | TEST = [("the Social Security\nNumber is 34512", "the social security number is 34512")]
5 |
6 |
7 | def test10_normalizer():
8 | """
9 | Check whitespace and lowercase normalization
10 | """
11 | for (text, exp) in TEST:
12 | assert mod.normalize(text, "en", whitespace=True, lowercase=True) == exp
13 |
--------------------------------------------------------------------------------
/pii-manager/test/unit/lang/any/test_bitcoin_address.py:
--------------------------------------------------------------------------------
1 | """
2 | Test bitcoin addresses
3 | """
4 |
5 |
6 | from pii_manager import PiiEnum
7 | from pii_manager.api import PiiManager
8 |
9 |
10 | TEST = [
11 | # A valid bitcoin address
12 | (
13 | "BTC address: 1JayVxfVgdaFKirkZTZVK4CdRnFDdFNENN",
14 | "BTC address: ",
15 | ),
16 | (
17 | "BTC address: bc1qwxxvjxlakxe9rmxcphh4yy8a2t6z00k4gc4mpj",
18 | "BTC address: ",
19 | ),
20 | # An invalid bitcoin address
21 | (
22 | "BTC address: 1AGNa15ZQXAZUgFiqJ2i7Z2DPU2J6hW623",
23 | "BTC address: 1AGNa15ZQXAZUgFiqJ2i7Z2DPU2J6hW623",
24 | ),
25 | ]
26 |
27 |
28 | def test10_bitcoin_address():
29 | obj = PiiManager("en", None, PiiEnum.BITCOIN_ADDRESS)
30 | for doc, exp in TEST:
31 | got = obj(doc)
32 | assert got == exp
33 |
--------------------------------------------------------------------------------
/pii-manager/test/unit/lang/any/test_credit_card.py:
--------------------------------------------------------------------------------
1 | """
2 | Test credit card numbers
3 | """
4 |
5 | from pii_manager import PiiEnum
6 | from pii_manager.api import PiiManager
7 |
8 |
9 | TEST = [
10 | # A valid credit card number
11 | (
12 | "El número de la tarjeta de crédito es 4273 9666 4581 5642",
13 | "El número de la tarjeta de crédito es ",
14 | ),
15 | # Without spaces
16 | ("La tarjeta es 4273966645815642", "La tarjeta es "),
17 | # With text afterwards
18 | (
19 | "El número de la tarjeta es 4273 9666 4581 5642 probablemente",
20 | "El número de la tarjeta es probablemente",
21 | ),
22 | # With dashes
23 | (
24 | "mi tarjeta es 4273-9666-4581-5642 con caducidad 07/22",
25 | "mi tarjeta es con caducidad 07/22",
26 | ),
27 | # Too short
28 | (
29 | "El número de la tarjeta de crédito es 4273 9666 4581",
30 | "El número de la tarjeta de crédito es 4273 9666 4581",
31 | ),
32 | # Not a valid credit card number
33 | (
34 | "El número de la tarjeta de crédito es 4273 9666 4581 5641",
35 | "El número de la tarjeta de crédito es 4273 9666 4581 5641",
36 | ),
37 | ]
38 |
39 |
40 | def test10_credit_card():
41 | obj = PiiManager("es", None, PiiEnum.CREDIT_CARD)
42 | for doc, exp in TEST:
43 | got = obj(doc)
44 | assert exp == got
45 |
46 |
47 | def test20_credit_card_stats():
48 | obj = PiiManager("es", None, PiiEnum.CREDIT_CARD)
49 | for doc, exp in TEST:
50 | obj(doc)
51 | assert obj.stats == {"calls": 6, "CREDIT_CARD": 4}
52 |
--------------------------------------------------------------------------------
/pii-manager/test/unit/lang/any/test_email.py:
--------------------------------------------------------------------------------
1 | """
2 | Test email addresses
3 | """
4 |
5 | from pii_manager import PiiEnum
6 | from pii_manager.api import PiiManager
7 |
8 |
9 | TEST = [
10 | # A valid email address
11 | (
12 | "My email is anyone@whatever.com.",
13 | "My email is .",
14 | ),
15 | # An invalid email address
16 | (
17 | "My email is anyone@whatever.",
18 | "My email is anyone@whatever.",
19 | ),
20 | ]
21 |
22 |
23 | def test10_email():
24 | obj = PiiManager("es", None, PiiEnum.EMAIL_ADDRESS)
25 | for doc, exp in TEST:
26 | got = obj(doc)
27 | assert exp == got
28 |
--------------------------------------------------------------------------------
/pii-manager/test/unit/lang/any/test_ip_address.py:
--------------------------------------------------------------------------------
1 | """
2 | Test IP addresses
3 | """
4 |
5 | from pii_manager import PiiEnum
6 | from pii_manager.api import PiiManager
7 |
8 |
9 | TEST = [
10 | # A valid IP address
11 | (
12 | "My IP address is 10.45.122.65",
13 | "My IP address is ",
14 | ),
15 | # An invalid IP address
16 | ("My IP address is 310.45.122.65", "My IP address is 310.45.122.65"),
17 | # An IP address without context
18 | ("My address is 10.45.122.65", "My address is 10.45.122.65"),
19 | ]
20 |
21 |
22 | def test10_ip_address():
23 | obj = PiiManager("en", None, PiiEnum.IP_ADDRESS)
24 | for doc, exp in TEST:
25 | got = obj(doc)
26 | assert exp == got
27 |
--------------------------------------------------------------------------------
/pii-manager/test/unit/lang/en/any/test_ipn_en.py:
--------------------------------------------------------------------------------
1 | """
2 | Test international phone numbers
3 | """
4 |
5 | from pii_manager import PiiEnum
6 | from pii_manager.api import PiiManager
7 | from pii_manager.lang import LANG_ANY
8 |
9 | TEST = [
10 | # Standard phone number
11 | ("phone number: +34 983 453 999", "phone number: "),
12 | ("phone number: +34983453999", "phone number: "),
13 | ("ph. +34983453999", "ph. "),
14 | # An invalid country code
15 | ("phone number: +99 983 453 999", "phone number: +99 983 453 999"),
16 | # No valid contexts
17 | ("number: +34983453999", "number: +34983453999"),
18 | ("phonograph +34983453999", "phonograph +34983453999"),
19 | ]
20 |
21 |
22 | def test10_ssn():
23 | obj = PiiManager("en", LANG_ANY, PiiEnum.PHONE_NUMBER)
24 | for doc, exp in TEST:
25 | got = obj(doc)
26 | assert got == exp
27 |
--------------------------------------------------------------------------------
/pii-manager/test/unit/lang/en/au/test_abn.py:
--------------------------------------------------------------------------------
1 | """
2 | Test Australian Business Number
3 | """
4 |
5 | from pii_manager import PiiEnum
6 | from pii_manager.api import PiiManager
7 |
8 | TEST = [
9 | # A valid ABN
10 | ("business number: 83 914 571 673.", "business number: ."),
11 | # ABN without spaces
12 | ("business number: 83914571673.", "business number: ."),
13 | # An invalid ABN
14 | ("not an ABN: 83 914 571 679", "not an ABN: 83 914 571 679"),
15 | ]
16 |
17 |
18 | def test10_abn():
19 | obj = PiiManager("en", "AU", PiiEnum.GOV_ID)
20 | for doc, exp in TEST:
21 | got = obj(doc)
22 | assert got == exp
23 |
--------------------------------------------------------------------------------
/pii-manager/test/unit/lang/en/au/test_tfn.py:
--------------------------------------------------------------------------------
1 | """
2 | Test Australian Tax File Number
3 | """
4 |
5 | from pii_manager import PiiEnum
6 | from pii_manager.api import PiiManager
7 |
8 | TEST = [
9 | # A valid TFN
10 | ("tax file number: 963 553 151.", "tax file number: ."),
11 | ("the tfn is: 123 456 782", "the tfn is: "),
12 | # TFN without spaces
13 | ("tax file number: 963553151.", "tax file number: ."),
14 | # An invalid TFN
15 | ("not a TFN: 123 456 781", "not a TFN: 123 456 781"),
16 | ]
17 |
18 |
19 | def test10_tfn():
20 | obj = PiiManager("en", "AU", PiiEnum.GOV_ID)
21 | for doc, exp in TEST:
22 | got = obj(doc)
23 | assert got == exp
24 |
--------------------------------------------------------------------------------
/pii-manager/test/unit/lang/en/ca/test_sin.py:
--------------------------------------------------------------------------------
1 | """
2 | Test Canadian Social Insurance Number
3 | """
4 |
5 | from pii_manager import PiiEnum
6 | from pii_manager.api import PiiManager
7 |
8 | TEST = [
9 | # A valid SIN
10 | ("SIN: 963-553-151", "SIN: "),
11 | # SIN with spaces
12 | ("SIN: 339 892 317 number", "SIN: number"),
13 | # An invalid SIN
14 | ("not a SIN: 123-456-781", "not a SIN: 123-456-781"),
15 | ]
16 |
17 |
18 | def test10_ssn():
19 | obj = PiiManager("en", "CA", PiiEnum.GOV_ID)
20 | for doc, exp in TEST:
21 | got = obj(doc)
22 | assert got == exp
23 |
--------------------------------------------------------------------------------
/pii-manager/test/unit/lang/en/in_/test_aadhaar.py:
--------------------------------------------------------------------------------
1 | """
2 | Test Indian Aadhaar Number
3 | """
4 |
5 | from pii_manager import PiiEnum
6 | from pii_manager.api import PiiManager
7 |
8 | TEST = [
9 | # A valid aadhaar
10 | ("aadhaar number 234123412346", "aadhaar number "),
11 | # aadhaar with spaces
12 | ("aadhaar number 2341 2341 2346", "aadhaar number "),
13 | # An invalid aadhaar
14 | (
15 | "not a real aadhaar number: 2341 2341 2347",
16 | "not a real aadhaar number: 2341 2341 2347",
17 | ),
18 | ]
19 |
20 |
21 | def test10_ssn():
22 | obj = PiiManager("en", "IN", PiiEnum.GOV_ID)
23 | for doc, exp in TEST:
24 | got = obj(doc)
25 | assert got == exp
26 |
--------------------------------------------------------------------------------
/pii-manager/test/unit/lang/en/us/test_ssn.py:
--------------------------------------------------------------------------------
1 | """
2 | Test US Social Security Number
3 | """
4 |
5 | from pii_manager import PiiEnum
6 | from pii_manager.api import PiiManager
7 |
8 | TEST = [
9 | # A valid SSN
10 | ("SSN: 536-90-4399", "SSN: "),
11 | # SSN with spaces
12 | ("SSN: 536 90 4399", "SSN: "),
13 | # An invalid SSN
14 | ("not a SSN: 666-90-4399", "not a SSN: 666-90-4399"),
15 | ]
16 |
17 |
18 | def test10_ssn():
19 | obj = PiiManager("en", "US", PiiEnum.GOV_ID)
20 | for doc, exp in TEST:
21 | got = obj(doc)
22 | assert got == exp
23 |
--------------------------------------------------------------------------------
/pii-manager/test/unit/lang/es/any/test_ipn_es.py:
--------------------------------------------------------------------------------
1 | """
2 | Test international phone numbers
3 | """
4 |
5 | from pii_manager import PiiEnum
6 | from pii_manager.api import PiiManager
7 | from pii_manager.lang import LANG_ANY
8 |
9 | TEST = [
10 | # Standard phone number
11 | ("teléfono: +34 983 453 999", "teléfono: "),
12 | ("tf. +34983453999", "tf. "),
13 | ("numero de telefono +34983453999", "numero de telefono "),
14 | # An invalid country code
15 | ("teléfono: +99 983 453 999", "teléfono: +99 983 453 999"),
16 | # No valid contexts
17 | ("número: +34983453999", "número: +34983453999"),
18 | ("tff +34983453999", "tff +34983453999"),
19 | ]
20 |
21 |
22 | def test10_ssn():
23 | obj = PiiManager("es", LANG_ANY, PiiEnum.PHONE_NUMBER)
24 | for doc, exp in TEST:
25 | got = obj(doc)
26 | assert got == exp
27 |
--------------------------------------------------------------------------------
/pii-manager/test/unit/lang/es/es/test_bank_account.py:
--------------------------------------------------------------------------------
1 | """
2 | Test Spanish Bank Accounts
3 | """
4 |
5 | from pii_manager import PiiEnum
6 | from pii_manager.api import PiiManager
7 |
8 | TEST = [
9 | # A valid bank account number
10 | (
11 | "Código cuenta cliente: 2085 8720 60 1902070563",
12 | "Código cuenta cliente: ",
13 | ),
14 | # No spaces
15 | (
16 | "Código cuenta cliente: 20858720601902070563",
17 | "Código cuenta cliente: ",
18 | ),
19 | # An invalid bank account number
20 | (
21 | "Código cuenta cliente: 2085 8720 44 1902070563",
22 | "Código cuenta cliente: 2085 8720 44 1902070563",
23 | ),
24 | ]
25 |
26 |
27 | def test10_bank_account():
28 | obj = PiiManager("es", "ES", PiiEnum.BANK_ACCOUNT)
29 | for doc, exp in TEST:
30 | got = obj(doc)
31 | assert got == exp
32 |
33 |
34 | def test20_bank_account_undefined():
35 | """
36 | Test under another country (hence it will NOT be defined)
37 | """
38 | obj = PiiManager("es", "FR", PiiEnum.BANK_ACCOUNT)
39 | for doc, exp in TEST:
40 | got = obj(doc)
41 | assert got == doc
42 |
--------------------------------------------------------------------------------
/pii-manager/test/unit/lang/es/es/test_govid_es_es.py:
--------------------------------------------------------------------------------
1 | """
2 | Test Spanish DNI & NIE
3 | """
4 |
5 | from pii_manager import PiiEnum, PiiEntity
6 | from pii_manager.api import PiiManager
7 |
8 | TEST = [
9 | # A valid DNI
10 | (
11 | "Mi DNI es 34657934-Q",
12 | "Mi DNI es ",
13 | [PiiEntity(PiiEnum.GOV_ID, 10, "34657934-Q", "es", "Spanish DNI")],
14 | ),
15 | # A DNI without dash
16 | (
17 | "El DNI 34657934Q es válido",
18 | "El DNI es válido",
19 | [PiiEntity(PiiEnum.GOV_ID, 7, "34657934Q", "es", "Spanish DNI")],
20 | ),
21 | # A valid NIE
22 | (
23 | "El NIE es X3465793-S",
24 | "El NIE es ",
25 | [PiiEntity(PiiEnum.GOV_ID, 10, "X3465793-S", "es", "Spanish NIE")],
26 | ),
27 | # An invalid DNI
28 | ("Mi DNI es 34657934-H", "Mi DNI es 34657934-H", []),
29 | ]
30 |
31 |
32 | def test10_dni():
33 | obj = PiiManager("es", "ES", PiiEnum.GOV_ID)
34 | for doc, exp, _ in TEST:
35 | got = obj(doc)
36 | assert got == exp
37 |
38 |
39 | def test20_dni_extract():
40 | obj = PiiManager("es", "ES", PiiEnum.GOV_ID, mode="extract")
41 | for doc, _, exp in TEST:
42 | got = list(obj(doc))
43 | assert got == exp
44 |
--------------------------------------------------------------------------------
/pii-manager/test/unit/lang/es/mx/test_govid_es_mx.py:
--------------------------------------------------------------------------------
1 | """
2 | Test Mexican CURP
3 | """
4 |
5 | from pii_manager import PiiEnum
6 | from pii_manager.api import PiiManager
7 |
8 | TEST = [
9 | # A valid CURP
10 | ("Mi número de CURP es PEPP700101HASRRD09", "Mi número de CURP es "),
11 | # An invalid CURP
12 | (
13 | "Mi número de CURP es PEPP700101HASRRD01",
14 | "Mi número de CURP es PEPP700101HASRRD01",
15 | ),
16 | ]
17 |
18 |
19 | def test10_curp():
20 | obj = PiiManager("es", "MX", PiiEnum.GOV_ID)
21 | for doc, exp in TEST:
22 | got = obj(doc)
23 | assert got == exp
24 |
--------------------------------------------------------------------------------
/pii-manager/test/unit/lang/pt/br/test_govid_pt_br.py:
--------------------------------------------------------------------------------
1 | """
2 | Test Brazilian CPF
3 | """
4 |
5 | from pii_manager import PiiEnum
6 | from pii_manager.api import PiiManager
7 |
8 | TEST = [
9 | # A valid CPF
10 | ("O número do CPF é 263.946.533-30", "O número do CPF é "),
11 | # An invalid CPF
12 | ("O número do CPF é 000.000.000-12", "O número do CPF é 000.000.000-12"),
13 | ]
14 |
15 |
16 | def test10_cpf():
17 | obj = PiiManager("pt", "BR", PiiEnum.GOV_ID)
18 | for doc, exp in TEST:
19 | got = obj(doc)
20 | assert got == exp
21 |
--------------------------------------------------------------------------------
/pii-manager/test/unit/lang/pt/pt/test_govid_pt_pt.py:
--------------------------------------------------------------------------------
1 | """
2 | Test Portuguese NIF & CC
3 | """
4 |
5 | from pii_manager import PiiEnum, PiiEntity
6 | from pii_manager.api import PiiManager
7 |
8 | TEST = [
9 | # A valid NIF
10 | (
11 | "Meu NIF é PT 123 456 789",
12 | "Meu NIF é ",
13 | [PiiEntity(PiiEnum.GOV_ID, 10, "PT 123 456 789", "pt", "Portuguese NIF")],
14 | ),
15 | # A NIF without spacing or prefix
16 | (
17 | "O NIF 123456789 é valido",
18 | "O NIF é valido",
19 | [PiiEntity(PiiEnum.GOV_ID, 6, "123456789", "pt", "Portuguese NIF")],
20 | ),
21 | # A valid CC
22 | (
23 | "O CC é 00000000 0 ZZ4",
24 | "O CC é ",
25 | [PiiEntity(PiiEnum.GOV_ID, 7, "00000000 0 ZZ4", "pt", "Portuguese CC")],
26 | ),
27 | # An invalid NIF
28 | ("Meu NIF é PT 123 456 788", "Meu NIF é PT 123 456 788", []),
29 | ]
30 |
31 |
32 | def test10_nif_cc():
33 | obj = PiiManager("pt", "PT", PiiEnum.GOV_ID)
34 | for doc, exp, _ in TEST:
35 | got = obj(doc)
36 | assert got == exp
37 |
38 |
39 | def test20_nif_cc_extract():
40 | obj = PiiManager("pt", "PT", PiiEnum.GOV_ID, mode="extract")
41 | for doc, _, exp in TEST:
42 | got = list(obj(doc))
43 | assert got == exp
44 |
--------------------------------------------------------------------------------
/pii-manager/test/unit/lang/zh/cn/test_govid_zh_cn.py:
--------------------------------------------------------------------------------
1 | """
2 | Test Chinese government ids (Resident Identity Card & Passport)
3 | """
4 |
5 | from pii_manager import PiiEnum
6 | from pii_manager.api import PiiManager
7 |
8 | TEST = [
9 | # A valid RIC
10 | ("公民身份号码 360426199101010071", "公民身份号码 "),
11 | # An invalid RIC
12 | ("公民身份号码 360426199101010072", "公民身份号码 360426199101010072"),
13 | # An invalid RIC (one additional digit)
14 | ("公民身份号码 3604261991010100717", "公民身份号码 3604261991010100717"),
15 | # A correct passport number
16 | ("中华人民共和国护照 D12345678", "中华人民共和国护照 "),
17 | # An incorrect passport number (invalid letter)
18 | ("中华人民共和国护照 K12345678", "中华人民共和国护照 K12345678"),
19 | # An incorrect passport number (only 7 digits)
20 | ("中华人民共和国护照 D1234567", "中华人民共和国护照 D1234567"),
21 | ]
22 |
23 |
24 | def test10_gov_id():
25 | obj = PiiManager("zh", "CN", PiiEnum.GOV_ID)
26 | for doc, exp in TEST:
27 | got = obj(doc)
28 | assert got == exp
29 |
--------------------------------------------------------------------------------
/pii-manager/test/unit/lang/zh/cn/test_misc.py:
--------------------------------------------------------------------------------
1 | """
2 | Test PII elements for Chinese (Phone numbers, street addresses & diseases)
3 | """
4 |
5 | from pii_manager import PiiEnum
6 | from pii_manager.api import PiiManager
7 |
8 | TEST = [
9 | # Phone number
10 | ("045-4123456", ""),
11 | # Not a phone number (too many digits in the first part)
12 | ("70045-4123456", "70045-4123456"),
13 | # ----- Tests for STREET_ADDRESS & DISEASE are still missing here
14 | ]
15 |
16 |
17 | def test10_misc():
18 | obj = PiiManager(
19 | "zh", "CN", [PiiEnum.STREET_ADDRESS, PiiEnum.PHONE_NUMBER, PiiEnum.DISEASE]
20 | )
21 | for doc, exp in TEST:
22 | got = obj(doc)
23 | assert got == exp
24 |
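A quick usage sketch inferred from the pii-manager unit tests above (not itself part of the test suite): a PiiManager instance is a callable that, in its default mode, returns the input text with detected PII removed and, when built with mode="extract", yields PiiEntity objects instead. Anything beyond what the tests exercise is an assumption.

    # Hedged sketch of the pii_manager API as exercised by the unit tests above.
    from pii_manager import PiiEnum
    from pii_manager.api import PiiManager

    # Default (anonymization) mode: detected PII is stripped from the returned text.
    anonymizer = PiiManager("es", "ES", PiiEnum.GOV_ID)
    print(anonymizer("Mi DNI es 34657934-Q"))        # expected: "Mi DNI es "

    # Extract mode: the call yields PiiEntity objects describing each detection.
    extractor = PiiManager("es", "ES", PiiEnum.GOV_ID, mode="extract")
    for entity in extractor("Mi DNI es 34657934-Q"):
        print(entity)                                # a GOV_ID entity covering "34657934-Q"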
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool]
2 | [tool.poetry]
3 | name = "data-tooling"
4 | version = "0.1.0"
5 | description = "Tools for managing datasets for governance and training."
6 | authors = ["BigScience "]
7 |
8 | [tool.poetry.dependencies]
9 | python = "^3.7.10"
10 |
11 | datasets = "^1.12.1"
12 | transformers = "^4.12.3"
13 | nltk = "^3.6.5"
14 | scikit-learn = "^1.0.1"
15 | fsspec = "^2021.11.0"
16 | kenlm = {url = "https://github.com/kpu/kenlm/archive/master.zip", optional = true}
17 | typer = "^0.4.0"
18 | regex = "^2021.11.10"
19 | simhash-py = "^0.4.0"
20 | PyYAML = "^6.0"
21 | tqdm = "^4.62.3"
22 |
23 | [tool.poetry.dev-dependencies]
24 | pdbpp = "^0.10.2"
25 | isort = "^5.6.4"
26 | flake8 = "^3.8.4"
27 | black = "^21.7b0"
28 | pytest = "^6.2.4"
29 | jupyterlab = "^3.0.16"
30 |
31 | [tool.poetry.extras]
32 | kenlm = ["kenlm"]
33 |
34 | [tool.isort]
35 | profile = 'black'
36 | treat_comments_as_code = "# %%"
37 |
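Note on the kenlm dependency above: it is declared optional and surfaced through the "kenlm" extra, so a default poetry install skips it; requesting the extra explicitly (for example, poetry install -E kenlm) pulls in the kpu/kenlm archive listed under [tool.poetry.dependencies].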
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | dataset>=1.5.0
2 | datasets>=1.8.0
3 | fasttext>=0.9.2
4 | fsspec
5 | ftfy
6 | indexed_gzip>=1.6.1
7 | langid>=1.1.6
8 | nltk
9 | scikit-learn
10 | sentencepiece
11 | sqlalchemy>=1.4.20
12 | transformers
13 | wordfreq
14 |
--------------------------------------------------------------------------------
/tokenizer/python_script/requirements.txt:
--------------------------------------------------------------------------------
1 | datasets>=1.18.0
2 | pyarrow>=6.0.0
3 |
--------------------------------------------------------------------------------
/tokenizer/scripts/01_remove_deplicated_lines.sh:
--------------------------------------------------------------------------------
1 | conda activate dedup-dataset
2 |
3 | DATA_TOOLING_REPO=/home/lucile/code/data_tooling
4 |
5 | DATASET_PATH=/home/lucile/data/tokenization_dataset/alpha-subset-12M
6 | SAVE_DATASET_DIR=/home/lucile/data/tokenization_dataset/alpha-subset-12M-dedup-lines
7 |
8 | pushd $DATA_TOOLING_REPO
9 |
10 | export HF_DATASETS_OFFLINE=1
11 | export HF_DATASETS_CACHE=/home/lucile/to_delete
12 |
13 | python tokenizer/python_script/dedup_lines.py \
14 | --save-dir $SAVE_DATASET_DIR \
15 | --dataset_dir $DATASET_PATH \
16 | --batch-size 100 \
17 | --num-proc 3 \
18 | --min-chars 0 \
19 | --n-records 1000000 \
20 | --min-repetition-threshold 0
21 |
--------------------------------------------------------------------------------
/tokenizer/scripts/02_remove_duplicated_lines_dataset_with_dataset_source.sh:
--------------------------------------------------------------------------------
1 | conda activate dedup-dataset
2 |
3 | DATA_TOOLING_REPO=/home/lucile/code/data_tooling
4 |
5 | DATASET_PATH=/home/lucile/data/tokenization_dataset/tokenization_dataset_v3_small_arrow
6 | SAVE_DATASET_DIR=/home/lucile/data/tokenization_dataset/tokenization_dataset_v3_small_arrow-dedup
7 |
8 | pushd $DATA_TOOLING_REPO
9 |
10 | export HF_DATASETS_OFFLINE=1
11 | export HF_DATASETS_CACHE=/home/lucile/to_delete
12 |
13 | python tokenizer/python_script/dedup_lines.py \
14 | --save-dir $SAVE_DATASET_DIR \
15 | --dataset_dir $DATASET_PATH \
16 | --batch-size 100 \
17 | --num-proc 3 \
18 | --min-chars 0 \
19 | --n-records 1000 \
20 | --min-repetition-threshold 0 \
21 | --preserve_code \
22 | --with-meta-col
23 |
--------------------------------------------------------------------------------
/tokenizer/scripts/03_remove_duplicated_lines_alpha.sh:
--------------------------------------------------------------------------------
1 | conda activate dedup-dataset
2 |
3 | DATA_TOOLING_REPO=/home/lucile/code/data_tooling
4 |
5 | DATASET_PATH=/home/lucile/data/tokenization_dataset/alpha_arrow
6 | SAVE_DATASET_DIR=/home/lucile/data/tokenization_dataset/alpha_arrow-dedup
7 |
8 | pushd $DATA_TOOLING_REPO
9 |
10 | export HF_DATASETS_OFFLINE=1
11 | export HF_DATASETS_CACHE=/home/lucile/to_delete
12 |
13 | python tokenizer/python_script/dedup_lines.py \
14 | --save-dir $SAVE_DATASET_DIR \
15 | --dataset_dir $DATASET_PATH \
16 | --batch-size 1000 \
17 | --num-proc 1 \
18 | --min-chars 0 \
19 | --n-records 57290988 \
20 | --min-repetition-threshold 0 \
21 | --preserve_code \
22 | --with-meta-col
23 |
--------------------------------------------------------------------------------
/tokenizer/scripts/04_remove_duplicated_lines_alpha _memory.sh:
--------------------------------------------------------------------------------
1 | conda activate dedup-dataset
2 |
3 | DATA_TOOLING_REPO=/home/lucile/code/data_tooling
4 |
5 | DATASET_PATH=/home/lucile/data/tokenization_dataset/alpha_arrow
6 | SAVE_DATASET_DIR=/home/lucile/data/tokenization_dataset/alpha_arrow-dedup
7 |
8 | pushd $DATA_TOOLING_REPO
9 |
10 | export HF_DATASETS_OFFLINE=1
11 | export HF_DATASETS_CACHE=/home/lucile/to_delete
12 |
13 | python tokenizer/python_script/ram_dedup_lines.py \
14 | --save-dir $SAVE_DATASET_DIR \
15 | --dataset_dir $DATASET_PATH \
16 | --num-proc 1 \
17 | --batch-size 6000000 \
18 | --load-from-disk
19 |
--------------------------------------------------------------------------------
/tokenizer/scripts/05_remove_duplicated_lines_alpha __v2_memory.sh:
--------------------------------------------------------------------------------
1 | conda activate dedup-dataset
2 |
3 | DATA_TOOLING_REPO=/home/lucile/code/data_tooling
4 |
5 | DATASET_PATH=/home/lucile/data/tokenization_dataset/alpha_v2
6 | SAVE_DATASET_DIR=/home/lucile/data/tokenization_dataset/alpha_v2_dedup
7 |
8 | pushd $DATA_TOOLING_REPO
9 |
10 | export HF_DATASETS_OFFLINE=1
11 | export HF_DATASETS_CACHE=/home/lucile/to_delete
12 |
13 | python tokenizer/python_script/ram_dedup_lines.py \
14 | --save-dir $SAVE_DATASET_DIR \
15 | --dataset_dir $DATASET_PATH \
16 | --num-proc 1 \
17 | --batch-size 6000000
18 |
--------------------------------------------------------------------------------
/tokenizer/scripts/06_dedup_exact_examples.sh:
--------------------------------------------------------------------------------
1 | conda activate dedup-dataset
2 |
3 | DATA_TOOLING_REPO=/home/lucile/code/data_tooling
4 |
5 | DATASET_PATH=/home/lucile/data/tokenization_dataset/alpha_v2_dedup
6 | SAVE_DATASET_DIR=/home/lucile/data/tokenization_dataset/alpha_v2_dedup_lines_and_article
7 |
8 | pushd $DATA_TOOLING_REPO
9 |
10 | export HF_DATASETS_OFFLINE=1
11 | export HF_DATASETS_CACHE=/home/lucile/to_delete
12 |
13 | python tokenizer/python_script/dedup_exact_article.py \
14 | --save-dir $SAVE_DATASET_DIR \
15 | --dataset_dir $DATASET_PATH \
16 | --num-proc 8 \
17 | --batch-size 6000000
18 |
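The six wrapper scripts above call into tokenizer/python_script/dedup_lines.py, ram_dedup_lines.py and dedup_exact_article.py, whose sources are not reproduced in this section. Purely as an assumption inferred from the command lines above, the flag surface those scripts rely on could be declared with argparse roughly as follows (names and defaults beyond what the wrappers pass are hypothetical):

    # Hypothetical argparse sketch of the CLI used by the dedup wrapper scripts above;
    # only the flags visible in those scripts are assumed, everything else is illustrative.
    import argparse

    def build_parser() -> argparse.ArgumentParser:
        parser = argparse.ArgumentParser(
            description="Line- and example-level deduplication of an Arrow dataset"
        )
        parser.add_argument("--save-dir", required=True,
                            help="directory where the deduplicated dataset is written")
        parser.add_argument("--dataset_dir", required=True,
                            help="path of the input dataset")
        parser.add_argument("--batch-size", type=int, default=1000)
        parser.add_argument("--num-proc", type=int, default=1)
        parser.add_argument("--min-chars", type=int, default=0,
                            help="drop lines shorter than this many characters")
        parser.add_argument("--n-records", type=int, default=None,
                            help="number of records to process")
        parser.add_argument("--min-repetition-threshold", type=int, default=0)
        parser.add_argument("--preserve_code", action="store_true")
        parser.add_argument("--with-meta-col", action="store_true")
        parser.add_argument("--load-from-disk", action="store_true")
        return parser

    if __name__ == "__main__":
        print(build_parser().parse_args())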
--------------------------------------------------------------------------------