├── .cargo └── config.toml ├── .devcontainer ├── Dockerfile ├── README.md ├── devcontainer.json ├── noop.txt └── postInstall.sh ├── .flake8 ├── .github └── workflows │ ├── CI.yml │ └── ISSUE_TEMPLATE │ ├── bug_report.yml │ ├── documentation.yml │ ├── feature_request.yml │ └── question.yml ├── .gitignore ├── CITATION.cff ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── Makefile ├── README.md ├── configs ├── c4-replication │ ├── README.md │ ├── mixer.yaml │ ├── reformat_trafilatura.py │ └── tagger.yaml ├── dolma-v1_5 │ ├── README.md │ ├── decontamination │ │ ├── README.md │ │ ├── fix_ids_type.py │ │ ├── step1_3-make-eval-set │ │ │ ├── option1.yaml │ │ │ ├── option2.yaml │ │ │ ├── option3.yaml │ │ │ └── ppl_v2.yaml │ │ ├── step1_4-create-bloom-filter │ │ │ ├── option1.yaml │ │ │ ├── option2.yaml │ │ │ ├── option3.yaml │ │ │ └── ppl_v2.yaml │ │ ├── step2-run-decontamination │ │ │ ├── books.yaml │ │ │ ├── c4.yaml │ │ │ ├── cc.yaml │ │ │ ├── peS2o.yaml │ │ │ ├── reddit.yaml │ │ │ ├── stack.yaml │ │ │ ├── wikibooks.yaml │ │ │ └── wikipedia.yaml │ │ ├── tokenize_v3.sh │ │ └── tokenize_v3_small.sh │ ├── eval-set.md │ ├── mixing │ │ ├── books.yaml │ │ ├── c4.yaml │ │ ├── cc-head.yaml │ │ ├── cc-middle.yaml │ │ ├── cc-tail.yaml │ │ ├── dedupe-test.yaml │ │ ├── pes2o.yaml │ │ ├── reddit.yaml │ │ ├── stack.yaml │ │ └── wiki.yaml │ ├── para_dedupe │ │ ├── c4.yaml │ │ └── cc-middle.yaml │ ├── sample │ │ ├── cc-head.yaml │ │ ├── cc-middle.yaml │ │ └── cc-tail.yaml │ ├── tagger-r2.yaml │ ├── tokenizer.yaml │ └── train-set.md ├── dolma-v1_6 │ ├── decontamination │ │ ├── README.md │ │ ├── fix_ids_type.py │ │ ├── step1_3-make-eval-set │ │ │ ├── option1.yaml │ │ │ ├── option2.yaml │ │ │ ├── option3.yaml │ │ │ └── ppl_v2.yaml │ │ ├── step1_4-create-bloom-filter │ │ │ ├── option2_docs.yaml │ │ │ └── option2_para.yaml │ │ ├── step2-run-decontamination │ │ │ ├── dolma-v1_6_docs.yaml │ │ │ └── dolma-v1_6_para.yaml │ │ ├── step3_mixing │ │ │ └── dolma-v1_6_decon.yaml │ │ ├── tokenize_v3.sh │ │ └── tokenize_v3_small.sh │ ├── doc_dedupe │ │ ├── cc_en_head.yaml │ │ ├── cc_en_middle.yaml │ │ ├── cc_en_tail_part1.yaml │ │ ├── cc_en_tail_part2.yaml │ │ └── cc_en_tail_part3.yaml │ ├── mixing │ │ ├── books.yaml │ │ ├── c4.yaml │ │ ├── cc-head.yaml │ │ ├── cc-middle.yaml │ │ ├── cc-tail.yaml │ │ ├── pes2o.yaml │ │ ├── reddit.yaml │ │ ├── stack.yaml │ │ └── wiki.yaml │ ├── sample.yaml │ ├── sample │ │ ├── cc-head.yaml │ │ ├── cc-middle.yaml │ │ └── cc-tail.yaml │ ├── tokenizer.yaml │ └── tokenizer_v16_sc.yaml ├── dolma-v1_7 │ ├── v1_5-baseline │ │ ├── 300g_sample.yaml │ │ ├── 300g_tok.yml │ │ └── step2-run-decontamination │ │ │ ├── dolma-v1_5_docs.yaml │ │ │ └── dolma-v1_5_para.yaml │ ├── v1_6-baseline │ │ ├── 300g_sample.yml │ │ ├── 300g_tok.yml │ │ └── tok_per_source.sh │ └── v1_7-blocklist │ │ ├── 300g_sample.yml │ │ └── 300g_tok.yml ├── dolma2-resharding │ ├── all-dressed │ │ ├── config │ │ │ ├── alex │ │ │ │ ├── adult_content.yaml │ │ │ │ ├── art_and_design.yaml │ │ │ │ ├── crime_and_law.yaml │ │ │ │ ├── education_and_jobs.yaml │ │ │ │ ├── electronics_and_hardware.yaml │ │ │ │ ├── entertainment.yaml │ │ │ │ ├── fashion_and_beauty.yaml │ │ │ │ ├── finance_and_business.yaml │ │ │ │ ├── food_and_dining.yaml │ │ │ │ ├── games.yaml │ │ │ │ ├── health.yaml │ │ │ │ ├── history_and_geography.yaml │ │ │ │ ├── home_and_hobbies.yaml │ │ │ │ ├── industrial.yaml │ │ │ │ ├── literature.yaml │ │ │ │ ├── politics.yaml │ │ │ │ ├── religion.yaml │ │ │ │ ├── science_math_and_technology.yaml │ │ │ │ ├── social_life.yaml │ │ │ │ ├── software.yaml │ │ │ │ ├── software_development.yaml │ │ │ │ ├── sports_and_fitness.yaml │ │ │ │ ├── transportation.yaml │ │ │ │ └── travel_and_tourism.yaml │ │ │ ├── arithmetic │ │ │ │ ├── adult_content.yaml │ │ │ │ ├── art_and_design.yaml │ │ │ │ ├── crime_and_law.yaml │ │ │ │ ├── education_and_jobs.yaml │ │ │ │ ├── electronics_and_hardware.yaml │ │ │ │ ├── entertainment.yaml │ │ │ │ ├── fashion_and_beauty.yaml │ │ │ │ ├── finance_and_business.yaml │ │ │ │ ├── food_and_dining.yaml │ │ │ │ ├── games.yaml │ │ │ │ ├── health.yaml │ │ │ │ ├── history_and_geography.yaml │ │ │ │ ├── home_and_hobbies.yaml │ │ │ │ ├── industrial.yaml │ │ │ │ ├── literature.yaml │ │ │ │ ├── politics.yaml │ │ │ │ ├── religion.yaml │ │ │ │ ├── science_math_and_technology.yaml │ │ │ │ ├── social_life.yaml │ │ │ │ ├── software.yaml │ │ │ │ ├── software_development.yaml │ │ │ │ ├── sports_and_fitness.yaml │ │ │ │ ├── transportation.yaml │ │ │ │ └── travel_and_tourism.yaml │ │ │ ├── geometric │ │ │ │ ├── adult_content.yaml │ │ │ │ ├── art_and_design.yaml │ │ │ │ ├── crime_and_law.yaml │ │ │ │ ├── education_and_jobs.yaml │ │ │ │ ├── electronics_and_hardware.yaml │ │ │ │ ├── entertainment.yaml │ │ │ │ ├── fashion_and_beauty.yaml │ │ │ │ ├── finance_and_business.yaml │ │ │ │ ├── food_and_dining.yaml │ │ │ │ ├── games.yaml │ │ │ │ ├── health.yaml │ │ │ │ ├── history_and_geography.yaml │ │ │ │ ├── home_and_hobbies.yaml │ │ │ │ ├── industrial.yaml │ │ │ │ ├── literature.yaml │ │ │ │ ├── politics.yaml │ │ │ │ ├── religion.yaml │ │ │ │ ├── science_math_and_technology.yaml │ │ │ │ ├── social_life.yaml │ │ │ │ ├── software.yaml │ │ │ │ ├── software_development.yaml │ │ │ │ ├── sports_and_fitness.yaml │ │ │ │ ├── transportation.yaml │ │ │ │ └── travel_and_tourism.yaml │ │ │ ├── mayee │ │ │ │ ├── adult_content.yaml │ │ │ │ ├── art_and_design.yaml │ │ │ │ ├── crime_and_law.yaml │ │ │ │ ├── education_and_jobs.yaml │ │ │ │ ├── electronics_and_hardware.yaml │ │ │ │ ├── entertainment.yaml │ │ │ │ ├── fashion_and_beauty.yaml │ │ │ │ ├── finance_and_business.yaml │ │ │ │ ├── food_and_dining.yaml │ │ │ │ ├── games.yaml │ │ │ │ ├── health.yaml │ │ │ │ ├── history_and_geography.yaml │ │ │ │ ├── home_and_hobbies.yaml │ │ │ │ ├── industrial.yaml │ │ │ │ ├── literature.yaml │ │ │ │ ├── politics.yaml │ │ │ │ ├── religion.yaml │ │ │ │ ├── science_math_and_technology.yaml │ │ │ │ ├── social_life.yaml │ │ │ │ ├── software.yaml │ │ │ │ ├── software_development.yaml │ │ │ │ ├── sports_and_fitness.yaml │ │ │ │ ├── transportation.yaml │ │ │ │ └── travel_and_tourism.yaml │ │ │ ├── snazzy1 │ │ │ │ ├── adult_content.yaml │ │ │ │ ├── art_and_design.yaml │ │ │ │ ├── crime_and_law.yaml │ │ │ │ ├── education_and_jobs.yaml │ │ │ │ ├── electronics_and_hardware.yaml │ │ │ │ ├── entertainment.yaml │ │ │ │ ├── fashion_and_beauty.yaml │ │ │ │ ├── finance_and_business.yaml │ │ │ │ ├── food_and_dining.yaml │ │ │ │ ├── games.yaml │ │ │ │ ├── health.yaml │ │ │ │ ├── history_and_geography.yaml │ │ │ │ ├── home_and_hobbies.yaml │ │ │ │ ├── industrial.yaml │ │ │ │ ├── literature.yaml │ │ │ │ ├── politics.yaml │ │ │ │ ├── religion.yaml │ │ │ │ ├── science_math_and_technology.yaml │ │ │ │ ├── social_life.yaml │ │ │ │ ├── software.yaml │ │ │ │ ├── software_development.yaml │ │ │ │ ├── sports_and_fitness.yaml │ │ │ │ ├── transportation.yaml │ │ │ │ └── travel_and_tourism.yaml │ │ │ └── snazzy2 │ │ │ │ ├── adult_content.yaml │ │ │ │ ├── art_and_design.yaml │ │ │ │ ├── crime_and_law.yaml │ │ │ │ ├── education_and_jobs.yaml │ │ │ │ ├── electronics_and_hardware.yaml │ │ │ │ ├── entertainment.yaml │ │ │ │ ├── fashion_and_beauty.yaml │ │ │ │ ├── finance_and_business.yaml │ │ │ │ ├── food_and_dining.yaml │ │ │ │ ├── games.yaml │ │ │ │ ├── health.yaml │ │ │ │ ├── history_and_geography.yaml │ │ │ │ ├── home_and_hobbies.yaml │ │ │ │ ├── industrial.yaml │ │ │ │ ├── literature.yaml │ │ │ │ ├── politics.yaml │ │ │ │ ├── religion.yaml │ │ │ │ ├── science_math_and_technology.yaml │ │ │ │ ├── social_life.yaml │ │ │ │ ├── software.yaml │ │ │ │ ├── software_development.yaml │ │ │ │ ├── sports_and_fitness.yaml │ │ │ │ ├── transportation.yaml │ │ │ │ └── travel_and_tourism.yaml │ │ ├── generate.py │ │ ├── runners │ │ │ ├── alex.sh │ │ │ ├── arithmetic.sh │ │ │ ├── geometric.sh │ │ │ ├── mayee.sh │ │ │ ├── snazzy1.sh │ │ │ └── snazzy2.sh │ │ └── vigintiles │ │ │ ├── alex.json │ │ │ ├── alex.log │ │ │ ├── arithmetic.json │ │ │ ├── arithmetic.log │ │ │ ├── geometric.json │ │ │ ├── geometric.log │ │ │ ├── mayee.json │ │ │ ├── mayee.log │ │ │ ├── snazzy1.json │ │ │ ├── snazzy1.log │ │ │ ├── snazzy2.json │ │ │ └── snazzy2.log │ ├── dolma2-0625-v01.csv │ ├── dolma2-0625-v01.xlsx │ ├── dolma2-0625-v02.csv │ ├── dolma2-0625-v02.xlsx │ ├── final.py │ ├── s2orc │ │ ├── config │ │ │ ├── agricultural-and-food-sciences.yaml │ │ │ ├── art.yaml │ │ │ ├── biology.yaml │ │ │ ├── business.yaml │ │ │ ├── chemistry.yaml │ │ │ ├── computer-science.yaml │ │ │ ├── economics.yaml │ │ │ ├── education.yaml │ │ │ ├── engineering.yaml │ │ │ ├── environmental-science.yaml │ │ │ ├── geology.yaml │ │ │ ├── history.yaml │ │ │ ├── law.yaml │ │ │ ├── materials-science.yaml │ │ │ ├── mathematics.yaml │ │ │ ├── medicine.yaml │ │ │ ├── philosophy.yaml │ │ │ ├── physics.yaml │ │ │ ├── political-science.yaml │ │ │ └── psychology.yaml │ │ ├── full_pstar_7rep_dclm_stackedu_conditional.json │ │ ├── generate.log │ │ ├── generate.py │ │ └── run.sh │ ├── s2pdf-sep25 │ │ ├── config │ │ │ ├── adult_content.yaml │ │ │ ├── art_and_design.yaml │ │ │ ├── crime_and_law.yaml │ │ │ ├── education_and_jobs.yaml │ │ │ ├── electronics_and_hardware.yaml │ │ │ ├── entertainment.yaml │ │ │ ├── finance_and_business.yaml │ │ │ ├── food_and_dining.yaml │ │ │ ├── games.yaml │ │ │ ├── health.yaml │ │ │ ├── history_and_geography.yaml │ │ │ ├── home_and_hobbies.yaml │ │ │ ├── industrial.yaml │ │ │ ├── literature.yaml │ │ │ ├── politics.yaml │ │ │ ├── religion.yaml │ │ │ ├── science_math_and_technology.yaml │ │ │ ├── social_life.yaml │ │ │ ├── software.yaml │ │ │ ├── software_development.yaml │ │ │ ├── sports_and_fitness.yaml │ │ │ ├── transportation.yaml │ │ │ └── travel_and_tourism.yaml │ │ ├── full_pstar_7rep_dclm_stackedu_conditional.json │ │ ├── generate.log │ │ ├── generate.py │ │ └── run.sh │ ├── s2pdf │ │ ├── config │ │ │ ├── adult_content.yaml │ │ │ ├── art_and_design.yaml │ │ │ ├── crime_and_law.yaml │ │ │ ├── education_and_jobs.yaml │ │ │ ├── electronics_and_hardware.yaml │ │ │ ├── entertainment.yaml │ │ │ ├── finance_and_business.yaml │ │ │ ├── food_and_dining.yaml │ │ │ ├── games.yaml │ │ │ ├── health.yaml │ │ │ ├── history_and_geography.yaml │ │ │ ├── home_and_hobbies.yaml │ │ │ ├── industrial.yaml │ │ │ ├── literature.yaml │ │ │ ├── politics.yaml │ │ │ ├── religion.yaml │ │ │ ├── science_math_and_technology.yaml │ │ │ ├── social_life.yaml │ │ │ ├── software.yaml │ │ │ ├── software_development.yaml │ │ │ ├── sports_and_fitness.yaml │ │ │ ├── transportation.yaml │ │ │ └── travel_and_tourism.yaml │ │ ├── full_pstar_7rep_dclm_stackedu_conditional.json │ │ ├── generate.log │ │ ├── generate.py │ │ └── run.sh │ ├── smaller │ │ ├── config │ │ │ ├── arxiv.yaml │ │ │ ├── finemath-3plus.yaml │ │ │ └── wikipedia.yaml │ │ ├── generate.log │ │ └── generate.py │ └── stack-edu │ │ ├── config │ │ ├── C.yaml │ │ ├── CSharp.yaml │ │ ├── Cpp.yaml │ │ ├── Go.yaml │ │ ├── Java.yaml │ │ ├── JavaScript.yaml │ │ ├── Markdown.yaml │ │ ├── PHP.yaml │ │ ├── Python.yaml │ │ ├── Ruby.yaml │ │ ├── Rust.yaml │ │ ├── SQL.yaml │ │ ├── Shell.yaml │ │ ├── Swift.yaml │ │ └── TypeScript.yaml │ │ ├── generate.log │ │ ├── generate.py │ │ └── run.sh ├── pes2o-dedup │ └── pes2o_decontamination.json └── test │ ├── test_config_jq.yaml │ ├── test_config_jsonpath.yaml │ └── test_filtered_mixer.yaml ├── contrib ├── code-file-concat │ ├── .gitignore │ ├── Cargo.lock │ ├── Cargo.toml │ ├── README.md │ ├── src │ │ ├── concat.rs │ │ └── main.rs │ └── tests │ │ └── test_concat.py ├── fill-in-middle │ ├── .gitignore │ ├── Cargo.lock │ ├── Cargo.toml │ ├── README.md │ ├── src │ │ ├── fim.rs │ │ └── main.rs │ └── tests │ │ └── test_fim.py └── tokens-sanitizer │ ├── .gitignore │ ├── Cargo.lock │ ├── Cargo.toml │ ├── README.md │ ├── data │ └── input │ │ └── f1.jsonl │ ├── src │ └── main.rs │ └── tests │ └── test_sanitizer.py ├── docs ├── README.md ├── assets │ ├── AI2_Blog_1400x685.png │ ├── AI2_Blog_1400x685.webp │ ├── AI2_Blog_1400x685_2x.png │ ├── AI2_Blog_1400x685_2x.webp │ ├── DOLMA.webp │ ├── DOLMA_2x.png │ ├── DOLMA_4x.png │ ├── Small_655x120.png │ ├── Small_655x120_2x.png │ ├── Square_1_600x600.png │ ├── Square_1_600x600_2x.png │ ├── code-pipeline.pdf │ ├── code-pipeline.png │ ├── diagram.webp │ ├── dolma-v0_1-20230819.pdf │ ├── dolma-v1_6-20240131.pdf │ ├── web-pipeline.pdf │ └── web-pipeline.png ├── data-format.md ├── deduplication.md ├── develop.md ├── examples │ ├── dedupe-by-url.json │ ├── dedupe-paragraphs.json │ ├── wikipedia-mixer.json │ └── wikipedia-mixer.yaml ├── getting-started.md ├── mixer.md ├── parallel-processor.md ├── taggers.md └── tokenize.md ├── pyproject.toml ├── python └── dolma │ ├── __init__.py │ ├── cli │ ├── __init__.py │ ├── __main__.py │ ├── analyzer.py │ ├── deduper.py │ ├── main.py │ ├── mixer.py │ ├── resolvers.py │ ├── shared.py │ ├── tagger.py │ ├── tokenizer.py │ └── warc.py │ ├── core │ ├── __init__.py │ ├── analyzer.py │ ├── binning.py │ ├── data_types.py │ ├── errors.py │ ├── ft_dataset.py │ ├── ft_tagger.py │ ├── loggers.py │ ├── parallel.py │ ├── paths.py │ ├── registry.py │ ├── runtime.py │ ├── taggers.py │ ├── trainer.py │ ├── url_blocker.py │ ├── utils.py │ └── vizualizer.py │ ├── data │ ├── ext_to_lang_mapping.json │ └── naughty_words_en.txt │ ├── py.typed │ ├── taggers │ ├── __init__.py │ ├── c4.py │ ├── code │ │ ├── __init__.py │ │ ├── code_taggers.py │ │ ├── starcoder.py │ │ └── utils.py │ ├── code_composition.py │ ├── gopher.py │ ├── jigsaw.py │ ├── language.py │ ├── length.py │ ├── licenses.py │ ├── pii.py │ ├── punctuation.py │ ├── quality.py │ ├── repetitions │ │ ├── __init__.py │ │ ├── repetitions_taggers.py │ │ └── utils.py │ ├── sampling.py │ ├── tokenizers.py │ └── url.py │ ├── tokenizer │ ├── __init__.py │ ├── data_types.py │ ├── executor.py │ ├── memmap_writer.py │ ├── reshard.py │ └── tokenizer.py │ └── warc │ ├── __init__.py │ ├── linearizers.py │ ├── processor.py │ ├── record_info.py │ └── utils.py ├── scripts ├── attributes_heatmap.py ├── code_reasoning_ablations.py ├── code_reasoning_ablations_gsm8k_code.jsonl ├── dolma_paper_plots.sh ├── dolma_single_digit_tokenizer.py ├── dolma_stats.py ├── download_brave_domains.py ├── download_cloudflare_urls.py ├── download_hosts.ipynb ├── download_phishing_domains.py ├── download_url_blocklist.py ├── find_missing_attributes.py ├── find_offset.py ├── fix_dolma_v15_tokenizer.py ├── hash_sample.py ├── install_blingfire_macos.py ├── make_latex_fig_table.py ├── make_olmo2_tokenizer.py ├── make_wikipedia.py ├── match_links_wiki.ipynb ├── remove_empty_docs.py ├── sample_prefix.py ├── stack_correlation_table.py ├── stats_urls.py ├── tokenize_eval.sh ├── validate_mixer │ ├── README.md │ ├── __init__.py │ ├── config_handler.py │ ├── env_handler.py │ ├── file_operations.py │ ├── filter_operations.py │ ├── main.py │ ├── s3_utils.py │ ├── utils.py │ └── validator.py ├── wandb_run_vocab.yaml ├── wandb_to_plot.py └── wimbd_to_dolma.py ├── setup.sh ├── sources ├── cc_warc │ └── README.md ├── reddit │ ├── README.md │ ├── atomic_content_v3 │ │ ├── build_comment_data.py │ │ ├── build_submission_data.py │ │ ├── requirements.txt │ │ ├── setup.py │ │ ├── subreddit_blocklist.txt │ │ └── utils │ │ │ ├── __init__.py │ │ │ └── shared_utils.py │ ├── atomic_content_v5 │ │ ├── build_comment_data.py │ │ ├── build_submission_data.py │ │ ├── requirements.txt │ │ ├── setup.py │ │ ├── subreddit_blocklist.txt │ │ └── utils │ │ │ ├── __init__.py │ │ │ └── shared_utils.py │ ├── comment_threads_v1 │ │ ├── build_comment_data.py │ │ ├── build_submission_data.py │ │ ├── requirements.txt │ │ ├── setup.py │ │ ├── subreddit_blocklist.txt │ │ └── utils │ │ │ ├── __init__.py │ │ │ └── shared_utils.py │ ├── comment_threads_v2 │ │ ├── build_comment_data.py │ │ ├── build_submission_data.py │ │ ├── requirements.txt │ │ ├── setup.py │ │ ├── subreddit_blocklist.txt │ │ └── utils │ │ │ ├── __init__.py │ │ │ └── shared_utils.py │ └── complete_threads_codelike_v4 │ │ ├── build_combined_thread_data.py │ │ ├── requirements.txt │ │ ├── setup.py │ │ ├── subreddit_blocklist.txt │ │ └── utils │ │ ├── __init__.py │ │ └── shared_utils.py └── starcoder │ ├── README.md │ ├── __init__.py │ ├── requirements.txt │ └── v0.py ├── src ├── bloom_filter.rs ├── bloom_filter │ └── bloom_test.rs ├── deduper.rs ├── filters.rs ├── io.rs ├── lib.rs ├── mixer.rs ├── s3_util.rs ├── shard.rs └── wimbd │ ├── io.rs │ ├── mod.rs │ ├── ngrams │ ├── counter.rs │ ├── mod.rs │ └── topk.rs │ ├── progress.rs │ ├── tokens.rs │ └── util.rs └── tests ├── config ├── c4-cleaned.json ├── dedupe-by-url.json ├── dedupe-paragraph-ngrams.json ├── dedupe-paragraphs.json ├── email-spans-jq.yaml ├── email-spans.json ├── filepath-bad.json ├── filepath-good.json ├── filter-by-spans.json ├── mixer-validator-jq.yaml ├── mixer-validator-jsonpath.yaml ├── mixer.json └── paragraph-spans.json ├── data ├── expected │ ├── dedupe-by-url.json.gz │ ├── dedupe-paragraph-ngrams.json.gz │ ├── dedupe-paragraphs.json.gz │ ├── email-spans-jq.json.gz │ ├── email-spans.json.gz │ ├── filter-by-spans.json.gz │ ├── mixer.json.gz │ └── remove-paragraphs.json.gz ├── formats │ ├── test.jsonl │ ├── test.jsonl.gz │ └── test.jsonl.zst ├── multiple_files │ ├── cc_en_head-0091.jsonl │ ├── cc_en_head-0091.jsonl.gz │ ├── cc_en_head-0174.jsonl │ └── cc_en_head-0174.jsonl.gz ├── provided │ ├── attributes │ │ ├── duplicate_paragraphs │ │ │ └── 000.json.gz │ │ ├── pii │ │ │ └── 000.json.gz │ │ ├── sample │ │ │ └── 000.json.gz │ │ └── toxicity │ │ │ └── 000.json.gz │ ├── deduper │ │ ├── documents │ │ │ └── 000.json.gz │ │ └── pathnotd0cumentz │ │ │ └── 000.json.gz │ └── documents │ │ └── 000.json.gz ├── tokenizer │ ├── dolma2-test-tokenizer.json │ ├── gpt-neo-test-tokenizer.json │ ├── llama-test-tokenizer.json │ └── llama3-test-tokenizer.json ├── urls │ └── easylist.txt.gz └── warc │ ├── sample-0000.warc.gz │ └── sample-0001.warc.gz └── python ├── __init__.py ├── conftest.py ├── extras ├── __init__.py ├── extras_from_module │ ├── __init__.py │ └── extra_taggers.py ├── extras_from_module_path │ ├── __init__.py │ └── extra_taggers.py ├── extras_from_path │ ├── __init__.py │ └── extra_taggers.py └── useful_extra │ └── __init__.py ├── test_analysis.py ├── test_binning.py ├── test_c4.py ├── test_code.py ├── test_code_composition.py ├── test_data_types.py ├── test_deduper.py ├── test_extra.py ├── test_gopher.py ├── test_language.py ├── test_length.py ├── test_license.py ├── test_mixer.py ├── test_nested_struct.py ├── test_omegaconf.py ├── test_parallel.py ├── test_paths.py ├── test_quality.py ├── test_registry.py ├── test_repetitions.py ├── test_repetitions_utils.py ├── test_resharding.py ├── test_runtime.py ├── test_tokenizer.py ├── test_urls.py ├── test_utils.py ├── test_warc.py ├── test_warc_record_info.py └── utils.py /.cargo/config.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/.cargo/config.toml -------------------------------------------------------------------------------- /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/.devcontainer/Dockerfile -------------------------------------------------------------------------------- /.devcontainer/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/.devcontainer/README.md -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/.devcontainer/devcontainer.json -------------------------------------------------------------------------------- /.devcontainer/noop.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/.devcontainer/noop.txt -------------------------------------------------------------------------------- /.devcontainer/postInstall.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/.devcontainer/postInstall.sh -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/.flake8 -------------------------------------------------------------------------------- /.github/workflows/CI.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/.github/workflows/CI.yml -------------------------------------------------------------------------------- /.github/workflows/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/.github/workflows/ISSUE_TEMPLATE/bug_report.yml -------------------------------------------------------------------------------- /.github/workflows/ISSUE_TEMPLATE/documentation.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/.github/workflows/ISSUE_TEMPLATE/documentation.yml -------------------------------------------------------------------------------- /.github/workflows/ISSUE_TEMPLATE/feature_request.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/.github/workflows/ISSUE_TEMPLATE/feature_request.yml -------------------------------------------------------------------------------- /.github/workflows/ISSUE_TEMPLATE/question.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/.github/workflows/ISSUE_TEMPLATE/question.yml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/.gitignore -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/CITATION.cff -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/Cargo.lock -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/Cargo.toml -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/LICENSE -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/Makefile -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/README.md -------------------------------------------------------------------------------- /configs/c4-replication/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/c4-replication/README.md -------------------------------------------------------------------------------- /configs/c4-replication/mixer.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/c4-replication/mixer.yaml -------------------------------------------------------------------------------- /configs/c4-replication/reformat_trafilatura.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/c4-replication/reformat_trafilatura.py -------------------------------------------------------------------------------- /configs/c4-replication/tagger.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/c4-replication/tagger.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/README.md: -------------------------------------------------------------------------------- 1 | # Dolma 1.5 2 | 3 | This directory 4 | -------------------------------------------------------------------------------- /configs/dolma-v1_5/decontamination/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/decontamination/README.md -------------------------------------------------------------------------------- /configs/dolma-v1_5/decontamination/fix_ids_type.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/decontamination/fix_ids_type.py -------------------------------------------------------------------------------- /configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option1.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option1.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option2.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option2.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option3.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option3.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/decontamination/step1_3-make-eval-set/ppl_v2.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/ppl_v2.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option1.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option1.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option2.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option2.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option3.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option3.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/ppl_v2.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/ppl_v2.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/decontamination/step2-run-decontamination/books.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/decontamination/step2-run-decontamination/books.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/decontamination/step2-run-decontamination/c4.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/decontamination/step2-run-decontamination/c4.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/decontamination/step2-run-decontamination/cc.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/decontamination/step2-run-decontamination/cc.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/decontamination/step2-run-decontamination/peS2o.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/decontamination/step2-run-decontamination/peS2o.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/decontamination/step2-run-decontamination/reddit.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/decontamination/step2-run-decontamination/reddit.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/decontamination/step2-run-decontamination/stack.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/decontamination/step2-run-decontamination/stack.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/decontamination/step2-run-decontamination/wikibooks.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/decontamination/step2-run-decontamination/wikibooks.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/decontamination/step2-run-decontamination/wikipedia.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/decontamination/step2-run-decontamination/wikipedia.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/decontamination/tokenize_v3.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/decontamination/tokenize_v3.sh -------------------------------------------------------------------------------- /configs/dolma-v1_5/decontamination/tokenize_v3_small.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/decontamination/tokenize_v3_small.sh -------------------------------------------------------------------------------- /configs/dolma-v1_5/eval-set.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/eval-set.md -------------------------------------------------------------------------------- /configs/dolma-v1_5/mixing/books.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/mixing/books.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/mixing/c4.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/mixing/c4.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/mixing/cc-head.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/mixing/cc-head.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/mixing/cc-middle.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/mixing/cc-middle.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/mixing/cc-tail.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/mixing/cc-tail.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/mixing/dedupe-test.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/mixing/dedupe-test.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/mixing/pes2o.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/mixing/pes2o.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/mixing/reddit.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/mixing/reddit.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/mixing/stack.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/mixing/stack.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/mixing/wiki.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/mixing/wiki.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/para_dedupe/c4.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/para_dedupe/c4.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/para_dedupe/cc-middle.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/para_dedupe/cc-middle.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/sample/cc-head.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/sample/cc-head.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/sample/cc-middle.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/sample/cc-middle.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/sample/cc-tail.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/sample/cc-tail.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/tagger-r2.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/tagger-r2.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/tokenizer.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/tokenizer.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_5/train-set.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_5/train-set.md -------------------------------------------------------------------------------- /configs/dolma-v1_6/decontamination/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/decontamination/README.md -------------------------------------------------------------------------------- /configs/dolma-v1_6/decontamination/fix_ids_type.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/decontamination/fix_ids_type.py -------------------------------------------------------------------------------- /configs/dolma-v1_6/decontamination/step1_3-make-eval-set/option1.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/decontamination/step1_3-make-eval-set/option1.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_6/decontamination/step1_3-make-eval-set/option2.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/decontamination/step1_3-make-eval-set/option2.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_6/decontamination/step1_3-make-eval-set/option3.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/decontamination/step1_3-make-eval-set/option3.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_6/decontamination/step1_3-make-eval-set/ppl_v2.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/decontamination/step1_3-make-eval-set/ppl_v2.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_6/decontamination/step1_4-create-bloom-filter/option2_docs.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/decontamination/step1_4-create-bloom-filter/option2_docs.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_6/decontamination/step1_4-create-bloom-filter/option2_para.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/decontamination/step1_4-create-bloom-filter/option2_para.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_6/decontamination/step2-run-decontamination/dolma-v1_6_docs.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/decontamination/step2-run-decontamination/dolma-v1_6_docs.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_6/decontamination/step2-run-decontamination/dolma-v1_6_para.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/decontamination/step2-run-decontamination/dolma-v1_6_para.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_6/decontamination/step3_mixing/dolma-v1_6_decon.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/decontamination/step3_mixing/dolma-v1_6_decon.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_6/decontamination/tokenize_v3.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/decontamination/tokenize_v3.sh -------------------------------------------------------------------------------- /configs/dolma-v1_6/decontamination/tokenize_v3_small.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/decontamination/tokenize_v3_small.sh -------------------------------------------------------------------------------- /configs/dolma-v1_6/doc_dedupe/cc_en_head.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/doc_dedupe/cc_en_head.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_6/doc_dedupe/cc_en_middle.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/doc_dedupe/cc_en_middle.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_6/doc_dedupe/cc_en_tail_part1.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/doc_dedupe/cc_en_tail_part1.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_6/doc_dedupe/cc_en_tail_part2.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/doc_dedupe/cc_en_tail_part2.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_6/doc_dedupe/cc_en_tail_part3.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/doc_dedupe/cc_en_tail_part3.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_6/mixing/books.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/mixing/books.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_6/mixing/c4.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/mixing/c4.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_6/mixing/cc-head.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/mixing/cc-head.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_6/mixing/cc-middle.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/mixing/cc-middle.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_6/mixing/cc-tail.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/mixing/cc-tail.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_6/mixing/pes2o.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/mixing/pes2o.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_6/mixing/reddit.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/mixing/reddit.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_6/mixing/stack.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/mixing/stack.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_6/mixing/wiki.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/mixing/wiki.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_6/sample.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/sample.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_6/sample/cc-head.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/sample/cc-head.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_6/sample/cc-middle.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/sample/cc-middle.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_6/sample/cc-tail.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/sample/cc-tail.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_6/tokenizer.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/tokenizer.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_6/tokenizer_v16_sc.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_6/tokenizer_v16_sc.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_7/v1_5-baseline/300g_sample.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_7/v1_5-baseline/300g_sample.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_7/v1_5-baseline/300g_tok.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_7/v1_5-baseline/300g_tok.yml -------------------------------------------------------------------------------- /configs/dolma-v1_7/v1_5-baseline/step2-run-decontamination/dolma-v1_5_docs.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_7/v1_5-baseline/step2-run-decontamination/dolma-v1_5_docs.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_7/v1_5-baseline/step2-run-decontamination/dolma-v1_5_para.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_7/v1_5-baseline/step2-run-decontamination/dolma-v1_5_para.yaml -------------------------------------------------------------------------------- /configs/dolma-v1_7/v1_6-baseline/300g_sample.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_7/v1_6-baseline/300g_sample.yml -------------------------------------------------------------------------------- /configs/dolma-v1_7/v1_6-baseline/300g_tok.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_7/v1_6-baseline/300g_tok.yml -------------------------------------------------------------------------------- /configs/dolma-v1_7/v1_6-baseline/tok_per_source.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_7/v1_6-baseline/tok_per_source.sh -------------------------------------------------------------------------------- /configs/dolma-v1_7/v1_7-blocklist/300g_sample.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_7/v1_7-blocklist/300g_sample.yml -------------------------------------------------------------------------------- /configs/dolma-v1_7/v1_7-blocklist/300g_tok.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma-v1_7/v1_7-blocklist/300g_tok.yml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/alex/adult_content.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/alex/adult_content.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/alex/art_and_design.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/alex/art_and_design.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/alex/crime_and_law.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/alex/crime_and_law.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/alex/education_and_jobs.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/alex/education_and_jobs.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/alex/electronics_and_hardware.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/alex/electronics_and_hardware.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/alex/entertainment.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/alex/entertainment.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/alex/fashion_and_beauty.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/alex/fashion_and_beauty.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/alex/finance_and_business.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/alex/finance_and_business.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/alex/food_and_dining.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/alex/food_and_dining.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/alex/games.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/alex/games.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/alex/health.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/alex/health.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/alex/history_and_geography.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/alex/history_and_geography.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/alex/home_and_hobbies.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/alex/home_and_hobbies.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/alex/industrial.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/alex/industrial.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/alex/literature.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/alex/literature.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/alex/politics.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/alex/politics.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/alex/religion.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/alex/religion.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/alex/science_math_and_technology.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/alex/science_math_and_technology.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/alex/social_life.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/alex/social_life.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/alex/software.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/alex/software.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/alex/software_development.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/alex/software_development.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/alex/sports_and_fitness.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/alex/sports_and_fitness.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/alex/transportation.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/alex/transportation.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/alex/travel_and_tourism.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/alex/travel_and_tourism.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/arithmetic/adult_content.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/arithmetic/adult_content.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/arithmetic/art_and_design.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/arithmetic/art_and_design.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/arithmetic/crime_and_law.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/arithmetic/crime_and_law.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/arithmetic/education_and_jobs.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/arithmetic/education_and_jobs.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/arithmetic/electronics_and_hardware.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/arithmetic/electronics_and_hardware.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/arithmetic/entertainment.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/arithmetic/entertainment.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/arithmetic/fashion_and_beauty.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/arithmetic/fashion_and_beauty.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/arithmetic/finance_and_business.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/arithmetic/finance_and_business.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/arithmetic/food_and_dining.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/arithmetic/food_and_dining.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/arithmetic/games.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/arithmetic/games.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/arithmetic/health.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/arithmetic/health.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/arithmetic/history_and_geography.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/arithmetic/history_and_geography.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/arithmetic/home_and_hobbies.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/arithmetic/home_and_hobbies.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/arithmetic/industrial.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/arithmetic/industrial.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/arithmetic/literature.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/arithmetic/literature.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/arithmetic/politics.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/arithmetic/politics.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/arithmetic/religion.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/arithmetic/religion.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/arithmetic/science_math_and_technology.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/arithmetic/science_math_and_technology.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/arithmetic/social_life.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/arithmetic/social_life.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/arithmetic/software.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/arithmetic/software.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/arithmetic/software_development.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/arithmetic/software_development.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/arithmetic/sports_and_fitness.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/arithmetic/sports_and_fitness.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/arithmetic/transportation.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/arithmetic/transportation.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/arithmetic/travel_and_tourism.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/arithmetic/travel_and_tourism.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/geometric/adult_content.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/geometric/adult_content.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/geometric/art_and_design.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/geometric/art_and_design.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/geometric/crime_and_law.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/geometric/crime_and_law.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/geometric/education_and_jobs.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/geometric/education_and_jobs.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/geometric/electronics_and_hardware.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/geometric/electronics_and_hardware.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/geometric/entertainment.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/geometric/entertainment.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/geometric/fashion_and_beauty.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/geometric/fashion_and_beauty.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/geometric/finance_and_business.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/geometric/finance_and_business.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/geometric/food_and_dining.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/geometric/food_and_dining.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/geometric/games.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/geometric/games.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/geometric/health.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/geometric/health.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/geometric/history_and_geography.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/geometric/history_and_geography.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/geometric/home_and_hobbies.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/geometric/home_and_hobbies.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/geometric/industrial.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/geometric/industrial.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/geometric/literature.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/geometric/literature.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/geometric/politics.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/geometric/politics.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/geometric/religion.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/geometric/religion.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/geometric/science_math_and_technology.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/geometric/science_math_and_technology.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/geometric/social_life.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/geometric/social_life.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/geometric/software.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/geometric/software.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/geometric/software_development.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/geometric/software_development.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/geometric/sports_and_fitness.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/geometric/sports_and_fitness.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/geometric/transportation.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/geometric/transportation.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/geometric/travel_and_tourism.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/geometric/travel_and_tourism.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/mayee/adult_content.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/mayee/adult_content.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/mayee/art_and_design.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/mayee/art_and_design.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/mayee/crime_and_law.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/mayee/crime_and_law.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/mayee/education_and_jobs.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/mayee/education_and_jobs.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/mayee/electronics_and_hardware.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/mayee/electronics_and_hardware.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/mayee/entertainment.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/mayee/entertainment.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/mayee/fashion_and_beauty.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/mayee/fashion_and_beauty.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/mayee/finance_and_business.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/mayee/finance_and_business.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/mayee/food_and_dining.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/mayee/food_and_dining.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/mayee/games.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/mayee/games.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/mayee/health.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/mayee/health.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/mayee/history_and_geography.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/mayee/history_and_geography.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/mayee/home_and_hobbies.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/mayee/home_and_hobbies.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/mayee/industrial.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/mayee/industrial.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/mayee/literature.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/mayee/literature.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/mayee/politics.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/mayee/politics.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/mayee/religion.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/mayee/religion.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/mayee/science_math_and_technology.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/mayee/science_math_and_technology.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/mayee/social_life.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/mayee/social_life.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/mayee/software.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/mayee/software.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/mayee/software_development.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/mayee/software_development.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/mayee/sports_and_fitness.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/mayee/sports_and_fitness.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/mayee/transportation.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/mayee/transportation.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/mayee/travel_and_tourism.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/mayee/travel_and_tourism.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy1/adult_content.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy1/adult_content.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy1/art_and_design.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy1/art_and_design.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy1/crime_and_law.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy1/crime_and_law.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy1/education_and_jobs.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy1/education_and_jobs.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy1/electronics_and_hardware.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy1/electronics_and_hardware.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy1/entertainment.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy1/entertainment.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy1/fashion_and_beauty.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy1/fashion_and_beauty.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy1/finance_and_business.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy1/finance_and_business.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy1/food_and_dining.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy1/food_and_dining.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy1/games.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy1/games.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy1/health.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy1/health.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy1/history_and_geography.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy1/history_and_geography.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy1/home_and_hobbies.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy1/home_and_hobbies.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy1/industrial.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy1/industrial.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy1/literature.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy1/literature.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy1/politics.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy1/politics.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy1/religion.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy1/religion.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy1/science_math_and_technology.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy1/science_math_and_technology.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy1/social_life.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy1/social_life.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy1/software.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy1/software.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy1/software_development.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy1/software_development.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy1/sports_and_fitness.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy1/sports_and_fitness.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy1/transportation.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy1/transportation.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy1/travel_and_tourism.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy1/travel_and_tourism.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy2/adult_content.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy2/adult_content.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy2/art_and_design.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy2/art_and_design.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy2/crime_and_law.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy2/crime_and_law.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy2/education_and_jobs.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy2/education_and_jobs.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy2/electronics_and_hardware.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy2/electronics_and_hardware.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy2/entertainment.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy2/entertainment.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy2/fashion_and_beauty.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy2/fashion_and_beauty.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy2/finance_and_business.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy2/finance_and_business.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy2/food_and_dining.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy2/food_and_dining.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy2/games.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy2/games.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy2/health.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy2/health.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy2/history_and_geography.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy2/history_and_geography.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy2/home_and_hobbies.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy2/home_and_hobbies.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy2/industrial.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy2/industrial.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy2/literature.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy2/literature.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy2/politics.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy2/politics.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy2/religion.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy2/religion.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy2/science_math_and_technology.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy2/science_math_and_technology.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy2/social_life.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy2/social_life.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy2/software.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy2/software.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy2/software_development.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy2/software_development.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy2/sports_and_fitness.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy2/sports_and_fitness.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy2/transportation.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy2/transportation.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/config/snazzy2/travel_and_tourism.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/config/snazzy2/travel_and_tourism.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/generate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/generate.py -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/runners/alex.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/runners/alex.sh -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/runners/arithmetic.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/runners/arithmetic.sh -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/runners/geometric.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/runners/geometric.sh -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/runners/mayee.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/runners/mayee.sh -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/runners/snazzy1.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/runners/snazzy1.sh -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/runners/snazzy2.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/runners/snazzy2.sh -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/vigintiles/alex.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/vigintiles/alex.json -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/vigintiles/alex.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/vigintiles/alex.log -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/vigintiles/arithmetic.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/vigintiles/arithmetic.json -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/vigintiles/arithmetic.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/vigintiles/arithmetic.log -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/vigintiles/geometric.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/vigintiles/geometric.json -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/vigintiles/geometric.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/vigintiles/geometric.log -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/vigintiles/mayee.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/vigintiles/mayee.json -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/vigintiles/mayee.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/vigintiles/mayee.log -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/vigintiles/snazzy1.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/vigintiles/snazzy1.json -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/vigintiles/snazzy1.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/vigintiles/snazzy1.log -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/vigintiles/snazzy2.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/vigintiles/snazzy2.json -------------------------------------------------------------------------------- /configs/dolma2-resharding/all-dressed/vigintiles/snazzy2.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/all-dressed/vigintiles/snazzy2.log -------------------------------------------------------------------------------- /configs/dolma2-resharding/dolma2-0625-v01.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/dolma2-0625-v01.csv -------------------------------------------------------------------------------- /configs/dolma2-resharding/dolma2-0625-v01.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/dolma2-0625-v01.xlsx -------------------------------------------------------------------------------- /configs/dolma2-resharding/dolma2-0625-v02.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/dolma2-0625-v02.csv -------------------------------------------------------------------------------- /configs/dolma2-resharding/dolma2-0625-v02.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/dolma2-0625-v02.xlsx -------------------------------------------------------------------------------- /configs/dolma2-resharding/final.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/final.py -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2orc/config/agricultural-and-food-sciences.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2orc/config/agricultural-and-food-sciences.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2orc/config/art.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2orc/config/art.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2orc/config/biology.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2orc/config/biology.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2orc/config/business.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2orc/config/business.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2orc/config/chemistry.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2orc/config/chemistry.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2orc/config/computer-science.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2orc/config/computer-science.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2orc/config/economics.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2orc/config/economics.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2orc/config/education.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2orc/config/education.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2orc/config/engineering.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2orc/config/engineering.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2orc/config/environmental-science.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2orc/config/environmental-science.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2orc/config/geology.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2orc/config/geology.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2orc/config/history.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2orc/config/history.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2orc/config/law.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2orc/config/law.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2orc/config/materials-science.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2orc/config/materials-science.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2orc/config/mathematics.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2orc/config/mathematics.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2orc/config/medicine.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2orc/config/medicine.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2orc/config/philosophy.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2orc/config/philosophy.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2orc/config/physics.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2orc/config/physics.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2orc/config/political-science.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2orc/config/political-science.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2orc/config/psychology.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2orc/config/psychology.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2orc/full_pstar_7rep_dclm_stackedu_conditional.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2orc/full_pstar_7rep_dclm_stackedu_conditional.json -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2orc/generate.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2orc/generate.log -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2orc/generate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2orc/generate.py -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2orc/run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2orc/run.sh -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf-sep25/config/adult_content.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf-sep25/config/adult_content.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf-sep25/config/art_and_design.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf-sep25/config/art_and_design.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf-sep25/config/crime_and_law.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf-sep25/config/crime_and_law.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf-sep25/config/education_and_jobs.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf-sep25/config/education_and_jobs.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf-sep25/config/electronics_and_hardware.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf-sep25/config/electronics_and_hardware.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf-sep25/config/entertainment.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf-sep25/config/entertainment.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf-sep25/config/finance_and_business.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf-sep25/config/finance_and_business.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf-sep25/config/food_and_dining.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf-sep25/config/food_and_dining.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf-sep25/config/games.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf-sep25/config/games.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf-sep25/config/health.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf-sep25/config/health.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf-sep25/config/history_and_geography.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf-sep25/config/history_and_geography.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf-sep25/config/home_and_hobbies.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf-sep25/config/home_and_hobbies.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf-sep25/config/industrial.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf-sep25/config/industrial.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf-sep25/config/literature.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf-sep25/config/literature.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf-sep25/config/politics.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf-sep25/config/politics.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf-sep25/config/religion.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf-sep25/config/religion.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf-sep25/config/science_math_and_technology.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf-sep25/config/science_math_and_technology.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf-sep25/config/social_life.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf-sep25/config/social_life.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf-sep25/config/software.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf-sep25/config/software.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf-sep25/config/software_development.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf-sep25/config/software_development.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf-sep25/config/sports_and_fitness.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf-sep25/config/sports_and_fitness.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf-sep25/config/transportation.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf-sep25/config/transportation.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf-sep25/config/travel_and_tourism.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf-sep25/config/travel_and_tourism.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf-sep25/full_pstar_7rep_dclm_stackedu_conditional.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf-sep25/full_pstar_7rep_dclm_stackedu_conditional.json -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf-sep25/generate.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf-sep25/generate.log -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf-sep25/generate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf-sep25/generate.py -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf-sep25/run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf-sep25/run.sh -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf/config/adult_content.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf/config/adult_content.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf/config/art_and_design.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf/config/art_and_design.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf/config/crime_and_law.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf/config/crime_and_law.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf/config/education_and_jobs.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf/config/education_and_jobs.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf/config/electronics_and_hardware.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf/config/electronics_and_hardware.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf/config/entertainment.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf/config/entertainment.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf/config/finance_and_business.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf/config/finance_and_business.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf/config/food_and_dining.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf/config/food_and_dining.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf/config/games.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf/config/games.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf/config/health.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf/config/health.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf/config/history_and_geography.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf/config/history_and_geography.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf/config/home_and_hobbies.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf/config/home_and_hobbies.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf/config/industrial.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf/config/industrial.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf/config/literature.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf/config/literature.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf/config/politics.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf/config/politics.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf/config/religion.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf/config/religion.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf/config/science_math_and_technology.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf/config/science_math_and_technology.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf/config/social_life.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf/config/social_life.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf/config/software.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf/config/software.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf/config/software_development.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf/config/software_development.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf/config/sports_and_fitness.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf/config/sports_and_fitness.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf/config/transportation.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf/config/transportation.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf/config/travel_and_tourism.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf/config/travel_and_tourism.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf/full_pstar_7rep_dclm_stackedu_conditional.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf/full_pstar_7rep_dclm_stackedu_conditional.json -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf/generate.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf/generate.log -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf/generate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf/generate.py -------------------------------------------------------------------------------- /configs/dolma2-resharding/s2pdf/run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/s2pdf/run.sh -------------------------------------------------------------------------------- /configs/dolma2-resharding/smaller/config/arxiv.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/smaller/config/arxiv.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/smaller/config/finemath-3plus.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/smaller/config/finemath-3plus.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/smaller/config/wikipedia.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/smaller/config/wikipedia.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/smaller/generate.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/smaller/generate.log -------------------------------------------------------------------------------- /configs/dolma2-resharding/smaller/generate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/smaller/generate.py -------------------------------------------------------------------------------- /configs/dolma2-resharding/stack-edu/config/C.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/stack-edu/config/C.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/stack-edu/config/CSharp.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/stack-edu/config/CSharp.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/stack-edu/config/Cpp.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/stack-edu/config/Cpp.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/stack-edu/config/Go.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/stack-edu/config/Go.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/stack-edu/config/Java.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/stack-edu/config/Java.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/stack-edu/config/JavaScript.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/stack-edu/config/JavaScript.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/stack-edu/config/Markdown.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/stack-edu/config/Markdown.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/stack-edu/config/PHP.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/stack-edu/config/PHP.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/stack-edu/config/Python.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/stack-edu/config/Python.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/stack-edu/config/Ruby.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/stack-edu/config/Ruby.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/stack-edu/config/Rust.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/stack-edu/config/Rust.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/stack-edu/config/SQL.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/stack-edu/config/SQL.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/stack-edu/config/Shell.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/stack-edu/config/Shell.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/stack-edu/config/Swift.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/stack-edu/config/Swift.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/stack-edu/config/TypeScript.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/stack-edu/config/TypeScript.yaml -------------------------------------------------------------------------------- /configs/dolma2-resharding/stack-edu/generate.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/stack-edu/generate.log -------------------------------------------------------------------------------- /configs/dolma2-resharding/stack-edu/generate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/stack-edu/generate.py -------------------------------------------------------------------------------- /configs/dolma2-resharding/stack-edu/run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/dolma2-resharding/stack-edu/run.sh -------------------------------------------------------------------------------- /configs/pes2o-dedup/pes2o_decontamination.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/pes2o-dedup/pes2o_decontamination.json -------------------------------------------------------------------------------- /configs/test/test_config_jq.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/test/test_config_jq.yaml -------------------------------------------------------------------------------- /configs/test/test_config_jsonpath.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/test/test_config_jsonpath.yaml -------------------------------------------------------------------------------- /configs/test/test_filtered_mixer.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/configs/test/test_filtered_mixer.yaml -------------------------------------------------------------------------------- /contrib/code-file-concat/.gitignore: -------------------------------------------------------------------------------- 1 | data/output 2 | -------------------------------------------------------------------------------- /contrib/code-file-concat/Cargo.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/contrib/code-file-concat/Cargo.lock -------------------------------------------------------------------------------- /contrib/code-file-concat/Cargo.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/contrib/code-file-concat/Cargo.toml -------------------------------------------------------------------------------- /contrib/code-file-concat/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/contrib/code-file-concat/README.md -------------------------------------------------------------------------------- /contrib/code-file-concat/src/concat.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/contrib/code-file-concat/src/concat.rs -------------------------------------------------------------------------------- /contrib/code-file-concat/src/main.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/contrib/code-file-concat/src/main.rs -------------------------------------------------------------------------------- /contrib/code-file-concat/tests/test_concat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/contrib/code-file-concat/tests/test_concat.py -------------------------------------------------------------------------------- /contrib/fill-in-middle/.gitignore: -------------------------------------------------------------------------------- 1 | data/output 2 | -------------------------------------------------------------------------------- /contrib/fill-in-middle/Cargo.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/contrib/fill-in-middle/Cargo.lock -------------------------------------------------------------------------------- /contrib/fill-in-middle/Cargo.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/contrib/fill-in-middle/Cargo.toml -------------------------------------------------------------------------------- /contrib/fill-in-middle/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/contrib/fill-in-middle/README.md -------------------------------------------------------------------------------- /contrib/fill-in-middle/src/fim.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/contrib/fill-in-middle/src/fim.rs -------------------------------------------------------------------------------- /contrib/fill-in-middle/src/main.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/contrib/fill-in-middle/src/main.rs -------------------------------------------------------------------------------- /contrib/fill-in-middle/tests/test_fim.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/contrib/fill-in-middle/tests/test_fim.py -------------------------------------------------------------------------------- /contrib/tokens-sanitizer/.gitignore: -------------------------------------------------------------------------------- 1 | data/output 2 | -------------------------------------------------------------------------------- /contrib/tokens-sanitizer/Cargo.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/contrib/tokens-sanitizer/Cargo.lock -------------------------------------------------------------------------------- /contrib/tokens-sanitizer/Cargo.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/contrib/tokens-sanitizer/Cargo.toml -------------------------------------------------------------------------------- /contrib/tokens-sanitizer/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/contrib/tokens-sanitizer/README.md -------------------------------------------------------------------------------- /contrib/tokens-sanitizer/data/input/f1.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/contrib/tokens-sanitizer/data/input/f1.jsonl -------------------------------------------------------------------------------- /contrib/tokens-sanitizer/src/main.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/contrib/tokens-sanitizer/src/main.rs -------------------------------------------------------------------------------- /contrib/tokens-sanitizer/tests/test_sanitizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/contrib/tokens-sanitizer/tests/test_sanitizer.py -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/docs/README.md -------------------------------------------------------------------------------- /docs/assets/AI2_Blog_1400x685.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/docs/assets/AI2_Blog_1400x685.png -------------------------------------------------------------------------------- /docs/assets/AI2_Blog_1400x685.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/docs/assets/AI2_Blog_1400x685.webp -------------------------------------------------------------------------------- /docs/assets/AI2_Blog_1400x685_2x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/docs/assets/AI2_Blog_1400x685_2x.png -------------------------------------------------------------------------------- /docs/assets/AI2_Blog_1400x685_2x.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/docs/assets/AI2_Blog_1400x685_2x.webp -------------------------------------------------------------------------------- /docs/assets/DOLMA.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/docs/assets/DOLMA.webp -------------------------------------------------------------------------------- /docs/assets/DOLMA_2x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/docs/assets/DOLMA_2x.png -------------------------------------------------------------------------------- /docs/assets/DOLMA_4x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/docs/assets/DOLMA_4x.png -------------------------------------------------------------------------------- /docs/assets/Small_655x120.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/docs/assets/Small_655x120.png -------------------------------------------------------------------------------- /docs/assets/Small_655x120_2x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/docs/assets/Small_655x120_2x.png -------------------------------------------------------------------------------- /docs/assets/Square_1_600x600.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/docs/assets/Square_1_600x600.png -------------------------------------------------------------------------------- /docs/assets/Square_1_600x600_2x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/docs/assets/Square_1_600x600_2x.png -------------------------------------------------------------------------------- /docs/assets/code-pipeline.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/docs/assets/code-pipeline.pdf -------------------------------------------------------------------------------- /docs/assets/code-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/docs/assets/code-pipeline.png -------------------------------------------------------------------------------- /docs/assets/diagram.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/docs/assets/diagram.webp -------------------------------------------------------------------------------- /docs/assets/dolma-v0_1-20230819.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/docs/assets/dolma-v0_1-20230819.pdf -------------------------------------------------------------------------------- /docs/assets/dolma-v1_6-20240131.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/docs/assets/dolma-v1_6-20240131.pdf -------------------------------------------------------------------------------- /docs/assets/web-pipeline.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/docs/assets/web-pipeline.pdf -------------------------------------------------------------------------------- /docs/assets/web-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/docs/assets/web-pipeline.png -------------------------------------------------------------------------------- /docs/data-format.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/docs/data-format.md -------------------------------------------------------------------------------- /docs/deduplication.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/docs/deduplication.md -------------------------------------------------------------------------------- /docs/develop.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/docs/develop.md -------------------------------------------------------------------------------- /docs/examples/dedupe-by-url.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/docs/examples/dedupe-by-url.json -------------------------------------------------------------------------------- /docs/examples/dedupe-paragraphs.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/docs/examples/dedupe-paragraphs.json -------------------------------------------------------------------------------- /docs/examples/wikipedia-mixer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/docs/examples/wikipedia-mixer.json -------------------------------------------------------------------------------- /docs/examples/wikipedia-mixer.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/docs/examples/wikipedia-mixer.yaml -------------------------------------------------------------------------------- /docs/getting-started.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/docs/getting-started.md -------------------------------------------------------------------------------- /docs/mixer.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/docs/mixer.md -------------------------------------------------------------------------------- /docs/parallel-processor.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/docs/parallel-processor.md -------------------------------------------------------------------------------- /docs/taggers.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/docs/taggers.md -------------------------------------------------------------------------------- /docs/tokenize.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/docs/tokenize.md -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/pyproject.toml -------------------------------------------------------------------------------- /python/dolma/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/__init__.py -------------------------------------------------------------------------------- /python/dolma/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/cli/__init__.py -------------------------------------------------------------------------------- /python/dolma/cli/__main__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/cli/__main__.py -------------------------------------------------------------------------------- /python/dolma/cli/analyzer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/cli/analyzer.py -------------------------------------------------------------------------------- /python/dolma/cli/deduper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/cli/deduper.py -------------------------------------------------------------------------------- /python/dolma/cli/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/cli/main.py -------------------------------------------------------------------------------- /python/dolma/cli/mixer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/cli/mixer.py -------------------------------------------------------------------------------- /python/dolma/cli/resolvers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/cli/resolvers.py -------------------------------------------------------------------------------- /python/dolma/cli/shared.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/cli/shared.py -------------------------------------------------------------------------------- /python/dolma/cli/tagger.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/cli/tagger.py -------------------------------------------------------------------------------- /python/dolma/cli/tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/cli/tokenizer.py -------------------------------------------------------------------------------- /python/dolma/cli/warc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/cli/warc.py -------------------------------------------------------------------------------- /python/dolma/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/core/__init__.py -------------------------------------------------------------------------------- /python/dolma/core/analyzer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/core/analyzer.py -------------------------------------------------------------------------------- /python/dolma/core/binning.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/core/binning.py -------------------------------------------------------------------------------- /python/dolma/core/data_types.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/core/data_types.py -------------------------------------------------------------------------------- /python/dolma/core/errors.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/core/errors.py -------------------------------------------------------------------------------- /python/dolma/core/ft_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/core/ft_dataset.py -------------------------------------------------------------------------------- /python/dolma/core/ft_tagger.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/core/ft_tagger.py -------------------------------------------------------------------------------- /python/dolma/core/loggers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/core/loggers.py -------------------------------------------------------------------------------- /python/dolma/core/parallel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/core/parallel.py -------------------------------------------------------------------------------- /python/dolma/core/paths.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/core/paths.py -------------------------------------------------------------------------------- /python/dolma/core/registry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/core/registry.py -------------------------------------------------------------------------------- /python/dolma/core/runtime.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/core/runtime.py -------------------------------------------------------------------------------- /python/dolma/core/taggers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/core/taggers.py -------------------------------------------------------------------------------- /python/dolma/core/trainer.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Code to train a Filter. 4 | 5 | @kylel 6 | 7 | """ 8 | -------------------------------------------------------------------------------- /python/dolma/core/url_blocker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/core/url_blocker.py -------------------------------------------------------------------------------- /python/dolma/core/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/core/utils.py -------------------------------------------------------------------------------- /python/dolma/core/vizualizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/core/vizualizer.py -------------------------------------------------------------------------------- /python/dolma/data/ext_to_lang_mapping.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/data/ext_to_lang_mapping.json -------------------------------------------------------------------------------- /python/dolma/data/naughty_words_en.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/data/naughty_words_en.txt -------------------------------------------------------------------------------- /python/dolma/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python/dolma/taggers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/taggers/__init__.py -------------------------------------------------------------------------------- /python/dolma/taggers/c4.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/taggers/c4.py -------------------------------------------------------------------------------- /python/dolma/taggers/code/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/taggers/code/__init__.py -------------------------------------------------------------------------------- /python/dolma/taggers/code/code_taggers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/taggers/code/code_taggers.py -------------------------------------------------------------------------------- /python/dolma/taggers/code/starcoder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/taggers/code/starcoder.py -------------------------------------------------------------------------------- /python/dolma/taggers/code/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/taggers/code/utils.py -------------------------------------------------------------------------------- /python/dolma/taggers/code_composition.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/taggers/code_composition.py -------------------------------------------------------------------------------- /python/dolma/taggers/gopher.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/taggers/gopher.py -------------------------------------------------------------------------------- /python/dolma/taggers/jigsaw.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/taggers/jigsaw.py -------------------------------------------------------------------------------- /python/dolma/taggers/language.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/taggers/language.py -------------------------------------------------------------------------------- /python/dolma/taggers/length.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/taggers/length.py -------------------------------------------------------------------------------- /python/dolma/taggers/licenses.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/taggers/licenses.py -------------------------------------------------------------------------------- /python/dolma/taggers/pii.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/taggers/pii.py -------------------------------------------------------------------------------- /python/dolma/taggers/punctuation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/taggers/punctuation.py -------------------------------------------------------------------------------- /python/dolma/taggers/quality.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/taggers/quality.py -------------------------------------------------------------------------------- /python/dolma/taggers/repetitions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/taggers/repetitions/__init__.py -------------------------------------------------------------------------------- /python/dolma/taggers/repetitions/repetitions_taggers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/taggers/repetitions/repetitions_taggers.py -------------------------------------------------------------------------------- /python/dolma/taggers/repetitions/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/taggers/repetitions/utils.py -------------------------------------------------------------------------------- /python/dolma/taggers/sampling.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/taggers/sampling.py -------------------------------------------------------------------------------- /python/dolma/taggers/tokenizers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/taggers/tokenizers.py -------------------------------------------------------------------------------- /python/dolma/taggers/url.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/taggers/url.py -------------------------------------------------------------------------------- /python/dolma/tokenizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/tokenizer/__init__.py -------------------------------------------------------------------------------- /python/dolma/tokenizer/data_types.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/tokenizer/data_types.py -------------------------------------------------------------------------------- /python/dolma/tokenizer/executor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/tokenizer/executor.py -------------------------------------------------------------------------------- /python/dolma/tokenizer/memmap_writer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/tokenizer/memmap_writer.py -------------------------------------------------------------------------------- /python/dolma/tokenizer/reshard.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/tokenizer/reshard.py -------------------------------------------------------------------------------- /python/dolma/tokenizer/tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/tokenizer/tokenizer.py -------------------------------------------------------------------------------- /python/dolma/warc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/warc/__init__.py -------------------------------------------------------------------------------- /python/dolma/warc/linearizers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/warc/linearizers.py -------------------------------------------------------------------------------- /python/dolma/warc/processor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/warc/processor.py -------------------------------------------------------------------------------- /python/dolma/warc/record_info.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/warc/record_info.py -------------------------------------------------------------------------------- /python/dolma/warc/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/python/dolma/warc/utils.py -------------------------------------------------------------------------------- /scripts/attributes_heatmap.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/attributes_heatmap.py -------------------------------------------------------------------------------- /scripts/code_reasoning_ablations.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/code_reasoning_ablations.py -------------------------------------------------------------------------------- /scripts/code_reasoning_ablations_gsm8k_code.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/code_reasoning_ablations_gsm8k_code.jsonl -------------------------------------------------------------------------------- /scripts/dolma_paper_plots.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/dolma_paper_plots.sh -------------------------------------------------------------------------------- /scripts/dolma_single_digit_tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/dolma_single_digit_tokenizer.py -------------------------------------------------------------------------------- /scripts/dolma_stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/dolma_stats.py -------------------------------------------------------------------------------- /scripts/download_brave_domains.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/download_brave_domains.py -------------------------------------------------------------------------------- /scripts/download_cloudflare_urls.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/download_cloudflare_urls.py -------------------------------------------------------------------------------- /scripts/download_hosts.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/download_hosts.ipynb -------------------------------------------------------------------------------- /scripts/download_phishing_domains.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/download_phishing_domains.py -------------------------------------------------------------------------------- /scripts/download_url_blocklist.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/download_url_blocklist.py -------------------------------------------------------------------------------- /scripts/find_missing_attributes.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/find_missing_attributes.py -------------------------------------------------------------------------------- /scripts/find_offset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/find_offset.py -------------------------------------------------------------------------------- /scripts/fix_dolma_v15_tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/fix_dolma_v15_tokenizer.py -------------------------------------------------------------------------------- /scripts/hash_sample.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/hash_sample.py -------------------------------------------------------------------------------- /scripts/install_blingfire_macos.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/install_blingfire_macos.py -------------------------------------------------------------------------------- /scripts/make_latex_fig_table.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/make_latex_fig_table.py -------------------------------------------------------------------------------- /scripts/make_olmo2_tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/make_olmo2_tokenizer.py -------------------------------------------------------------------------------- /scripts/make_wikipedia.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/make_wikipedia.py -------------------------------------------------------------------------------- /scripts/match_links_wiki.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/match_links_wiki.ipynb -------------------------------------------------------------------------------- /scripts/remove_empty_docs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/remove_empty_docs.py -------------------------------------------------------------------------------- /scripts/sample_prefix.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/sample_prefix.py -------------------------------------------------------------------------------- /scripts/stack_correlation_table.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/stack_correlation_table.py -------------------------------------------------------------------------------- /scripts/stats_urls.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/stats_urls.py -------------------------------------------------------------------------------- /scripts/tokenize_eval.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/tokenize_eval.sh -------------------------------------------------------------------------------- /scripts/validate_mixer/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/validate_mixer/README.md -------------------------------------------------------------------------------- /scripts/validate_mixer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/validate_mixer/__init__.py -------------------------------------------------------------------------------- /scripts/validate_mixer/config_handler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/validate_mixer/config_handler.py -------------------------------------------------------------------------------- /scripts/validate_mixer/env_handler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/validate_mixer/env_handler.py -------------------------------------------------------------------------------- /scripts/validate_mixer/file_operations.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/validate_mixer/file_operations.py -------------------------------------------------------------------------------- /scripts/validate_mixer/filter_operations.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/validate_mixer/filter_operations.py -------------------------------------------------------------------------------- /scripts/validate_mixer/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/validate_mixer/main.py -------------------------------------------------------------------------------- /scripts/validate_mixer/s3_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/validate_mixer/s3_utils.py -------------------------------------------------------------------------------- /scripts/validate_mixer/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/validate_mixer/utils.py -------------------------------------------------------------------------------- /scripts/validate_mixer/validator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/validate_mixer/validator.py -------------------------------------------------------------------------------- /scripts/wandb_run_vocab.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/wandb_run_vocab.yaml -------------------------------------------------------------------------------- /scripts/wandb_to_plot.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/wandb_to_plot.py -------------------------------------------------------------------------------- /scripts/wimbd_to_dolma.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/scripts/wimbd_to_dolma.py -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/setup.sh -------------------------------------------------------------------------------- /sources/cc_warc/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/sources/cc_warc/README.md -------------------------------------------------------------------------------- /sources/reddit/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/sources/reddit/README.md -------------------------------------------------------------------------------- /sources/reddit/atomic_content_v3/build_comment_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/sources/reddit/atomic_content_v3/build_comment_data.py -------------------------------------------------------------------------------- /sources/reddit/atomic_content_v3/build_submission_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/sources/reddit/atomic_content_v3/build_submission_data.py -------------------------------------------------------------------------------- /sources/reddit/atomic_content_v3/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-beam[gcp] 2 | jsonlines 3 | -------------------------------------------------------------------------------- /sources/reddit/atomic_content_v3/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/sources/reddit/atomic_content_v3/setup.py -------------------------------------------------------------------------------- /sources/reddit/atomic_content_v3/subreddit_blocklist.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/sources/reddit/atomic_content_v3/subreddit_blocklist.txt -------------------------------------------------------------------------------- /sources/reddit/atomic_content_v3/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sources/reddit/atomic_content_v3/utils/shared_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/sources/reddit/atomic_content_v3/utils/shared_utils.py -------------------------------------------------------------------------------- /sources/reddit/atomic_content_v5/build_comment_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/sources/reddit/atomic_content_v5/build_comment_data.py -------------------------------------------------------------------------------- /sources/reddit/atomic_content_v5/build_submission_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/sources/reddit/atomic_content_v5/build_submission_data.py -------------------------------------------------------------------------------- /sources/reddit/atomic_content_v5/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-beam[gcp] 2 | jsonlines 3 | -------------------------------------------------------------------------------- /sources/reddit/atomic_content_v5/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/sources/reddit/atomic_content_v5/setup.py -------------------------------------------------------------------------------- /sources/reddit/atomic_content_v5/subreddit_blocklist.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/sources/reddit/atomic_content_v5/subreddit_blocklist.txt -------------------------------------------------------------------------------- /sources/reddit/atomic_content_v5/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sources/reddit/atomic_content_v5/utils/shared_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/sources/reddit/atomic_content_v5/utils/shared_utils.py -------------------------------------------------------------------------------- /sources/reddit/comment_threads_v1/build_comment_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/sources/reddit/comment_threads_v1/build_comment_data.py -------------------------------------------------------------------------------- /sources/reddit/comment_threads_v1/build_submission_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/sources/reddit/comment_threads_v1/build_submission_data.py -------------------------------------------------------------------------------- /sources/reddit/comment_threads_v1/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-beam[gcp] 2 | jsonlines 3 | datasets -------------------------------------------------------------------------------- /sources/reddit/comment_threads_v1/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/sources/reddit/comment_threads_v1/setup.py -------------------------------------------------------------------------------- /sources/reddit/comment_threads_v1/subreddit_blocklist.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/sources/reddit/comment_threads_v1/subreddit_blocklist.txt -------------------------------------------------------------------------------- /sources/reddit/comment_threads_v1/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sources/reddit/comment_threads_v1/utils/shared_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/sources/reddit/comment_threads_v1/utils/shared_utils.py -------------------------------------------------------------------------------- /sources/reddit/comment_threads_v2/build_comment_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/sources/reddit/comment_threads_v2/build_comment_data.py -------------------------------------------------------------------------------- /sources/reddit/comment_threads_v2/build_submission_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/sources/reddit/comment_threads_v2/build_submission_data.py -------------------------------------------------------------------------------- /sources/reddit/comment_threads_v2/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-beam[gcp] 2 | jsonlines 3 | datasets -------------------------------------------------------------------------------- /sources/reddit/comment_threads_v2/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/sources/reddit/comment_threads_v2/setup.py -------------------------------------------------------------------------------- /sources/reddit/comment_threads_v2/subreddit_blocklist.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/sources/reddit/comment_threads_v2/subreddit_blocklist.txt -------------------------------------------------------------------------------- /sources/reddit/comment_threads_v2/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sources/reddit/comment_threads_v2/utils/shared_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/sources/reddit/comment_threads_v2/utils/shared_utils.py -------------------------------------------------------------------------------- /sources/reddit/complete_threads_codelike_v4/build_combined_thread_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/sources/reddit/complete_threads_codelike_v4/build_combined_thread_data.py -------------------------------------------------------------------------------- /sources/reddit/complete_threads_codelike_v4/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-beam[gcp] 2 | jsonlines 3 | datasets -------------------------------------------------------------------------------- /sources/reddit/complete_threads_codelike_v4/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/sources/reddit/complete_threads_codelike_v4/setup.py -------------------------------------------------------------------------------- /sources/reddit/complete_threads_codelike_v4/subreddit_blocklist.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/sources/reddit/complete_threads_codelike_v4/subreddit_blocklist.txt -------------------------------------------------------------------------------- /sources/reddit/complete_threads_codelike_v4/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sources/reddit/complete_threads_codelike_v4/utils/shared_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/sources/reddit/complete_threads_codelike_v4/utils/shared_utils.py -------------------------------------------------------------------------------- /sources/starcoder/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/sources/starcoder/README.md -------------------------------------------------------------------------------- /sources/starcoder/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sources/starcoder/requirements.txt: -------------------------------------------------------------------------------- 1 | pyarrow 2 | -------------------------------------------------------------------------------- /sources/starcoder/v0.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/sources/starcoder/v0.py -------------------------------------------------------------------------------- /src/bloom_filter.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/src/bloom_filter.rs -------------------------------------------------------------------------------- /src/bloom_filter/bloom_test.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/src/bloom_filter/bloom_test.rs -------------------------------------------------------------------------------- /src/deduper.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/src/deduper.rs -------------------------------------------------------------------------------- /src/filters.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/src/filters.rs -------------------------------------------------------------------------------- /src/io.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/src/io.rs -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/src/lib.rs -------------------------------------------------------------------------------- /src/mixer.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/src/mixer.rs -------------------------------------------------------------------------------- /src/s3_util.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/src/s3_util.rs -------------------------------------------------------------------------------- /src/shard.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/src/shard.rs -------------------------------------------------------------------------------- /src/wimbd/io.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/src/wimbd/io.rs -------------------------------------------------------------------------------- /src/wimbd/mod.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/src/wimbd/mod.rs -------------------------------------------------------------------------------- /src/wimbd/ngrams/counter.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/src/wimbd/ngrams/counter.rs -------------------------------------------------------------------------------- /src/wimbd/ngrams/mod.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/src/wimbd/ngrams/mod.rs -------------------------------------------------------------------------------- /src/wimbd/ngrams/topk.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/src/wimbd/ngrams/topk.rs -------------------------------------------------------------------------------- /src/wimbd/progress.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/src/wimbd/progress.rs -------------------------------------------------------------------------------- /src/wimbd/tokens.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/src/wimbd/tokens.rs -------------------------------------------------------------------------------- /src/wimbd/util.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/src/wimbd/util.rs -------------------------------------------------------------------------------- /tests/config/c4-cleaned.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/config/c4-cleaned.json -------------------------------------------------------------------------------- /tests/config/dedupe-by-url.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/config/dedupe-by-url.json -------------------------------------------------------------------------------- /tests/config/dedupe-paragraph-ngrams.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/config/dedupe-paragraph-ngrams.json -------------------------------------------------------------------------------- /tests/config/dedupe-paragraphs.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/config/dedupe-paragraphs.json -------------------------------------------------------------------------------- /tests/config/email-spans-jq.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/config/email-spans-jq.yaml -------------------------------------------------------------------------------- /tests/config/email-spans.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/config/email-spans.json -------------------------------------------------------------------------------- /tests/config/filepath-bad.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/config/filepath-bad.json -------------------------------------------------------------------------------- /tests/config/filepath-good.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/config/filepath-good.json -------------------------------------------------------------------------------- /tests/config/filter-by-spans.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/config/filter-by-spans.json -------------------------------------------------------------------------------- /tests/config/mixer-validator-jq.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/config/mixer-validator-jq.yaml -------------------------------------------------------------------------------- /tests/config/mixer-validator-jsonpath.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/config/mixer-validator-jsonpath.yaml -------------------------------------------------------------------------------- /tests/config/mixer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/config/mixer.json -------------------------------------------------------------------------------- /tests/config/paragraph-spans.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/config/paragraph-spans.json -------------------------------------------------------------------------------- /tests/data/expected/dedupe-by-url.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/data/expected/dedupe-by-url.json.gz -------------------------------------------------------------------------------- /tests/data/expected/dedupe-paragraph-ngrams.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/data/expected/dedupe-paragraph-ngrams.json.gz -------------------------------------------------------------------------------- /tests/data/expected/dedupe-paragraphs.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/data/expected/dedupe-paragraphs.json.gz -------------------------------------------------------------------------------- /tests/data/expected/email-spans-jq.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/data/expected/email-spans-jq.json.gz -------------------------------------------------------------------------------- /tests/data/expected/email-spans.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/data/expected/email-spans.json.gz -------------------------------------------------------------------------------- /tests/data/expected/filter-by-spans.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/data/expected/filter-by-spans.json.gz -------------------------------------------------------------------------------- /tests/data/expected/mixer.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/data/expected/mixer.json.gz -------------------------------------------------------------------------------- /tests/data/expected/remove-paragraphs.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/data/expected/remove-paragraphs.json.gz -------------------------------------------------------------------------------- /tests/data/formats/test.jsonl: -------------------------------------------------------------------------------- 1 | {"message": "this is a test"} -------------------------------------------------------------------------------- /tests/data/formats/test.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/data/formats/test.jsonl.gz -------------------------------------------------------------------------------- /tests/data/formats/test.jsonl.zst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/data/formats/test.jsonl.zst -------------------------------------------------------------------------------- /tests/data/multiple_files/cc_en_head-0091.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/data/multiple_files/cc_en_head-0091.jsonl -------------------------------------------------------------------------------- /tests/data/multiple_files/cc_en_head-0091.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/data/multiple_files/cc_en_head-0091.jsonl.gz -------------------------------------------------------------------------------- /tests/data/multiple_files/cc_en_head-0174.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/data/multiple_files/cc_en_head-0174.jsonl -------------------------------------------------------------------------------- /tests/data/multiple_files/cc_en_head-0174.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/data/multiple_files/cc_en_head-0174.jsonl.gz -------------------------------------------------------------------------------- /tests/data/provided/attributes/duplicate_paragraphs/000.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/data/provided/attributes/duplicate_paragraphs/000.json.gz -------------------------------------------------------------------------------- /tests/data/provided/attributes/pii/000.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/data/provided/attributes/pii/000.json.gz -------------------------------------------------------------------------------- /tests/data/provided/attributes/sample/000.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/data/provided/attributes/sample/000.json.gz -------------------------------------------------------------------------------- /tests/data/provided/attributes/toxicity/000.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/data/provided/attributes/toxicity/000.json.gz -------------------------------------------------------------------------------- /tests/data/provided/deduper/documents/000.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/data/provided/deduper/documents/000.json.gz -------------------------------------------------------------------------------- /tests/data/provided/deduper/pathnotd0cumentz/000.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/data/provided/deduper/pathnotd0cumentz/000.json.gz -------------------------------------------------------------------------------- /tests/data/provided/documents/000.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/data/provided/documents/000.json.gz -------------------------------------------------------------------------------- /tests/data/tokenizer/dolma2-test-tokenizer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/data/tokenizer/dolma2-test-tokenizer.json -------------------------------------------------------------------------------- /tests/data/tokenizer/gpt-neo-test-tokenizer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/data/tokenizer/gpt-neo-test-tokenizer.json -------------------------------------------------------------------------------- /tests/data/tokenizer/llama-test-tokenizer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/data/tokenizer/llama-test-tokenizer.json -------------------------------------------------------------------------------- /tests/data/tokenizer/llama3-test-tokenizer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/data/tokenizer/llama3-test-tokenizer.json -------------------------------------------------------------------------------- /tests/data/urls/easylist.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/data/urls/easylist.txt.gz -------------------------------------------------------------------------------- /tests/data/warc/sample-0000.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/data/warc/sample-0000.warc.gz -------------------------------------------------------------------------------- /tests/data/warc/sample-0001.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/data/warc/sample-0001.warc.gz -------------------------------------------------------------------------------- /tests/python/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/__init__.py -------------------------------------------------------------------------------- /tests/python/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/conftest.py -------------------------------------------------------------------------------- /tests/python/extras/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/python/extras/extras_from_module/__init__.py: -------------------------------------------------------------------------------- 1 | from .extra_taggers import * # noqa 2 | -------------------------------------------------------------------------------- /tests/python/extras/extras_from_module/extra_taggers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/extras/extras_from_module/extra_taggers.py -------------------------------------------------------------------------------- /tests/python/extras/extras_from_module_path/__init__.py: -------------------------------------------------------------------------------- 1 | from .extra_taggers import * # noqa 2 | -------------------------------------------------------------------------------- /tests/python/extras/extras_from_module_path/extra_taggers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/extras/extras_from_module_path/extra_taggers.py -------------------------------------------------------------------------------- /tests/python/extras/extras_from_path/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/python/extras/extras_from_path/extra_taggers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/extras/extras_from_path/extra_taggers.py -------------------------------------------------------------------------------- /tests/python/extras/useful_extra/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/extras/useful_extra/__init__.py -------------------------------------------------------------------------------- /tests/python/test_analysis.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/test_analysis.py -------------------------------------------------------------------------------- /tests/python/test_binning.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/test_binning.py -------------------------------------------------------------------------------- /tests/python/test_c4.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/test_c4.py -------------------------------------------------------------------------------- /tests/python/test_code.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/test_code.py -------------------------------------------------------------------------------- /tests/python/test_code_composition.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/test_code_composition.py -------------------------------------------------------------------------------- /tests/python/test_data_types.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/test_data_types.py -------------------------------------------------------------------------------- /tests/python/test_deduper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/test_deduper.py -------------------------------------------------------------------------------- /tests/python/test_extra.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/test_extra.py -------------------------------------------------------------------------------- /tests/python/test_gopher.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/test_gopher.py -------------------------------------------------------------------------------- /tests/python/test_language.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/test_language.py -------------------------------------------------------------------------------- /tests/python/test_length.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/test_length.py -------------------------------------------------------------------------------- /tests/python/test_license.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/test_license.py -------------------------------------------------------------------------------- /tests/python/test_mixer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/test_mixer.py -------------------------------------------------------------------------------- /tests/python/test_nested_struct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/test_nested_struct.py -------------------------------------------------------------------------------- /tests/python/test_omegaconf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/test_omegaconf.py -------------------------------------------------------------------------------- /tests/python/test_parallel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/test_parallel.py -------------------------------------------------------------------------------- /tests/python/test_paths.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/test_paths.py -------------------------------------------------------------------------------- /tests/python/test_quality.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/test_quality.py -------------------------------------------------------------------------------- /tests/python/test_registry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/test_registry.py -------------------------------------------------------------------------------- /tests/python/test_repetitions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/test_repetitions.py -------------------------------------------------------------------------------- /tests/python/test_repetitions_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/test_repetitions_utils.py -------------------------------------------------------------------------------- /tests/python/test_resharding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/test_resharding.py -------------------------------------------------------------------------------- /tests/python/test_runtime.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/test_runtime.py -------------------------------------------------------------------------------- /tests/python/test_tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/test_tokenizer.py -------------------------------------------------------------------------------- /tests/python/test_urls.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/test_urls.py -------------------------------------------------------------------------------- /tests/python/test_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/test_utils.py -------------------------------------------------------------------------------- /tests/python/test_warc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/test_warc.py -------------------------------------------------------------------------------- /tests/python/test_warc_record_info.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/test_warc_record_info.py -------------------------------------------------------------------------------- /tests/python/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allenai/dolma/HEAD/tests/python/utils.py --------------------------------------------------------------------------------