├── .github └── workflows │ ├── ci.yml │ ├── docs.yml │ └── publish_pypi.yml ├── .gitignore ├── .pypirc ├── .vscode ├── launch.json └── settings.json ├── LICENSE ├── Makefile ├── README.md ├── docs ├── README.md ├── add-your-own-data.md ├── api │ ├── base_dataset.md │ ├── config.md │ ├── hf_dataset.md │ └── jsonl_dataset.md ├── compose-train-validation-data.md ├── config-files.md ├── datasets │ ├── index.md │ ├── language_af.md │ ├── language_am.md │ ├── language_an.md │ ├── language_ar.md │ ├── language_arz.md │ ├── language_as.md │ ├── language_ast.md │ ├── language_av.md │ ├── language_az.md │ ├── language_azb.md │ ├── language_ba.md │ ├── language_be.md │ ├── language_bg.md │ ├── language_bh.md │ ├── language_bn.md │ ├── language_bo.md │ ├── language_bpy.md │ ├── language_br.md │ ├── language_bs.md │ ├── language_bxr.md │ ├── language_ca.md │ ├── language_ce.md │ ├── language_ceb.md │ ├── language_ckb.md │ ├── language_code.md │ ├── language_cs.md │ ├── language_cv.md │ ├── language_cy.md │ ├── language_da.md │ ├── language_de.md │ ├── language_dsb.md │ ├── language_dv.md │ ├── language_el.md │ ├── language_en.md │ ├── language_eo.md │ ├── language_es.md │ ├── language_et.md │ ├── language_eu.md │ ├── language_fa.md │ ├── language_fi.md │ ├── language_fr.md │ ├── language_fy.md │ ├── language_ga.md │ ├── language_gd.md │ ├── language_gl.md │ ├── language_gn.md │ ├── language_gom.md │ ├── language_gsw.md │ ├── language_gu.md │ ├── language_ha.md │ ├── language_he.md │ ├── language_hi.md │ ├── language_hr.md │ ├── language_hsb.md │ ├── language_ht.md │ ├── language_hu.md │ ├── language_hy.md │ ├── language_ia.md │ ├── language_id.md │ ├── language_ie.md │ ├── language_ig.md │ ├── language_ilo.md │ ├── language_io.md │ ├── language_is.md │ ├── language_it.md │ ├── language_ja.md │ ├── language_jbo.md │ ├── language_jv.md │ ├── language_ka.md │ ├── language_kk.md │ ├── language_km.md │ ├── language_kn.md │ ├── language_ko.md │ ├── language_krc.md │ ├── language_ku.md │ ├── language_kv.md │ ├── language_kw.md │ ├── language_ky.md │ ├── language_la.md │ ├── language_lb.md │ ├── language_lez.md │ ├── language_li.md │ ├── language_lmo.md │ ├── language_lo.md │ ├── language_lt.md │ ├── language_lv.md │ ├── language_mai.md │ ├── language_mg.md │ ├── language_mhr.md │ ├── language_min.md │ ├── language_mk.md │ ├── language_ml.md │ ├── language_mn.md │ ├── language_mr.md │ ├── language_mrj.md │ ├── language_ms.md │ ├── language_mt.md │ ├── language_multi.md │ ├── language_mwl.md │ ├── language_my.md │ ├── language_mzn.md │ ├── language_nah.md │ ├── language_nds.md │ ├── language_ne.md │ ├── language_new.md │ ├── language_nl.md │ ├── language_nn.md │ ├── language_no.md │ ├── language_ny.md │ ├── language_oc.md │ ├── language_om.md │ ├── language_or.md │ ├── language_os.md │ ├── language_pa.md │ ├── language_pl.md │ ├── language_pms.md │ ├── language_pnb.md │ ├── language_ps.md │ ├── language_pt.md │ ├── language_qu.md │ ├── language_ro.md │ ├── language_ru.md │ ├── language_rw.md │ ├── language_sa.md │ ├── language_sah.md │ ├── language_sd.md │ ├── language_sh.md │ ├── language_si.md │ ├── language_sk.md │ ├── language_sl.md │ ├── language_sn.md │ ├── language_so.md │ ├── language_sq.md │ ├── language_sr.md │ ├── language_st.md │ ├── language_su.md │ ├── language_sv.md │ ├── language_sw.md │ ├── language_ta.md │ ├── language_te.md │ ├── language_tg.md │ ├── language_th.md │ ├── language_ti.md │ ├── language_tk.md │ ├── language_tl.md │ ├── language_tr.md │ ├── language_tt.md │ ├── language_ug.md │ ├── language_uk.md │ ├── language_ur.md │ ├── language_uz.md │ ├── language_vi.md │ ├── language_vo.md │ ├── language_wa.md │ ├── language_war.md │ ├── language_wuu.md │ ├── language_x-eml.md │ ├── language_xal.md │ ├── language_xh.md │ ├── language_xmf.md │ ├── language_yi.md │ ├── language_yo.md │ ├── language_zh.md │ ├── language_zu.md │ ├── tokens_by_language.png │ └── tokens_by_source.png ├── extract-text-data.md ├── getting-started.md ├── images │ ├── A_colorful_parrot_sitting_on_a_pile_of_books__whit-removebg-preview.png │ ├── data-schema.svg │ ├── favicon-16x16.png │ ├── favicon-32x32.png │ ├── favicon.ico │ └── pipeline.svg ├── index.md ├── integration-with-other-frameworks.md ├── overview.md └── related-work.md ├── examples ├── custom_datasets │ ├── README.md │ └── my_datasets │ │ ├── __init__.py │ │ ├── csv_example.py │ │ ├── dataset_registry.py │ │ └── pg19.py └── lm_datasets_configs │ ├── README.md │ ├── italian_data.yaml │ └── my_system.yaml ├── mkdocs.yml ├── pyproject.toml ├── requirements.txt ├── requirements ├── base.txt ├── datasets.txt ├── datatrove.txt ├── dev.txt ├── docs.txt ├── megatron.txt └── viewer.txt ├── setup.py ├── src └── llm_datasets │ ├── __init__.py │ ├── __main__.py │ ├── chunkify_datasets.py │ ├── collect_metrics.py │ ├── commands │ ├── __init__.py │ ├── chunkify_command.py │ ├── collect_metrics_command.py │ ├── compose_command.py │ ├── convert_parquet_to_jsonl_command.py │ ├── exact_dedup_command.py │ ├── extract_text_command.py │ ├── hf_upload_command.py │ ├── print_stats_command.py │ ├── render_docs_command.py │ ├── shuffle_command.py │ └── train_tokenizer_command.py │ ├── compose_dataset.py │ ├── convert_parquet_to_jsonl.py │ ├── datasets │ ├── __init__.py │ ├── base.py │ ├── bg │ │ └── __init__.py │ ├── code │ │ ├── __init__.py │ │ └── starcoder.py │ ├── cs │ │ ├── __init__.py │ │ ├── cs_en_parallel.py │ │ └── syn_v9.py │ ├── da │ │ ├── __init__.py │ │ ├── danewsroom.py │ │ ├── danish_gigaword.py │ │ ├── danish_parliament_corpus.py │ │ └── dk_clarin.py │ ├── dataset_registry.py │ ├── de │ │ ├── __init__.py │ │ ├── de_laws.py │ │ ├── dewac.py │ │ └── openlegaldata.py │ ├── el │ │ ├── __init__.py │ │ ├── greek_legal_code.py │ │ └── greek_web_corpus.py │ ├── en │ │ ├── __init__.py │ │ ├── dialogstudio.py │ │ ├── edgar.py │ │ ├── math_amps.py │ │ ├── pes2o.py │ │ ├── pile_of_law.py │ │ ├── proof_pile.py │ │ └── wikihow.py │ ├── es │ │ ├── __init__.py │ │ ├── escorpius.py │ │ └── spanish_legal.py │ ├── et │ │ ├── __init__.py │ │ ├── ekspress.py │ │ ├── enc.py │ │ └── estonian_reference_corpus.py │ ├── eu │ │ ├── __init__.py │ │ └── euscrawl.py │ ├── fi │ │ ├── __init__.py │ │ └── ylenews.py │ ├── fr │ │ ├── __init__.py │ │ ├── cabernet.py │ │ ├── pleiasbooks.py │ │ └── pleiasnews.py │ ├── ga │ │ ├── __init__.py │ │ ├── ga_bilingual_legistation.py │ │ └── ga_universal_dependencies.py │ ├── hf_dataset.py │ ├── hr │ │ ├── __init__.py │ │ ├── croatian_news_engri.py │ │ ├── hrwac.py │ │ └── styria_news.py │ ├── it │ │ ├── __init__.py │ │ ├── gazzetta_ufficiale.py │ │ ├── itwac.py │ │ └── paisa.py │ ├── jsonl_dataset.py │ ├── lt │ │ └── seimas_lt_en.py │ ├── lv │ │ ├── __init__.py │ │ └── state_related_latvian_web.py │ ├── mt │ │ ├── __init__.py │ │ └── korpus_malti.py │ ├── multilingual │ │ ├── __init__.py │ │ ├── colossal_oscar.py │ │ ├── curlicat.py │ │ ├── eurlex.py │ │ ├── legal_mc4.py │ │ ├── macocu.py │ │ ├── redpajama.py │ │ ├── wikimedia.py │ │ └── wura.py │ ├── nl │ │ ├── __init__.py │ │ ├── sonar.py │ │ └── sonar_new_media.py │ ├── no │ │ ├── __init__.py │ │ ├── maalfrid_2021.py │ │ ├── nak.py │ │ ├── nbdigital.py │ │ ├── norwegian_cc.py │ │ ├── parlamint.py │ │ ├── parliamentary_proceedings.py │ │ └── sakspapir_nno.py │ ├── parquet_dataset.py │ ├── pl │ │ ├── luna_pl.py │ │ ├── pl_nkjp.py │ │ └── pl_parliamentary_corpus.py │ ├── pt │ │ ├── brwac.py │ │ └── parlamento_pt.py │ ├── ro │ │ ├── __init__.py │ │ └── marcell_legislative_subcorpus_v2.py │ ├── sk │ │ ├── __init__.py │ │ ├── sk_court_decisions.py │ │ └── sk_laws.py │ ├── sl │ │ ├── __init__.py │ │ ├── academic_slovene_kas.py │ │ ├── cc_gigafida.py │ │ └── slwac_web.py │ ├── sr │ │ ├── __init__.py │ │ └── srpkor.py │ ├── sv │ │ ├── __init__.py │ │ └── sv_gigaword.py │ └── uk │ │ ├── __init__.py │ │ ├── ubertext_2.py │ │ └── uk_laws.py │ ├── datatrove_reader.py │ ├── dedup │ └── __init__.py │ ├── extract_text.py │ ├── hf_tokenize_parquet_dataset.py │ ├── io │ ├── __init__.py │ ├── conllu_file.py │ ├── parquet.py │ └── prevert_file.py │ ├── megatron_tokenize_parquet_dataset.py │ ├── print_stats.py │ ├── shuffle_datasets.py │ ├── train_sp_tokenizer.py │ ├── utils │ ├── __init__.py │ ├── config.py │ ├── dataframe.py │ ├── dataset_generator.py │ ├── docs │ │ ├── __init__.py │ │ ├── plots.py │ │ └── tables.py │ ├── flatmap.py │ ├── languages.py │ ├── settings.py │ ├── shuffle_big_file.py │ ├── systems.py │ └── wikimedia.py │ └── viewer │ ├── app.py │ ├── ngrok-app.py │ └── viewer_utils.py └── tests ├── __init__.py ├── conftest.py ├── dummy_datasets.py ├── fixtures ├── configs │ └── dummy_config.yml └── oscar_2301_texts_and_hashes.json ├── test_compose_dataset_benchmark.py ├── test_config.py ├── test_datatrove_reader.py ├── test_document_datasets.py ├── test_generate_texts_from_output.py ├── test_interleave_datasets.py ├── test_iterate_over_shuffled_datasets.py ├── test_split_dataset.py ├── test_tlsh_hashes.py ├── test_write_parquet.py └── test_write_parquet_chunks.py /.github/workflows/ci.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/.github/workflows/ci.yml -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/.github/workflows/docs.yml -------------------------------------------------------------------------------- /.github/workflows/publish_pypi.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/.github/workflows/publish_pypi.yml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/.gitignore -------------------------------------------------------------------------------- /.pypirc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/.pypirc -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/.vscode/launch.json -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/.vscode/settings.json -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/LICENSE -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/Makefile -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/README.md -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/README.md -------------------------------------------------------------------------------- /docs/add-your-own-data.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/add-your-own-data.md -------------------------------------------------------------------------------- /docs/api/base_dataset.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/api/base_dataset.md -------------------------------------------------------------------------------- /docs/api/config.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/api/config.md -------------------------------------------------------------------------------- /docs/api/hf_dataset.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/api/hf_dataset.md -------------------------------------------------------------------------------- /docs/api/jsonl_dataset.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/api/jsonl_dataset.md -------------------------------------------------------------------------------- /docs/compose-train-validation-data.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/compose-train-validation-data.md -------------------------------------------------------------------------------- /docs/config-files.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/config-files.md -------------------------------------------------------------------------------- /docs/datasets/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/index.md -------------------------------------------------------------------------------- /docs/datasets/language_af.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_af.md -------------------------------------------------------------------------------- /docs/datasets/language_am.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_am.md -------------------------------------------------------------------------------- /docs/datasets/language_an.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_an.md -------------------------------------------------------------------------------- /docs/datasets/language_ar.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_ar.md -------------------------------------------------------------------------------- /docs/datasets/language_arz.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_arz.md -------------------------------------------------------------------------------- /docs/datasets/language_as.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_as.md -------------------------------------------------------------------------------- /docs/datasets/language_ast.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_ast.md -------------------------------------------------------------------------------- /docs/datasets/language_av.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_av.md -------------------------------------------------------------------------------- /docs/datasets/language_az.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_az.md -------------------------------------------------------------------------------- /docs/datasets/language_azb.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_azb.md -------------------------------------------------------------------------------- /docs/datasets/language_ba.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_ba.md -------------------------------------------------------------------------------- /docs/datasets/language_be.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_be.md -------------------------------------------------------------------------------- /docs/datasets/language_bg.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_bg.md -------------------------------------------------------------------------------- /docs/datasets/language_bh.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_bh.md -------------------------------------------------------------------------------- /docs/datasets/language_bn.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_bn.md -------------------------------------------------------------------------------- /docs/datasets/language_bo.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_bo.md -------------------------------------------------------------------------------- /docs/datasets/language_bpy.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_bpy.md -------------------------------------------------------------------------------- /docs/datasets/language_br.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_br.md -------------------------------------------------------------------------------- /docs/datasets/language_bs.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_bs.md -------------------------------------------------------------------------------- /docs/datasets/language_bxr.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_bxr.md -------------------------------------------------------------------------------- /docs/datasets/language_ca.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_ca.md -------------------------------------------------------------------------------- /docs/datasets/language_ce.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_ce.md -------------------------------------------------------------------------------- /docs/datasets/language_ceb.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_ceb.md -------------------------------------------------------------------------------- /docs/datasets/language_ckb.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_ckb.md -------------------------------------------------------------------------------- /docs/datasets/language_code.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_code.md -------------------------------------------------------------------------------- /docs/datasets/language_cs.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_cs.md -------------------------------------------------------------------------------- /docs/datasets/language_cv.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_cv.md -------------------------------------------------------------------------------- /docs/datasets/language_cy.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_cy.md -------------------------------------------------------------------------------- /docs/datasets/language_da.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_da.md -------------------------------------------------------------------------------- /docs/datasets/language_de.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_de.md -------------------------------------------------------------------------------- /docs/datasets/language_dsb.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_dsb.md -------------------------------------------------------------------------------- /docs/datasets/language_dv.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_dv.md -------------------------------------------------------------------------------- /docs/datasets/language_el.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_el.md -------------------------------------------------------------------------------- /docs/datasets/language_en.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_en.md -------------------------------------------------------------------------------- /docs/datasets/language_eo.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_eo.md -------------------------------------------------------------------------------- /docs/datasets/language_es.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_es.md -------------------------------------------------------------------------------- /docs/datasets/language_et.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_et.md -------------------------------------------------------------------------------- /docs/datasets/language_eu.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_eu.md -------------------------------------------------------------------------------- /docs/datasets/language_fa.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_fa.md -------------------------------------------------------------------------------- /docs/datasets/language_fi.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_fi.md -------------------------------------------------------------------------------- /docs/datasets/language_fr.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_fr.md -------------------------------------------------------------------------------- /docs/datasets/language_fy.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_fy.md -------------------------------------------------------------------------------- /docs/datasets/language_ga.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_ga.md -------------------------------------------------------------------------------- /docs/datasets/language_gd.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_gd.md -------------------------------------------------------------------------------- /docs/datasets/language_gl.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_gl.md -------------------------------------------------------------------------------- /docs/datasets/language_gn.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_gn.md -------------------------------------------------------------------------------- /docs/datasets/language_gom.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_gom.md -------------------------------------------------------------------------------- /docs/datasets/language_gsw.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_gsw.md -------------------------------------------------------------------------------- /docs/datasets/language_gu.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_gu.md -------------------------------------------------------------------------------- /docs/datasets/language_ha.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_ha.md -------------------------------------------------------------------------------- /docs/datasets/language_he.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_he.md -------------------------------------------------------------------------------- /docs/datasets/language_hi.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_hi.md -------------------------------------------------------------------------------- /docs/datasets/language_hr.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_hr.md -------------------------------------------------------------------------------- /docs/datasets/language_hsb.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_hsb.md -------------------------------------------------------------------------------- /docs/datasets/language_ht.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_ht.md -------------------------------------------------------------------------------- /docs/datasets/language_hu.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_hu.md -------------------------------------------------------------------------------- /docs/datasets/language_hy.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_hy.md -------------------------------------------------------------------------------- /docs/datasets/language_ia.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_ia.md -------------------------------------------------------------------------------- /docs/datasets/language_id.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_id.md -------------------------------------------------------------------------------- /docs/datasets/language_ie.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_ie.md -------------------------------------------------------------------------------- /docs/datasets/language_ig.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_ig.md -------------------------------------------------------------------------------- /docs/datasets/language_ilo.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_ilo.md -------------------------------------------------------------------------------- /docs/datasets/language_io.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_io.md -------------------------------------------------------------------------------- /docs/datasets/language_is.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_is.md -------------------------------------------------------------------------------- /docs/datasets/language_it.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_it.md -------------------------------------------------------------------------------- /docs/datasets/language_ja.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_ja.md -------------------------------------------------------------------------------- /docs/datasets/language_jbo.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_jbo.md -------------------------------------------------------------------------------- /docs/datasets/language_jv.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_jv.md -------------------------------------------------------------------------------- /docs/datasets/language_ka.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_ka.md -------------------------------------------------------------------------------- /docs/datasets/language_kk.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_kk.md -------------------------------------------------------------------------------- /docs/datasets/language_km.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_km.md -------------------------------------------------------------------------------- /docs/datasets/language_kn.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_kn.md -------------------------------------------------------------------------------- /docs/datasets/language_ko.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_ko.md -------------------------------------------------------------------------------- /docs/datasets/language_krc.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_krc.md -------------------------------------------------------------------------------- /docs/datasets/language_ku.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_ku.md -------------------------------------------------------------------------------- /docs/datasets/language_kv.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_kv.md -------------------------------------------------------------------------------- /docs/datasets/language_kw.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_kw.md -------------------------------------------------------------------------------- /docs/datasets/language_ky.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_ky.md -------------------------------------------------------------------------------- /docs/datasets/language_la.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_la.md -------------------------------------------------------------------------------- /docs/datasets/language_lb.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_lb.md -------------------------------------------------------------------------------- /docs/datasets/language_lez.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_lez.md -------------------------------------------------------------------------------- /docs/datasets/language_li.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_li.md -------------------------------------------------------------------------------- /docs/datasets/language_lmo.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_lmo.md -------------------------------------------------------------------------------- /docs/datasets/language_lo.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_lo.md -------------------------------------------------------------------------------- /docs/datasets/language_lt.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_lt.md -------------------------------------------------------------------------------- /docs/datasets/language_lv.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_lv.md -------------------------------------------------------------------------------- /docs/datasets/language_mai.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_mai.md -------------------------------------------------------------------------------- /docs/datasets/language_mg.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_mg.md -------------------------------------------------------------------------------- /docs/datasets/language_mhr.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_mhr.md -------------------------------------------------------------------------------- /docs/datasets/language_min.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_min.md -------------------------------------------------------------------------------- /docs/datasets/language_mk.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_mk.md -------------------------------------------------------------------------------- /docs/datasets/language_ml.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_ml.md -------------------------------------------------------------------------------- /docs/datasets/language_mn.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_mn.md -------------------------------------------------------------------------------- /docs/datasets/language_mr.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_mr.md -------------------------------------------------------------------------------- /docs/datasets/language_mrj.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_mrj.md -------------------------------------------------------------------------------- /docs/datasets/language_ms.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_ms.md -------------------------------------------------------------------------------- /docs/datasets/language_mt.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_mt.md -------------------------------------------------------------------------------- /docs/datasets/language_multi.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_multi.md -------------------------------------------------------------------------------- /docs/datasets/language_mwl.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_mwl.md -------------------------------------------------------------------------------- /docs/datasets/language_my.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_my.md -------------------------------------------------------------------------------- /docs/datasets/language_mzn.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_mzn.md -------------------------------------------------------------------------------- /docs/datasets/language_nah.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_nah.md -------------------------------------------------------------------------------- /docs/datasets/language_nds.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_nds.md -------------------------------------------------------------------------------- /docs/datasets/language_ne.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_ne.md -------------------------------------------------------------------------------- /docs/datasets/language_new.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_new.md -------------------------------------------------------------------------------- /docs/datasets/language_nl.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_nl.md -------------------------------------------------------------------------------- /docs/datasets/language_nn.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_nn.md -------------------------------------------------------------------------------- /docs/datasets/language_no.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_no.md -------------------------------------------------------------------------------- /docs/datasets/language_ny.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_ny.md -------------------------------------------------------------------------------- /docs/datasets/language_oc.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_oc.md -------------------------------------------------------------------------------- /docs/datasets/language_om.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_om.md -------------------------------------------------------------------------------- /docs/datasets/language_or.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_or.md -------------------------------------------------------------------------------- /docs/datasets/language_os.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_os.md -------------------------------------------------------------------------------- /docs/datasets/language_pa.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_pa.md -------------------------------------------------------------------------------- /docs/datasets/language_pl.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_pl.md -------------------------------------------------------------------------------- /docs/datasets/language_pms.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_pms.md -------------------------------------------------------------------------------- /docs/datasets/language_pnb.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_pnb.md -------------------------------------------------------------------------------- /docs/datasets/language_ps.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_ps.md -------------------------------------------------------------------------------- /docs/datasets/language_pt.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_pt.md -------------------------------------------------------------------------------- /docs/datasets/language_qu.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_qu.md -------------------------------------------------------------------------------- /docs/datasets/language_ro.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_ro.md -------------------------------------------------------------------------------- /docs/datasets/language_ru.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_ru.md -------------------------------------------------------------------------------- /docs/datasets/language_rw.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_rw.md -------------------------------------------------------------------------------- /docs/datasets/language_sa.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_sa.md -------------------------------------------------------------------------------- /docs/datasets/language_sah.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_sah.md -------------------------------------------------------------------------------- /docs/datasets/language_sd.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_sd.md -------------------------------------------------------------------------------- /docs/datasets/language_sh.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_sh.md -------------------------------------------------------------------------------- /docs/datasets/language_si.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_si.md -------------------------------------------------------------------------------- /docs/datasets/language_sk.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_sk.md -------------------------------------------------------------------------------- /docs/datasets/language_sl.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_sl.md -------------------------------------------------------------------------------- /docs/datasets/language_sn.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_sn.md -------------------------------------------------------------------------------- /docs/datasets/language_so.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_so.md -------------------------------------------------------------------------------- /docs/datasets/language_sq.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_sq.md -------------------------------------------------------------------------------- /docs/datasets/language_sr.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_sr.md -------------------------------------------------------------------------------- /docs/datasets/language_st.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_st.md -------------------------------------------------------------------------------- /docs/datasets/language_su.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_su.md -------------------------------------------------------------------------------- /docs/datasets/language_sv.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_sv.md -------------------------------------------------------------------------------- /docs/datasets/language_sw.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_sw.md -------------------------------------------------------------------------------- /docs/datasets/language_ta.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_ta.md -------------------------------------------------------------------------------- /docs/datasets/language_te.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_te.md -------------------------------------------------------------------------------- /docs/datasets/language_tg.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_tg.md -------------------------------------------------------------------------------- /docs/datasets/language_th.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_th.md -------------------------------------------------------------------------------- /docs/datasets/language_ti.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_ti.md -------------------------------------------------------------------------------- /docs/datasets/language_tk.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_tk.md -------------------------------------------------------------------------------- /docs/datasets/language_tl.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_tl.md -------------------------------------------------------------------------------- /docs/datasets/language_tr.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_tr.md -------------------------------------------------------------------------------- /docs/datasets/language_tt.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_tt.md -------------------------------------------------------------------------------- /docs/datasets/language_ug.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_ug.md -------------------------------------------------------------------------------- /docs/datasets/language_uk.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_uk.md -------------------------------------------------------------------------------- /docs/datasets/language_ur.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_ur.md -------------------------------------------------------------------------------- /docs/datasets/language_uz.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_uz.md -------------------------------------------------------------------------------- /docs/datasets/language_vi.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_vi.md -------------------------------------------------------------------------------- /docs/datasets/language_vo.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_vo.md -------------------------------------------------------------------------------- /docs/datasets/language_wa.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_wa.md -------------------------------------------------------------------------------- /docs/datasets/language_war.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_war.md -------------------------------------------------------------------------------- /docs/datasets/language_wuu.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_wuu.md -------------------------------------------------------------------------------- /docs/datasets/language_x-eml.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_x-eml.md -------------------------------------------------------------------------------- /docs/datasets/language_xal.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_xal.md -------------------------------------------------------------------------------- /docs/datasets/language_xh.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_xh.md -------------------------------------------------------------------------------- /docs/datasets/language_xmf.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_xmf.md -------------------------------------------------------------------------------- /docs/datasets/language_yi.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_yi.md -------------------------------------------------------------------------------- /docs/datasets/language_yo.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_yo.md -------------------------------------------------------------------------------- /docs/datasets/language_zh.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_zh.md -------------------------------------------------------------------------------- /docs/datasets/language_zu.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/language_zu.md -------------------------------------------------------------------------------- /docs/datasets/tokens_by_language.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/tokens_by_language.png -------------------------------------------------------------------------------- /docs/datasets/tokens_by_source.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/datasets/tokens_by_source.png -------------------------------------------------------------------------------- /docs/extract-text-data.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/extract-text-data.md -------------------------------------------------------------------------------- /docs/getting-started.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/getting-started.md -------------------------------------------------------------------------------- /docs/images/A_colorful_parrot_sitting_on_a_pile_of_books__whit-removebg-preview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/images/A_colorful_parrot_sitting_on_a_pile_of_books__whit-removebg-preview.png -------------------------------------------------------------------------------- /docs/images/data-schema.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/images/data-schema.svg -------------------------------------------------------------------------------- /docs/images/favicon-16x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/images/favicon-16x16.png -------------------------------------------------------------------------------- /docs/images/favicon-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/images/favicon-32x32.png -------------------------------------------------------------------------------- /docs/images/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/images/favicon.ico -------------------------------------------------------------------------------- /docs/images/pipeline.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/images/pipeline.svg -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/index.md -------------------------------------------------------------------------------- /docs/integration-with-other-frameworks.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/integration-with-other-frameworks.md -------------------------------------------------------------------------------- /docs/overview.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/overview.md -------------------------------------------------------------------------------- /docs/related-work.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/docs/related-work.md -------------------------------------------------------------------------------- /examples/custom_datasets/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/examples/custom_datasets/README.md -------------------------------------------------------------------------------- /examples/custom_datasets/my_datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/custom_datasets/my_datasets/csv_example.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/examples/custom_datasets/my_datasets/csv_example.py -------------------------------------------------------------------------------- /examples/custom_datasets/my_datasets/dataset_registry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/examples/custom_datasets/my_datasets/dataset_registry.py -------------------------------------------------------------------------------- /examples/custom_datasets/my_datasets/pg19.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/examples/custom_datasets/my_datasets/pg19.py -------------------------------------------------------------------------------- /examples/lm_datasets_configs/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/lm_datasets_configs/italian_data.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/examples/lm_datasets_configs/italian_data.yaml -------------------------------------------------------------------------------- /examples/lm_datasets_configs/my_system.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/examples/lm_datasets_configs/my_system.yaml -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/mkdocs.yml -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/pyproject.toml -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/requirements.txt -------------------------------------------------------------------------------- /requirements/base.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/requirements/base.txt -------------------------------------------------------------------------------- /requirements/datasets.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/requirements/datasets.txt -------------------------------------------------------------------------------- /requirements/datatrove.txt: -------------------------------------------------------------------------------- 1 | datatrove[all]>=0.2.0 -------------------------------------------------------------------------------- /requirements/dev.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/requirements/dev.txt -------------------------------------------------------------------------------- /requirements/docs.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/requirements/docs.txt -------------------------------------------------------------------------------- /requirements/megatron.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/requirements/megatron.txt -------------------------------------------------------------------------------- /requirements/viewer.txt: -------------------------------------------------------------------------------- 1 | 2 | # dataset viewer 3 | streamlit 4 | ngrok -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/setup.py -------------------------------------------------------------------------------- /src/llm_datasets/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.3" 2 | -------------------------------------------------------------------------------- /src/llm_datasets/__main__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/__main__.py -------------------------------------------------------------------------------- /src/llm_datasets/chunkify_datasets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/chunkify_datasets.py -------------------------------------------------------------------------------- /src/llm_datasets/collect_metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/collect_metrics.py -------------------------------------------------------------------------------- /src/llm_datasets/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/commands/__init__.py -------------------------------------------------------------------------------- /src/llm_datasets/commands/chunkify_command.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/commands/chunkify_command.py -------------------------------------------------------------------------------- /src/llm_datasets/commands/collect_metrics_command.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/commands/collect_metrics_command.py -------------------------------------------------------------------------------- /src/llm_datasets/commands/compose_command.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/commands/compose_command.py -------------------------------------------------------------------------------- /src/llm_datasets/commands/convert_parquet_to_jsonl_command.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/commands/convert_parquet_to_jsonl_command.py -------------------------------------------------------------------------------- /src/llm_datasets/commands/exact_dedup_command.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/commands/exact_dedup_command.py -------------------------------------------------------------------------------- /src/llm_datasets/commands/extract_text_command.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/commands/extract_text_command.py -------------------------------------------------------------------------------- /src/llm_datasets/commands/hf_upload_command.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/commands/hf_upload_command.py -------------------------------------------------------------------------------- /src/llm_datasets/commands/print_stats_command.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/commands/print_stats_command.py -------------------------------------------------------------------------------- /src/llm_datasets/commands/render_docs_command.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/commands/render_docs_command.py -------------------------------------------------------------------------------- /src/llm_datasets/commands/shuffle_command.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/commands/shuffle_command.py -------------------------------------------------------------------------------- /src/llm_datasets/commands/train_tokenizer_command.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/commands/train_tokenizer_command.py -------------------------------------------------------------------------------- /src/llm_datasets/compose_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/compose_dataset.py -------------------------------------------------------------------------------- /src/llm_datasets/convert_parquet_to_jsonl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/convert_parquet_to_jsonl.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/llm_datasets/datasets/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/base.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/bg/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/llm_datasets/datasets/code/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/llm_datasets/datasets/code/starcoder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/code/starcoder.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/cs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/llm_datasets/datasets/cs/cs_en_parallel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/cs/cs_en_parallel.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/cs/syn_v9.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/cs/syn_v9.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/da/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/llm_datasets/datasets/da/danewsroom.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/da/danewsroom.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/da/danish_gigaword.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/da/danish_gigaword.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/da/danish_parliament_corpus.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/da/danish_parliament_corpus.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/da/dk_clarin.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/da/dk_clarin.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/dataset_registry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/dataset_registry.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/de/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/llm_datasets/datasets/de/de_laws.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/de/de_laws.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/de/dewac.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/de/dewac.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/de/openlegaldata.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/de/openlegaldata.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/el/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/llm_datasets/datasets/el/greek_legal_code.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/el/greek_legal_code.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/el/greek_web_corpus.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/el/greek_web_corpus.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/en/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/llm_datasets/datasets/en/dialogstudio.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/en/dialogstudio.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/en/edgar.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/en/edgar.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/en/math_amps.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/en/math_amps.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/en/pes2o.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/en/pes2o.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/en/pile_of_law.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/en/pile_of_law.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/en/proof_pile.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/en/proof_pile.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/en/wikihow.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/en/wikihow.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/es/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/llm_datasets/datasets/es/escorpius.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/es/escorpius.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/es/spanish_legal.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/es/spanish_legal.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/et/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/llm_datasets/datasets/et/ekspress.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/et/ekspress.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/et/enc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/et/enc.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/et/estonian_reference_corpus.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/et/estonian_reference_corpus.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/eu/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/llm_datasets/datasets/eu/euscrawl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/eu/euscrawl.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/fi/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/llm_datasets/datasets/fi/ylenews.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/fi/ylenews.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/fr/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/llm_datasets/datasets/fr/cabernet.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/fr/cabernet.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/fr/pleiasbooks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/fr/pleiasbooks.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/fr/pleiasnews.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/fr/pleiasnews.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/ga/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/llm_datasets/datasets/ga/ga_bilingual_legistation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/ga/ga_bilingual_legistation.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/ga/ga_universal_dependencies.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/ga/ga_universal_dependencies.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/hf_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/hf_dataset.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/hr/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/llm_datasets/datasets/hr/croatian_news_engri.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/hr/croatian_news_engri.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/hr/hrwac.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/hr/hrwac.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/hr/styria_news.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/hr/styria_news.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/it/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/llm_datasets/datasets/it/gazzetta_ufficiale.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/it/gazzetta_ufficiale.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/it/itwac.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/it/itwac.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/it/paisa.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/it/paisa.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/jsonl_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/jsonl_dataset.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/lt/seimas_lt_en.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/lt/seimas_lt_en.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/lv/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/llm_datasets/datasets/lv/state_related_latvian_web.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/lv/state_related_latvian_web.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/mt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/llm_datasets/datasets/mt/korpus_malti.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/mt/korpus_malti.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/multilingual/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/llm_datasets/datasets/multilingual/colossal_oscar.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/multilingual/colossal_oscar.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/multilingual/curlicat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/multilingual/curlicat.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/multilingual/eurlex.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/multilingual/eurlex.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/multilingual/legal_mc4.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/multilingual/legal_mc4.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/multilingual/macocu.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/multilingual/macocu.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/multilingual/redpajama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/multilingual/redpajama.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/multilingual/wikimedia.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/multilingual/wikimedia.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/multilingual/wura.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/multilingual/wura.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/nl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/llm_datasets/datasets/nl/sonar.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/nl/sonar.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/nl/sonar_new_media.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/nl/sonar_new_media.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/no/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/llm_datasets/datasets/no/maalfrid_2021.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/no/maalfrid_2021.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/no/nak.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/no/nak.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/no/nbdigital.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/no/nbdigital.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/no/norwegian_cc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/no/norwegian_cc.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/no/parlamint.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/no/parlamint.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/no/parliamentary_proceedings.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/no/parliamentary_proceedings.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/no/sakspapir_nno.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/no/sakspapir_nno.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/parquet_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/parquet_dataset.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/pl/luna_pl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/pl/luna_pl.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/pl/pl_nkjp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/pl/pl_nkjp.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/pl/pl_parliamentary_corpus.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/pl/pl_parliamentary_corpus.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/pt/brwac.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/pt/brwac.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/pt/parlamento_pt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/pt/parlamento_pt.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/ro/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/llm_datasets/datasets/ro/marcell_legislative_subcorpus_v2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/ro/marcell_legislative_subcorpus_v2.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/sk/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/llm_datasets/datasets/sk/sk_court_decisions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/sk/sk_court_decisions.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/sk/sk_laws.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/sk/sk_laws.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/sl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/llm_datasets/datasets/sl/academic_slovene_kas.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/sl/academic_slovene_kas.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/sl/cc_gigafida.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/sl/cc_gigafida.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/sl/slwac_web.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/sl/slwac_web.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/sr/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/llm_datasets/datasets/sr/srpkor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/sr/srpkor.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/sv/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/llm_datasets/datasets/sv/sv_gigaword.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/sv/sv_gigaword.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/uk/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/llm_datasets/datasets/uk/ubertext_2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/uk/ubertext_2.py -------------------------------------------------------------------------------- /src/llm_datasets/datasets/uk/uk_laws.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datasets/uk/uk_laws.py -------------------------------------------------------------------------------- /src/llm_datasets/datatrove_reader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/datatrove_reader.py -------------------------------------------------------------------------------- /src/llm_datasets/dedup/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/dedup/__init__.py -------------------------------------------------------------------------------- /src/llm_datasets/extract_text.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/extract_text.py -------------------------------------------------------------------------------- /src/llm_datasets/hf_tokenize_parquet_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/hf_tokenize_parquet_dataset.py -------------------------------------------------------------------------------- /src/llm_datasets/io/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/llm_datasets/io/conllu_file.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/io/conllu_file.py -------------------------------------------------------------------------------- /src/llm_datasets/io/parquet.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/io/parquet.py -------------------------------------------------------------------------------- /src/llm_datasets/io/prevert_file.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/io/prevert_file.py -------------------------------------------------------------------------------- /src/llm_datasets/megatron_tokenize_parquet_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/megatron_tokenize_parquet_dataset.py -------------------------------------------------------------------------------- /src/llm_datasets/print_stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/print_stats.py -------------------------------------------------------------------------------- /src/llm_datasets/shuffle_datasets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/shuffle_datasets.py -------------------------------------------------------------------------------- /src/llm_datasets/train_sp_tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/train_sp_tokenizer.py -------------------------------------------------------------------------------- /src/llm_datasets/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/utils/__init__.py -------------------------------------------------------------------------------- /src/llm_datasets/utils/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/utils/config.py -------------------------------------------------------------------------------- /src/llm_datasets/utils/dataframe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/utils/dataframe.py -------------------------------------------------------------------------------- /src/llm_datasets/utils/dataset_generator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/utils/dataset_generator.py -------------------------------------------------------------------------------- /src/llm_datasets/utils/docs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/utils/docs/__init__.py -------------------------------------------------------------------------------- /src/llm_datasets/utils/docs/plots.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/utils/docs/plots.py -------------------------------------------------------------------------------- /src/llm_datasets/utils/docs/tables.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/utils/docs/tables.py -------------------------------------------------------------------------------- /src/llm_datasets/utils/flatmap.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/utils/flatmap.py -------------------------------------------------------------------------------- /src/llm_datasets/utils/languages.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/utils/languages.py -------------------------------------------------------------------------------- /src/llm_datasets/utils/settings.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/utils/settings.py -------------------------------------------------------------------------------- /src/llm_datasets/utils/shuffle_big_file.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/utils/shuffle_big_file.py -------------------------------------------------------------------------------- /src/llm_datasets/utils/systems.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/utils/systems.py -------------------------------------------------------------------------------- /src/llm_datasets/utils/wikimedia.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/utils/wikimedia.py -------------------------------------------------------------------------------- /src/llm_datasets/viewer/app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/viewer/app.py -------------------------------------------------------------------------------- /src/llm_datasets/viewer/ngrok-app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/viewer/ngrok-app.py -------------------------------------------------------------------------------- /src/llm_datasets/viewer/viewer_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/src/llm_datasets/viewer/viewer_utils.py -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/tests/conftest.py -------------------------------------------------------------------------------- /tests/dummy_datasets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/tests/dummy_datasets.py -------------------------------------------------------------------------------- /tests/fixtures/configs/dummy_config.yml: -------------------------------------------------------------------------------- 1 | # this is a dummy config for testing 2 | seed: 42 -------------------------------------------------------------------------------- /tests/fixtures/oscar_2301_texts_and_hashes.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/tests/fixtures/oscar_2301_texts_and_hashes.json -------------------------------------------------------------------------------- /tests/test_compose_dataset_benchmark.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/tests/test_compose_dataset_benchmark.py -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/tests/test_config.py -------------------------------------------------------------------------------- /tests/test_datatrove_reader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/tests/test_datatrove_reader.py -------------------------------------------------------------------------------- /tests/test_document_datasets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/tests/test_document_datasets.py -------------------------------------------------------------------------------- /tests/test_generate_texts_from_output.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/tests/test_generate_texts_from_output.py -------------------------------------------------------------------------------- /tests/test_interleave_datasets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/tests/test_interleave_datasets.py -------------------------------------------------------------------------------- /tests/test_iterate_over_shuffled_datasets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/tests/test_iterate_over_shuffled_datasets.py -------------------------------------------------------------------------------- /tests/test_split_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/tests/test_split_dataset.py -------------------------------------------------------------------------------- /tests/test_tlsh_hashes.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/tests/test_tlsh_hashes.py -------------------------------------------------------------------------------- /tests/test_write_parquet.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/tests/test_write_parquet.py -------------------------------------------------------------------------------- /tests/test_write_parquet_chunks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/llm-datasets/HEAD/tests/test_write_parquet_chunks.py --------------------------------------------------------------------------------