├── .env.template ├── .gitignore ├── LICENSE ├── README.md ├── constants ├── continent_country_iso.json ├── creator_groups.json ├── creator_groups_by_country.json ├── custom_license_classes.json ├── data_formats.json ├── domain_groups.json ├── domain_types.json ├── language_families.json ├── language_groups.json ├── license_classes.json ├── license_paraphrases.json ├── model_groups.json ├── source_name_mapper.json ├── task_groups.json └── topic_groups.json ├── data_summaries-speech ├── 1111 Hours Hindi ASR Challenge.json ├── 120h Spanish Speech.json ├── AFRISPEECH-200.json ├── AISHELL-1.json ├── AISHELL-2.json ├── AISHELL-4.json ├── ALLSSTAR.json ├── AMI.json ├── Aalto Finnish Parliament.json ├── African Accented French.json ├── Basque, Catalan and Galician.json ├── Bloom Speech.json ├── Bud500.json ├── CSJ.json ├── CSLU 22 Languages Corpus.json ├── CSLU Foreign Accented English Release 1.2.json ├── ClarinPL.json ├── CoNASE.json ├── CoVoST-2.json ├── Common Voice Corpus (17.0).json ├── Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali.json ├── Czech Parliament.json ├── DiDiSpeech.json ├── Earnings-22.json ├── EdAcc.json ├── English Accents in the British Isles.json ├── English-Vietnamese.json ├── FT SPEECH.json ├── Fisher.json ├── Fleurs.json ├── GigaSpeech.json ├── Golos.json ├── Hebrew Speech Coursera.json ├── Hebrew Speech Kan.json ├── Highland Puebla Nahuatl.json ├── JTUBESPEECH.json ├── Japanese Anime Speech.json ├── KSC.json ├── Kathbath.json ├── KeSpeech.json ├── KsponSpeech.json ├── LJSpeech.json ├── LaboroTVSpeech.json ├── LibriSpeech.json ├── M-AILABS.json ├── M2ASR-Kazakh.json ├── M2ASR-Kirghiz.json ├── M2ASR-Mongo.json ├── M2ASR-Tibetan.json ├── MAGICDATA.json ├── MASC.json ├── MaSS.json ├── MagicData-RAMC.json ├── MediaSpeech.json ├── Minds14.json ├── MuST-C.json ├── Multilingual LibriSpeech.json ├── Multilingual TEDx.json ├── NPSC.json ├── NST Danish.json ├── NST Norwegian.json ├── NST Swedish.json ├── Nigerian English.json ├── Norwegian Parliamentary.json ├── OLKAVS.json ├── OpenSTT.json ├── People's Speech.json ├── QASR.json ├── RTVE.json ├── ReazonSpeech.json ├── Regional African American Language.json ├── RixVox.json ├── SDS-200.json ├── SPGISpeech.json ├── Samrómur Children.json ├── Samrómur Milljón.json ├── Samrómur.json ├── Shrutilipi.json ├── Snow Mountain.json ├── Spoken Wikipedia.json ├── Switchboard.json ├── TED-LIUM3.json ├── THCHS-30.json ├── THUYG-20.json ├── TIMIT.json ├── VCTK.json ├── VibraVox.json ├── VoxPopuli.json ├── Vystadial.json ├── WenetSpeech.json ├── West African Radio Corpus.json ├── West African Virtual Assistant Speech Recognition Corpus.json ├── YODAS.json ├── Zeroth-Korean.json └── aidatatang.json ├── data_summaries-video ├── 100doh.json ├── 20BN-jester.json ├── 20bn-something.json ├── 50salads.json ├── WebVid.json ├── YT-Temporal-180m.json ├── YT-Temporal-1B.json ├── activitynet.json ├── apes.json ├── ava-dataset.json ├── ava.json ├── breakfast.json ├── cacd.json ├── casualconv.json ├── charades-ego.json ├── charades.json ├── cinepile.json ├── coin-dataset.json ├── collective.json ├── condensed-movies.json ├── crosstask.json ├── davis.json ├── didemo.json ├── disney-videogeneration-dataset.json ├── eev-dataset.json ├── ego-4d.json ├── ego-exo4D.json ├── egopet.json ├── egoschema.json ├── epic-kitchenes.json ├── ferv39k-dataset.json ├── finegym-dataset.json ├── finevideo.json ├── haa500-dataset.json ├── hacs-dataset.json ├── hd-vila-100m.json ├── hmdb-dataset.json ├── hollywood-extended.json ├── hollywood2-dataset.json ├── homage.json ├── how2.json ├── howto100m.json ├── hyu-vids.json ├── imagenet-vid.json ├── kinetics-400.json ├── kinetics-600.json ├── kinetics-700.json ├── lemma-dataset.json ├── lsmdc-ordering.json ├── lsmdc.json ├── mad.json ├── mars.json ├── mimetics-dataset.json ├── mmact.json ├── moments-in-time-dataset.json ├── movie-net.json ├── moviegraphs.json ├── movieqa.json ├── moviescenes.json ├── mpii-cooking.json ├── mpii-cooking2.json ├── mpii-md.json ├── msa.json ├── msr-vtt.json ├── multi-moments-in-time-dataset.json ├── multi-thumos-challenge.json ├── mvbench.json ├── narrated-instruction-vids.json ├── ntu-rgbd.json ├── omnisource-web-dataset.json ├── oops-dataset.json ├── openvid-1M.json ├── pku-mmd-dataset.json ├── project-aria-dataset.json ├── project-aria-digital-twin-dataset.json ├── qfvs.json ├── queryd.json ├── rare-act-dataset.json ├── sharegpt4video.json ├── soa-dataset.json ├── spoken-moments.json ├── sports1M-dataset.json ├── stroygraphs.json ├── summe.json ├── tgif.json ├── thumos-challenge.json ├── tiny-virat.json ├── titan.json ├── toyota-smarthome.json ├── trecvid.json ├── tvsum.json ├── uav-human.json ├── ucf101-dataset.json ├── vatex.json ├── videolt-dataset.json ├── videostory.json ├── vidprom.json ├── violin.json ├── vlog-vids.json ├── volleyball-vids.json ├── voxceleb.json ├── vtw.json ├── youcook-2.json ├── youcook.json └── youtube-8m.json ├── data_summaries ├── 10k Prompt Ranked.json ├── AgentInstruct.json ├── Airoboros.json ├── Alpaca.json ├── Anthropic HH-RLHF.json ├── Aya Dataset.json ├── Bactrian-X.json ├── Baize Chat Data.json ├── Book Summaries.json ├── COIG-CQIA.json ├── COIG.json ├── Camel-AI Science.json ├── Capybara.json ├── ChatDoctor.json ├── ChatbotArena.json ├── Cidar.json ├── CoT Collection.json ├── Cobra Frames.json ├── Code Alpaca.json ├── CollectiveCognition.json ├── CommitPackFT.json ├── Conifer.json ├── Deita 10K.json ├── DialogStudio.json ├── Dolly 15k.json ├── Dynosaur.json ├── EverythingLM.json ├── ExpertQA.json ├── Feedback Collection.json ├── Flan Collection (Chain-of-Thought).json ├── Flan Collection (Dialog).json ├── Flan Collection (Flan 2021).json ├── Flan Collection (P3).json ├── Flan Collection (Super-NaturalInstructions).json ├── GPT-4-Alpaca.json ├── GPTeacher.json ├── Glaive Code Assistant v2.json ├── Glaive Code Assistant v3.json ├── Glaive Code Assistant.json ├── Gorilla.json ├── Gretel Text-to-SQL.json ├── HC3 (Chinese).json ├── HC3 (English).json ├── HelpSteer.json ├── Indic-Instruct.json ├── InstAr.json ├── Joke Explanation.json ├── KIWI.json ├── LIMA.json ├── Llama2-MedTuned-Instructions.json ├── LongAlign-10k.json ├── Longform.json ├── Lumos Grounding.json ├── Lumos Planning.json ├── Magpie-Pro.json ├── MathDial.json ├── MathInstruct.json ├── MedInstruct.json ├── Medical Meadow.json ├── MegaWika.json ├── MetaMathQA.json ├── Nectar.json ├── No Robots.json ├── NomicAI GPT4AllJ.json ├── OIG.json ├── Open Assistant OctoPack.json ├── Open Assistant v2.json ├── Open Assistant.json ├── Open Orca.json ├── Open-Platypus.json ├── OpenAI (Summarize from Feedback).json ├── OpenAI (WebGPT).json ├── OpenGPT Healthcare.json ├── OpenMathInstruct-1.json ├── Orca-Math.json ├── PII-Masking-200k.json ├── PII-masking-200k.json ├── PMC-LLaMA Instructions.json ├── Preference Collection.json ├── Pure-Dove.json ├── PygmalionAI-PIPPA.json ├── Reasoning.json ├── RiddleSense.json ├── SeaBench.json ├── Seacrowd.json ├── SelFee.json ├── Self-Instruct.json ├── ShareGPT Vicuna.json ├── Stack Exchange Instruction.json ├── Stanford Human Preferences.json ├── StarCoder Self-Instruct.json ├── Synthetic-GSM8K-Reflection.json ├── Tasksource Instruct.json ├── Tasksource Symbol-Tuning.json ├── Thai Gen AI (Alpaca).json ├── Thai Gen AI (Dolly).json ├── Thai Gen AI (GPTeacher).json ├── Tiny Stories.json ├── Tool-Llama.json ├── ToxicChat.json ├── UltraChat.json ├── UltraChat_200k.json ├── UltraFeedback Argilla.json ├── Unnatural Instructions.json ├── WildChat.json ├── WizardLM Evol-Instruct V2.json ├── WizardLM Evol-Instruct.json ├── _template.json ├── _template_spec.yaml ├── lmsys_chat_1m.json └── xP3x.json ├── dpi-plots └── video │ ├── video_creatorcategories-years.png │ ├── video_sourcecategories-cumulativehours.png │ ├── video_sourcecategories-years.png │ ├── video_sourcecategories-yearscombined.png │ ├── video_sources-licenses.png │ └── video_taskcategories-years.png ├── dpi-plotsmultimodal ├── creator_categories_by_modality.png ├── dataset_count_by_continent_and_modality.png ├── dataset_count_by_country_and_modality.png ├── license_use_by_modality_collections.png ├── multimodal-combined_chart.png ├── source_categories_by_modality-aggregated.png └── source_categories_by_modality.png ├── dpi.png ├── requirements.txt └── src ├── __init__.py ├── analysis ├── README.md ├── agents_table.py ├── aggregate.py ├── analysis_constants.py ├── analysis_util.py ├── corpus_robots_trends.ipynb ├── data │ ├── agents_counter │ │ └── all_agents_counter.csv │ ├── multimodal_terms_data │ │ ├── speech.csv │ │ ├── text.csv │ │ └── video.csv │ ├── pretrain_data │ │ ├── corpus_token_bucket_counts │ │ │ ├── c4_buckets.csv │ │ │ ├── dolma_buckets.csv │ │ │ └── rf_buckets.csv │ │ └── relevant_url_token_counts.csv │ └── speech_supporting_data │ │ ├── bloomspeech_splithours.csv │ │ ├── commonvoice_splithours.json │ │ ├── fleurs_splithours.csv │ │ ├── languages.csv │ │ ├── multilinguallibrispeech_splithours.csv │ │ └── yodas_splithours.csv ├── market_analysis.ipynb ├── multimodal_analysis.ipynb ├── multimodal_data_aggregator.ipynb ├── multimodal_util.py ├── paywall_domain_analysis.ipynb ├── prompt_analysis.ipynb ├── prompt_domain_analysis.py ├── robots_analysis-tables-confusion-matrices-will.ipynb ├── robots_analysis.ipynb ├── robots_analysis_p2.ipynb ├── speech_analysis.ipynb ├── text_ft_plots.ipynb ├── video_analysis.ipynb └── visualization_util.py ├── collection_mapper.py ├── configs ├── README.md ├── commercial_licenses.yaml ├── commercial_licenses_and_terms.yaml ├── commercial_or_unspecified_licenses.yaml ├── commercial_or_unspecified_licenses_and_terms.yaml ├── common_pile_datasets.txt ├── common_pile_ultra_permissive.yaml └── default.yaml ├── data_bibtex.py ├── data_provenance_card.py ├── download_and_filter.py ├── downloader.py ├── downloaders.py ├── helpers ├── __init__.py ├── constants.py ├── filters.py └── io.py ├── preparers.py ├── scripts ├── README.md ├── annotate_text_stats.py └── infer_metadata.py ├── summary-tables ├── README.md ├── collections-audio.ipynb ├── collections-text-v1paper.ipynb ├── collections-text.ipynb ├── collections-video.ipynb ├── emoji │ ├── CommercialDataCircle.pdf │ ├── NCDataCircle.pdf │ ├── UnspecifiedDataCircle.pdf │ ├── globe-with-meridians.pdf │ ├── greencheck.pdf │ ├── redcross.pdf │ └── robot.pdf ├── hf_downloads.csv ├── papers.csv ├── refs-licenses.ipynb └── utils.py ├── test_new_collection.py └── web_analysis ├── README.md ├── data ├── IP2LOCATION-LITE-DB1.IPV6.BIN ├── _top_2000_c4_token_and_urlcounts - top_2000_c4_token_and_urlcounts.csv ├── failed_responses.json ├── gpt-response-cache.json ├── gpt-response-failed.json └── prompt_templates.json ├── downloading_web ├── README.md ├── c4_hf_streaming.py ├── dolma_download.py └── refinedweb_download.py ├── extract_robots.py ├── forecasting_util.py ├── gpt.py ├── gpt_tos_analysis.py ├── parse_robots.py ├── requirements_gpt_tos_analysis.txt ├── requirements_website_geolocation.txt ├── robots_util.py ├── test_robots.py ├── wayback_extraction ├── __init__.py ├── file_utils.py ├── requirements_wayback.txt ├── temporal_pipeline.py └── wayback_cdx.py └── website_geolocation.py /.env.template: -------------------------------------------------------------------------------- 1 | HF_TOKEN=your_hugging_face_token -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/.gitignore -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/README.md -------------------------------------------------------------------------------- /constants/continent_country_iso.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/constants/continent_country_iso.json -------------------------------------------------------------------------------- /constants/creator_groups.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/constants/creator_groups.json -------------------------------------------------------------------------------- /constants/creator_groups_by_country.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/constants/creator_groups_by_country.json -------------------------------------------------------------------------------- /constants/custom_license_classes.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/constants/custom_license_classes.json -------------------------------------------------------------------------------- /constants/data_formats.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/constants/data_formats.json -------------------------------------------------------------------------------- /constants/domain_groups.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/constants/domain_groups.json -------------------------------------------------------------------------------- /constants/domain_types.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/constants/domain_types.json -------------------------------------------------------------------------------- /constants/language_families.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/constants/language_families.json -------------------------------------------------------------------------------- /constants/language_groups.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/constants/language_groups.json -------------------------------------------------------------------------------- /constants/license_classes.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/constants/license_classes.json -------------------------------------------------------------------------------- /constants/license_paraphrases.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/constants/license_paraphrases.json -------------------------------------------------------------------------------- /constants/model_groups.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/constants/model_groups.json -------------------------------------------------------------------------------- /constants/source_name_mapper.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/constants/source_name_mapper.json -------------------------------------------------------------------------------- /constants/task_groups.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/constants/task_groups.json -------------------------------------------------------------------------------- /constants/topic_groups.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/constants/topic_groups.json -------------------------------------------------------------------------------- /data_summaries-speech/1111 Hours Hindi ASR Challenge.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/1111 Hours Hindi ASR Challenge.json -------------------------------------------------------------------------------- /data_summaries-speech/120h Spanish Speech.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/120h Spanish Speech.json -------------------------------------------------------------------------------- /data_summaries-speech/AFRISPEECH-200.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/AFRISPEECH-200.json -------------------------------------------------------------------------------- /data_summaries-speech/AISHELL-1.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/AISHELL-1.json -------------------------------------------------------------------------------- /data_summaries-speech/AISHELL-2.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/AISHELL-2.json -------------------------------------------------------------------------------- /data_summaries-speech/AISHELL-4.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/AISHELL-4.json -------------------------------------------------------------------------------- /data_summaries-speech/ALLSSTAR.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/ALLSSTAR.json -------------------------------------------------------------------------------- /data_summaries-speech/AMI.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/AMI.json -------------------------------------------------------------------------------- /data_summaries-speech/Aalto Finnish Parliament.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/Aalto Finnish Parliament.json -------------------------------------------------------------------------------- /data_summaries-speech/African Accented French.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/African Accented French.json -------------------------------------------------------------------------------- /data_summaries-speech/Basque, Catalan and Galician.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/Basque, Catalan and Galician.json -------------------------------------------------------------------------------- /data_summaries-speech/Bloom Speech.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/Bloom Speech.json -------------------------------------------------------------------------------- /data_summaries-speech/Bud500.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/Bud500.json -------------------------------------------------------------------------------- /data_summaries-speech/CSJ.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/CSJ.json -------------------------------------------------------------------------------- /data_summaries-speech/CSLU 22 Languages Corpus.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/CSLU 22 Languages Corpus.json -------------------------------------------------------------------------------- /data_summaries-speech/CSLU Foreign Accented English Release 1.2.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/CSLU Foreign Accented English Release 1.2.json -------------------------------------------------------------------------------- /data_summaries-speech/ClarinPL.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/ClarinPL.json -------------------------------------------------------------------------------- /data_summaries-speech/CoNASE.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/CoNASE.json -------------------------------------------------------------------------------- /data_summaries-speech/CoVoST-2.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/CoVoST-2.json -------------------------------------------------------------------------------- /data_summaries-speech/Common Voice Corpus (17.0).json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/Common Voice Corpus (17.0).json -------------------------------------------------------------------------------- /data_summaries-speech/Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali.json -------------------------------------------------------------------------------- /data_summaries-speech/Czech Parliament.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/Czech Parliament.json -------------------------------------------------------------------------------- /data_summaries-speech/DiDiSpeech.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/DiDiSpeech.json -------------------------------------------------------------------------------- /data_summaries-speech/Earnings-22.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/Earnings-22.json -------------------------------------------------------------------------------- /data_summaries-speech/EdAcc.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/EdAcc.json -------------------------------------------------------------------------------- /data_summaries-speech/English Accents in the British Isles.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/English Accents in the British Isles.json -------------------------------------------------------------------------------- /data_summaries-speech/English-Vietnamese.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/English-Vietnamese.json -------------------------------------------------------------------------------- /data_summaries-speech/FT SPEECH.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/FT SPEECH.json -------------------------------------------------------------------------------- /data_summaries-speech/Fisher.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/Fisher.json -------------------------------------------------------------------------------- /data_summaries-speech/Fleurs.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/Fleurs.json -------------------------------------------------------------------------------- /data_summaries-speech/GigaSpeech.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/GigaSpeech.json -------------------------------------------------------------------------------- /data_summaries-speech/Golos.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/Golos.json -------------------------------------------------------------------------------- /data_summaries-speech/Hebrew Speech Coursera.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/Hebrew Speech Coursera.json -------------------------------------------------------------------------------- /data_summaries-speech/Hebrew Speech Kan.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/Hebrew Speech Kan.json -------------------------------------------------------------------------------- /data_summaries-speech/Highland Puebla Nahuatl.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/Highland Puebla Nahuatl.json -------------------------------------------------------------------------------- /data_summaries-speech/JTUBESPEECH.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/JTUBESPEECH.json -------------------------------------------------------------------------------- /data_summaries-speech/Japanese Anime Speech.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/Japanese Anime Speech.json -------------------------------------------------------------------------------- /data_summaries-speech/KSC.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/KSC.json -------------------------------------------------------------------------------- /data_summaries-speech/Kathbath.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/Kathbath.json -------------------------------------------------------------------------------- /data_summaries-speech/KeSpeech.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/KeSpeech.json -------------------------------------------------------------------------------- /data_summaries-speech/KsponSpeech.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/KsponSpeech.json -------------------------------------------------------------------------------- /data_summaries-speech/LJSpeech.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/LJSpeech.json -------------------------------------------------------------------------------- /data_summaries-speech/LaboroTVSpeech.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/LaboroTVSpeech.json -------------------------------------------------------------------------------- /data_summaries-speech/LibriSpeech.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/LibriSpeech.json -------------------------------------------------------------------------------- /data_summaries-speech/M-AILABS.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/M-AILABS.json -------------------------------------------------------------------------------- /data_summaries-speech/M2ASR-Kazakh.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/M2ASR-Kazakh.json -------------------------------------------------------------------------------- /data_summaries-speech/M2ASR-Kirghiz.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/M2ASR-Kirghiz.json -------------------------------------------------------------------------------- /data_summaries-speech/M2ASR-Mongo.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/M2ASR-Mongo.json -------------------------------------------------------------------------------- /data_summaries-speech/M2ASR-Tibetan.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/M2ASR-Tibetan.json -------------------------------------------------------------------------------- /data_summaries-speech/MAGICDATA.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/MAGICDATA.json -------------------------------------------------------------------------------- /data_summaries-speech/MASC.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/MASC.json -------------------------------------------------------------------------------- /data_summaries-speech/MaSS.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/MaSS.json -------------------------------------------------------------------------------- /data_summaries-speech/MagicData-RAMC.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/MagicData-RAMC.json -------------------------------------------------------------------------------- /data_summaries-speech/MediaSpeech.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/MediaSpeech.json -------------------------------------------------------------------------------- /data_summaries-speech/Minds14.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/Minds14.json -------------------------------------------------------------------------------- /data_summaries-speech/MuST-C.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/MuST-C.json -------------------------------------------------------------------------------- /data_summaries-speech/Multilingual LibriSpeech.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/Multilingual LibriSpeech.json -------------------------------------------------------------------------------- /data_summaries-speech/Multilingual TEDx.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/Multilingual TEDx.json -------------------------------------------------------------------------------- /data_summaries-speech/NPSC.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/NPSC.json -------------------------------------------------------------------------------- /data_summaries-speech/NST Danish.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/NST Danish.json -------------------------------------------------------------------------------- /data_summaries-speech/NST Norwegian.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/NST Norwegian.json -------------------------------------------------------------------------------- /data_summaries-speech/NST Swedish.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/NST Swedish.json -------------------------------------------------------------------------------- /data_summaries-speech/Nigerian English.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/Nigerian English.json -------------------------------------------------------------------------------- /data_summaries-speech/Norwegian Parliamentary.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/Norwegian Parliamentary.json -------------------------------------------------------------------------------- /data_summaries-speech/OLKAVS.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/OLKAVS.json -------------------------------------------------------------------------------- /data_summaries-speech/OpenSTT.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/OpenSTT.json -------------------------------------------------------------------------------- /data_summaries-speech/People's Speech.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/People's Speech.json -------------------------------------------------------------------------------- /data_summaries-speech/QASR.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/QASR.json -------------------------------------------------------------------------------- /data_summaries-speech/RTVE.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/RTVE.json -------------------------------------------------------------------------------- /data_summaries-speech/ReazonSpeech.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/ReazonSpeech.json -------------------------------------------------------------------------------- /data_summaries-speech/Regional African American Language.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/Regional African American Language.json -------------------------------------------------------------------------------- /data_summaries-speech/RixVox.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/RixVox.json -------------------------------------------------------------------------------- /data_summaries-speech/SDS-200.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/SDS-200.json -------------------------------------------------------------------------------- /data_summaries-speech/SPGISpeech.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/SPGISpeech.json -------------------------------------------------------------------------------- /data_summaries-speech/Samrómur Children.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/Samrómur Children.json -------------------------------------------------------------------------------- /data_summaries-speech/Samrómur Milljón.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/Samrómur Milljón.json -------------------------------------------------------------------------------- /data_summaries-speech/Samrómur.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/Samrómur.json -------------------------------------------------------------------------------- /data_summaries-speech/Shrutilipi.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/Shrutilipi.json -------------------------------------------------------------------------------- /data_summaries-speech/Snow Mountain.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/Snow Mountain.json -------------------------------------------------------------------------------- /data_summaries-speech/Spoken Wikipedia.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/Spoken Wikipedia.json -------------------------------------------------------------------------------- /data_summaries-speech/Switchboard.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/Switchboard.json -------------------------------------------------------------------------------- /data_summaries-speech/TED-LIUM3.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/TED-LIUM3.json -------------------------------------------------------------------------------- /data_summaries-speech/THCHS-30.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/THCHS-30.json -------------------------------------------------------------------------------- /data_summaries-speech/THUYG-20.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/THUYG-20.json -------------------------------------------------------------------------------- /data_summaries-speech/TIMIT.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/TIMIT.json -------------------------------------------------------------------------------- /data_summaries-speech/VCTK.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/VCTK.json -------------------------------------------------------------------------------- /data_summaries-speech/VibraVox.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/VibraVox.json -------------------------------------------------------------------------------- /data_summaries-speech/VoxPopuli.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/VoxPopuli.json -------------------------------------------------------------------------------- /data_summaries-speech/Vystadial.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/Vystadial.json -------------------------------------------------------------------------------- /data_summaries-speech/WenetSpeech.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/WenetSpeech.json -------------------------------------------------------------------------------- /data_summaries-speech/West African Radio Corpus.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/West African Radio Corpus.json -------------------------------------------------------------------------------- /data_summaries-speech/West African Virtual Assistant Speech Recognition Corpus.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/West African Virtual Assistant Speech Recognition Corpus.json -------------------------------------------------------------------------------- /data_summaries-speech/YODAS.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/YODAS.json -------------------------------------------------------------------------------- /data_summaries-speech/Zeroth-Korean.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/Zeroth-Korean.json -------------------------------------------------------------------------------- /data_summaries-speech/aidatatang.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-speech/aidatatang.json -------------------------------------------------------------------------------- /data_summaries-video/100doh.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/100doh.json -------------------------------------------------------------------------------- /data_summaries-video/20BN-jester.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/20BN-jester.json -------------------------------------------------------------------------------- /data_summaries-video/20bn-something.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/20bn-something.json -------------------------------------------------------------------------------- /data_summaries-video/50salads.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/50salads.json -------------------------------------------------------------------------------- /data_summaries-video/WebVid.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/WebVid.json -------------------------------------------------------------------------------- /data_summaries-video/YT-Temporal-180m.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/YT-Temporal-180m.json -------------------------------------------------------------------------------- /data_summaries-video/YT-Temporal-1B.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/YT-Temporal-1B.json -------------------------------------------------------------------------------- /data_summaries-video/activitynet.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/activitynet.json -------------------------------------------------------------------------------- /data_summaries-video/apes.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/apes.json -------------------------------------------------------------------------------- /data_summaries-video/ava-dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/ava-dataset.json -------------------------------------------------------------------------------- /data_summaries-video/ava.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/ava.json -------------------------------------------------------------------------------- /data_summaries-video/breakfast.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/breakfast.json -------------------------------------------------------------------------------- /data_summaries-video/cacd.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/cacd.json -------------------------------------------------------------------------------- /data_summaries-video/casualconv.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/casualconv.json -------------------------------------------------------------------------------- /data_summaries-video/charades-ego.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/charades-ego.json -------------------------------------------------------------------------------- /data_summaries-video/charades.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/charades.json -------------------------------------------------------------------------------- /data_summaries-video/cinepile.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/cinepile.json -------------------------------------------------------------------------------- /data_summaries-video/coin-dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/coin-dataset.json -------------------------------------------------------------------------------- /data_summaries-video/collective.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/collective.json -------------------------------------------------------------------------------- /data_summaries-video/condensed-movies.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/condensed-movies.json -------------------------------------------------------------------------------- /data_summaries-video/crosstask.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/crosstask.json -------------------------------------------------------------------------------- /data_summaries-video/davis.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/davis.json -------------------------------------------------------------------------------- /data_summaries-video/didemo.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/didemo.json -------------------------------------------------------------------------------- /data_summaries-video/disney-videogeneration-dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/disney-videogeneration-dataset.json -------------------------------------------------------------------------------- /data_summaries-video/eev-dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/eev-dataset.json -------------------------------------------------------------------------------- /data_summaries-video/ego-4d.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/ego-4d.json -------------------------------------------------------------------------------- /data_summaries-video/ego-exo4D.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/ego-exo4D.json -------------------------------------------------------------------------------- /data_summaries-video/egopet.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/egopet.json -------------------------------------------------------------------------------- /data_summaries-video/egoschema.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/egoschema.json -------------------------------------------------------------------------------- /data_summaries-video/epic-kitchenes.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/epic-kitchenes.json -------------------------------------------------------------------------------- /data_summaries-video/ferv39k-dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/ferv39k-dataset.json -------------------------------------------------------------------------------- /data_summaries-video/finegym-dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/finegym-dataset.json -------------------------------------------------------------------------------- /data_summaries-video/finevideo.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/finevideo.json -------------------------------------------------------------------------------- /data_summaries-video/haa500-dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/haa500-dataset.json -------------------------------------------------------------------------------- /data_summaries-video/hacs-dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/hacs-dataset.json -------------------------------------------------------------------------------- /data_summaries-video/hd-vila-100m.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/hd-vila-100m.json -------------------------------------------------------------------------------- /data_summaries-video/hmdb-dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/hmdb-dataset.json -------------------------------------------------------------------------------- /data_summaries-video/hollywood-extended.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/hollywood-extended.json -------------------------------------------------------------------------------- /data_summaries-video/hollywood2-dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/hollywood2-dataset.json -------------------------------------------------------------------------------- /data_summaries-video/homage.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/homage.json -------------------------------------------------------------------------------- /data_summaries-video/how2.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/how2.json -------------------------------------------------------------------------------- /data_summaries-video/howto100m.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/howto100m.json -------------------------------------------------------------------------------- /data_summaries-video/hyu-vids.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/hyu-vids.json -------------------------------------------------------------------------------- /data_summaries-video/imagenet-vid.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/imagenet-vid.json -------------------------------------------------------------------------------- /data_summaries-video/kinetics-400.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/kinetics-400.json -------------------------------------------------------------------------------- /data_summaries-video/kinetics-600.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/kinetics-600.json -------------------------------------------------------------------------------- /data_summaries-video/kinetics-700.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/kinetics-700.json -------------------------------------------------------------------------------- /data_summaries-video/lemma-dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/lemma-dataset.json -------------------------------------------------------------------------------- /data_summaries-video/lsmdc-ordering.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/lsmdc-ordering.json -------------------------------------------------------------------------------- /data_summaries-video/lsmdc.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/lsmdc.json -------------------------------------------------------------------------------- /data_summaries-video/mad.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/mad.json -------------------------------------------------------------------------------- /data_summaries-video/mars.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/mars.json -------------------------------------------------------------------------------- /data_summaries-video/mimetics-dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/mimetics-dataset.json -------------------------------------------------------------------------------- /data_summaries-video/mmact.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/mmact.json -------------------------------------------------------------------------------- /data_summaries-video/moments-in-time-dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/moments-in-time-dataset.json -------------------------------------------------------------------------------- /data_summaries-video/movie-net.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/movie-net.json -------------------------------------------------------------------------------- /data_summaries-video/moviegraphs.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/moviegraphs.json -------------------------------------------------------------------------------- /data_summaries-video/movieqa.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/movieqa.json -------------------------------------------------------------------------------- /data_summaries-video/moviescenes.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/moviescenes.json -------------------------------------------------------------------------------- /data_summaries-video/mpii-cooking.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/mpii-cooking.json -------------------------------------------------------------------------------- /data_summaries-video/mpii-cooking2.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/mpii-cooking2.json -------------------------------------------------------------------------------- /data_summaries-video/mpii-md.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/mpii-md.json -------------------------------------------------------------------------------- /data_summaries-video/msa.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/msa.json -------------------------------------------------------------------------------- /data_summaries-video/msr-vtt.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/msr-vtt.json -------------------------------------------------------------------------------- /data_summaries-video/multi-moments-in-time-dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/multi-moments-in-time-dataset.json -------------------------------------------------------------------------------- /data_summaries-video/multi-thumos-challenge.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/multi-thumos-challenge.json -------------------------------------------------------------------------------- /data_summaries-video/mvbench.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/mvbench.json -------------------------------------------------------------------------------- /data_summaries-video/narrated-instruction-vids.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/narrated-instruction-vids.json -------------------------------------------------------------------------------- /data_summaries-video/ntu-rgbd.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/ntu-rgbd.json -------------------------------------------------------------------------------- /data_summaries-video/omnisource-web-dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/omnisource-web-dataset.json -------------------------------------------------------------------------------- /data_summaries-video/oops-dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/oops-dataset.json -------------------------------------------------------------------------------- /data_summaries-video/openvid-1M.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/openvid-1M.json -------------------------------------------------------------------------------- /data_summaries-video/pku-mmd-dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/pku-mmd-dataset.json -------------------------------------------------------------------------------- /data_summaries-video/project-aria-dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/project-aria-dataset.json -------------------------------------------------------------------------------- /data_summaries-video/project-aria-digital-twin-dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/project-aria-digital-twin-dataset.json -------------------------------------------------------------------------------- /data_summaries-video/qfvs.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/qfvs.json -------------------------------------------------------------------------------- /data_summaries-video/queryd.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/queryd.json -------------------------------------------------------------------------------- /data_summaries-video/rare-act-dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/rare-act-dataset.json -------------------------------------------------------------------------------- /data_summaries-video/sharegpt4video.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/sharegpt4video.json -------------------------------------------------------------------------------- /data_summaries-video/soa-dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/soa-dataset.json -------------------------------------------------------------------------------- /data_summaries-video/spoken-moments.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/spoken-moments.json -------------------------------------------------------------------------------- /data_summaries-video/sports1M-dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/sports1M-dataset.json -------------------------------------------------------------------------------- /data_summaries-video/stroygraphs.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/stroygraphs.json -------------------------------------------------------------------------------- /data_summaries-video/summe.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/summe.json -------------------------------------------------------------------------------- /data_summaries-video/tgif.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/tgif.json -------------------------------------------------------------------------------- /data_summaries-video/thumos-challenge.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/thumos-challenge.json -------------------------------------------------------------------------------- /data_summaries-video/tiny-virat.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/tiny-virat.json -------------------------------------------------------------------------------- /data_summaries-video/titan.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/titan.json -------------------------------------------------------------------------------- /data_summaries-video/toyota-smarthome.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/toyota-smarthome.json -------------------------------------------------------------------------------- /data_summaries-video/trecvid.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/trecvid.json -------------------------------------------------------------------------------- /data_summaries-video/tvsum.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/tvsum.json -------------------------------------------------------------------------------- /data_summaries-video/uav-human.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/uav-human.json -------------------------------------------------------------------------------- /data_summaries-video/ucf101-dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/ucf101-dataset.json -------------------------------------------------------------------------------- /data_summaries-video/vatex.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/vatex.json -------------------------------------------------------------------------------- /data_summaries-video/videolt-dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/videolt-dataset.json -------------------------------------------------------------------------------- /data_summaries-video/videostory.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/videostory.json -------------------------------------------------------------------------------- /data_summaries-video/vidprom.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/vidprom.json -------------------------------------------------------------------------------- /data_summaries-video/violin.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/violin.json -------------------------------------------------------------------------------- /data_summaries-video/vlog-vids.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/vlog-vids.json -------------------------------------------------------------------------------- /data_summaries-video/volleyball-vids.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/volleyball-vids.json -------------------------------------------------------------------------------- /data_summaries-video/voxceleb.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/voxceleb.json -------------------------------------------------------------------------------- /data_summaries-video/vtw.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/vtw.json -------------------------------------------------------------------------------- /data_summaries-video/youcook-2.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/youcook-2.json -------------------------------------------------------------------------------- /data_summaries-video/youcook.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/youcook.json -------------------------------------------------------------------------------- /data_summaries-video/youtube-8m.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries-video/youtube-8m.json -------------------------------------------------------------------------------- /data_summaries/10k Prompt Ranked.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/10k Prompt Ranked.json -------------------------------------------------------------------------------- /data_summaries/AgentInstruct.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/AgentInstruct.json -------------------------------------------------------------------------------- /data_summaries/Airoboros.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Airoboros.json -------------------------------------------------------------------------------- /data_summaries/Alpaca.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Alpaca.json -------------------------------------------------------------------------------- /data_summaries/Anthropic HH-RLHF.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Anthropic HH-RLHF.json -------------------------------------------------------------------------------- /data_summaries/Aya Dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Aya Dataset.json -------------------------------------------------------------------------------- /data_summaries/Bactrian-X.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Bactrian-X.json -------------------------------------------------------------------------------- /data_summaries/Baize Chat Data.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Baize Chat Data.json -------------------------------------------------------------------------------- /data_summaries/Book Summaries.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Book Summaries.json -------------------------------------------------------------------------------- /data_summaries/COIG-CQIA.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/COIG-CQIA.json -------------------------------------------------------------------------------- /data_summaries/COIG.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/COIG.json -------------------------------------------------------------------------------- /data_summaries/Camel-AI Science.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Camel-AI Science.json -------------------------------------------------------------------------------- /data_summaries/Capybara.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Capybara.json -------------------------------------------------------------------------------- /data_summaries/ChatDoctor.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/ChatDoctor.json -------------------------------------------------------------------------------- /data_summaries/ChatbotArena.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/ChatbotArena.json -------------------------------------------------------------------------------- /data_summaries/Cidar.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Cidar.json -------------------------------------------------------------------------------- /data_summaries/CoT Collection.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/CoT Collection.json -------------------------------------------------------------------------------- /data_summaries/Cobra Frames.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Cobra Frames.json -------------------------------------------------------------------------------- /data_summaries/Code Alpaca.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Code Alpaca.json -------------------------------------------------------------------------------- /data_summaries/CollectiveCognition.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/CollectiveCognition.json -------------------------------------------------------------------------------- /data_summaries/CommitPackFT.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/CommitPackFT.json -------------------------------------------------------------------------------- /data_summaries/Conifer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Conifer.json -------------------------------------------------------------------------------- /data_summaries/Deita 10K.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Deita 10K.json -------------------------------------------------------------------------------- /data_summaries/DialogStudio.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/DialogStudio.json -------------------------------------------------------------------------------- /data_summaries/Dolly 15k.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Dolly 15k.json -------------------------------------------------------------------------------- /data_summaries/Dynosaur.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Dynosaur.json -------------------------------------------------------------------------------- /data_summaries/EverythingLM.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/EverythingLM.json -------------------------------------------------------------------------------- /data_summaries/ExpertQA.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/ExpertQA.json -------------------------------------------------------------------------------- /data_summaries/Feedback Collection.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Feedback Collection.json -------------------------------------------------------------------------------- /data_summaries/Flan Collection (Chain-of-Thought).json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Flan Collection (Chain-of-Thought).json -------------------------------------------------------------------------------- /data_summaries/Flan Collection (Dialog).json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Flan Collection (Dialog).json -------------------------------------------------------------------------------- /data_summaries/Flan Collection (Flan 2021).json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Flan Collection (Flan 2021).json -------------------------------------------------------------------------------- /data_summaries/Flan Collection (P3).json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Flan Collection (P3).json -------------------------------------------------------------------------------- /data_summaries/Flan Collection (Super-NaturalInstructions).json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Flan Collection (Super-NaturalInstructions).json -------------------------------------------------------------------------------- /data_summaries/GPT-4-Alpaca.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/GPT-4-Alpaca.json -------------------------------------------------------------------------------- /data_summaries/GPTeacher.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/GPTeacher.json -------------------------------------------------------------------------------- /data_summaries/Glaive Code Assistant v2.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Glaive Code Assistant v2.json -------------------------------------------------------------------------------- /data_summaries/Glaive Code Assistant v3.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Glaive Code Assistant v3.json -------------------------------------------------------------------------------- /data_summaries/Glaive Code Assistant.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Glaive Code Assistant.json -------------------------------------------------------------------------------- /data_summaries/Gorilla.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Gorilla.json -------------------------------------------------------------------------------- /data_summaries/Gretel Text-to-SQL.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Gretel Text-to-SQL.json -------------------------------------------------------------------------------- /data_summaries/HC3 (Chinese).json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/HC3 (Chinese).json -------------------------------------------------------------------------------- /data_summaries/HC3 (English).json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/HC3 (English).json -------------------------------------------------------------------------------- /data_summaries/HelpSteer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/HelpSteer.json -------------------------------------------------------------------------------- /data_summaries/Indic-Instruct.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Indic-Instruct.json -------------------------------------------------------------------------------- /data_summaries/InstAr.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/InstAr.json -------------------------------------------------------------------------------- /data_summaries/Joke Explanation.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Joke Explanation.json -------------------------------------------------------------------------------- /data_summaries/KIWI.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/KIWI.json -------------------------------------------------------------------------------- /data_summaries/LIMA.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/LIMA.json -------------------------------------------------------------------------------- /data_summaries/Llama2-MedTuned-Instructions.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Llama2-MedTuned-Instructions.json -------------------------------------------------------------------------------- /data_summaries/LongAlign-10k.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/LongAlign-10k.json -------------------------------------------------------------------------------- /data_summaries/Longform.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Longform.json -------------------------------------------------------------------------------- /data_summaries/Lumos Grounding.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Lumos Grounding.json -------------------------------------------------------------------------------- /data_summaries/Lumos Planning.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Lumos Planning.json -------------------------------------------------------------------------------- /data_summaries/Magpie-Pro.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Magpie-Pro.json -------------------------------------------------------------------------------- /data_summaries/MathDial.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/MathDial.json -------------------------------------------------------------------------------- /data_summaries/MathInstruct.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/MathInstruct.json -------------------------------------------------------------------------------- /data_summaries/MedInstruct.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/MedInstruct.json -------------------------------------------------------------------------------- /data_summaries/Medical Meadow.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Medical Meadow.json -------------------------------------------------------------------------------- /data_summaries/MegaWika.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/MegaWika.json -------------------------------------------------------------------------------- /data_summaries/MetaMathQA.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/MetaMathQA.json -------------------------------------------------------------------------------- /data_summaries/Nectar.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Nectar.json -------------------------------------------------------------------------------- /data_summaries/No Robots.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/No Robots.json -------------------------------------------------------------------------------- /data_summaries/NomicAI GPT4AllJ.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/NomicAI GPT4AllJ.json -------------------------------------------------------------------------------- /data_summaries/OIG.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/OIG.json -------------------------------------------------------------------------------- /data_summaries/Open Assistant OctoPack.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Open Assistant OctoPack.json -------------------------------------------------------------------------------- /data_summaries/Open Assistant v2.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Open Assistant v2.json -------------------------------------------------------------------------------- /data_summaries/Open Assistant.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Open Assistant.json -------------------------------------------------------------------------------- /data_summaries/Open Orca.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Open Orca.json -------------------------------------------------------------------------------- /data_summaries/Open-Platypus.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Open-Platypus.json -------------------------------------------------------------------------------- /data_summaries/OpenAI (Summarize from Feedback).json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/OpenAI (Summarize from Feedback).json -------------------------------------------------------------------------------- /data_summaries/OpenAI (WebGPT).json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/OpenAI (WebGPT).json -------------------------------------------------------------------------------- /data_summaries/OpenGPT Healthcare.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/OpenGPT Healthcare.json -------------------------------------------------------------------------------- /data_summaries/OpenMathInstruct-1.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/OpenMathInstruct-1.json -------------------------------------------------------------------------------- /data_summaries/Orca-Math.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Orca-Math.json -------------------------------------------------------------------------------- /data_summaries/PII-Masking-200k.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/PII-Masking-200k.json -------------------------------------------------------------------------------- /data_summaries/PII-masking-200k.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/PII-masking-200k.json -------------------------------------------------------------------------------- /data_summaries/PMC-LLaMA Instructions.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/PMC-LLaMA Instructions.json -------------------------------------------------------------------------------- /data_summaries/Preference Collection.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Preference Collection.json -------------------------------------------------------------------------------- /data_summaries/Pure-Dove.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Pure-Dove.json -------------------------------------------------------------------------------- /data_summaries/PygmalionAI-PIPPA.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/PygmalionAI-PIPPA.json -------------------------------------------------------------------------------- /data_summaries/Reasoning.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Reasoning.json -------------------------------------------------------------------------------- /data_summaries/RiddleSense.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/RiddleSense.json -------------------------------------------------------------------------------- /data_summaries/SeaBench.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/SeaBench.json -------------------------------------------------------------------------------- /data_summaries/Seacrowd.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Seacrowd.json -------------------------------------------------------------------------------- /data_summaries/SelFee.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/SelFee.json -------------------------------------------------------------------------------- /data_summaries/Self-Instruct.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Self-Instruct.json -------------------------------------------------------------------------------- /data_summaries/ShareGPT Vicuna.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/ShareGPT Vicuna.json -------------------------------------------------------------------------------- /data_summaries/Stack Exchange Instruction.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Stack Exchange Instruction.json -------------------------------------------------------------------------------- /data_summaries/Stanford Human Preferences.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Stanford Human Preferences.json -------------------------------------------------------------------------------- /data_summaries/StarCoder Self-Instruct.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/StarCoder Self-Instruct.json -------------------------------------------------------------------------------- /data_summaries/Synthetic-GSM8K-Reflection.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Synthetic-GSM8K-Reflection.json -------------------------------------------------------------------------------- /data_summaries/Tasksource Instruct.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Tasksource Instruct.json -------------------------------------------------------------------------------- /data_summaries/Tasksource Symbol-Tuning.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Tasksource Symbol-Tuning.json -------------------------------------------------------------------------------- /data_summaries/Thai Gen AI (Alpaca).json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Thai Gen AI (Alpaca).json -------------------------------------------------------------------------------- /data_summaries/Thai Gen AI (Dolly).json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Thai Gen AI (Dolly).json -------------------------------------------------------------------------------- /data_summaries/Thai Gen AI (GPTeacher).json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Thai Gen AI (GPTeacher).json -------------------------------------------------------------------------------- /data_summaries/Tiny Stories.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Tiny Stories.json -------------------------------------------------------------------------------- /data_summaries/Tool-Llama.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Tool-Llama.json -------------------------------------------------------------------------------- /data_summaries/ToxicChat.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/ToxicChat.json -------------------------------------------------------------------------------- /data_summaries/UltraChat.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/UltraChat.json -------------------------------------------------------------------------------- /data_summaries/UltraChat_200k.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/UltraChat_200k.json -------------------------------------------------------------------------------- /data_summaries/UltraFeedback Argilla.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/UltraFeedback Argilla.json -------------------------------------------------------------------------------- /data_summaries/Unnatural Instructions.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/Unnatural Instructions.json -------------------------------------------------------------------------------- /data_summaries/WildChat.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/WildChat.json -------------------------------------------------------------------------------- /data_summaries/WizardLM Evol-Instruct V2.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/WizardLM Evol-Instruct V2.json -------------------------------------------------------------------------------- /data_summaries/WizardLM Evol-Instruct.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/WizardLM Evol-Instruct.json -------------------------------------------------------------------------------- /data_summaries/_template.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/_template.json -------------------------------------------------------------------------------- /data_summaries/_template_spec.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/_template_spec.yaml -------------------------------------------------------------------------------- /data_summaries/lmsys_chat_1m.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/lmsys_chat_1m.json -------------------------------------------------------------------------------- /data_summaries/xP3x.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/data_summaries/xP3x.json -------------------------------------------------------------------------------- /dpi-plots/video/video_creatorcategories-years.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/dpi-plots/video/video_creatorcategories-years.png -------------------------------------------------------------------------------- /dpi-plots/video/video_sourcecategories-cumulativehours.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/dpi-plots/video/video_sourcecategories-cumulativehours.png -------------------------------------------------------------------------------- /dpi-plots/video/video_sourcecategories-years.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/dpi-plots/video/video_sourcecategories-years.png -------------------------------------------------------------------------------- /dpi-plots/video/video_sourcecategories-yearscombined.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/dpi-plots/video/video_sourcecategories-yearscombined.png -------------------------------------------------------------------------------- /dpi-plots/video/video_sources-licenses.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/dpi-plots/video/video_sources-licenses.png -------------------------------------------------------------------------------- /dpi-plots/video/video_taskcategories-years.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/dpi-plots/video/video_taskcategories-years.png -------------------------------------------------------------------------------- /dpi-plotsmultimodal/creator_categories_by_modality.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/dpi-plotsmultimodal/creator_categories_by_modality.png -------------------------------------------------------------------------------- /dpi-plotsmultimodal/dataset_count_by_continent_and_modality.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/dpi-plotsmultimodal/dataset_count_by_continent_and_modality.png -------------------------------------------------------------------------------- /dpi-plotsmultimodal/dataset_count_by_country_and_modality.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/dpi-plotsmultimodal/dataset_count_by_country_and_modality.png -------------------------------------------------------------------------------- /dpi-plotsmultimodal/license_use_by_modality_collections.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/dpi-plotsmultimodal/license_use_by_modality_collections.png -------------------------------------------------------------------------------- /dpi-plotsmultimodal/multimodal-combined_chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/dpi-plotsmultimodal/multimodal-combined_chart.png -------------------------------------------------------------------------------- /dpi-plotsmultimodal/source_categories_by_modality-aggregated.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/dpi-plotsmultimodal/source_categories_by_modality-aggregated.png -------------------------------------------------------------------------------- /dpi-plotsmultimodal/source_categories_by_modality.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/dpi-plotsmultimodal/source_categories_by_modality.png -------------------------------------------------------------------------------- /dpi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/dpi.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/requirements.txt -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/analysis/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/README.md -------------------------------------------------------------------------------- /src/analysis/agents_table.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/agents_table.py -------------------------------------------------------------------------------- /src/analysis/aggregate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/aggregate.py -------------------------------------------------------------------------------- /src/analysis/analysis_constants.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/analysis_constants.py -------------------------------------------------------------------------------- /src/analysis/analysis_util.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/analysis_util.py -------------------------------------------------------------------------------- /src/analysis/corpus_robots_trends.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/corpus_robots_trends.ipynb -------------------------------------------------------------------------------- /src/analysis/data/agents_counter/all_agents_counter.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/data/agents_counter/all_agents_counter.csv -------------------------------------------------------------------------------- /src/analysis/data/multimodal_terms_data/speech.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/data/multimodal_terms_data/speech.csv -------------------------------------------------------------------------------- /src/analysis/data/multimodal_terms_data/text.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/data/multimodal_terms_data/text.csv -------------------------------------------------------------------------------- /src/analysis/data/multimodal_terms_data/video.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/data/multimodal_terms_data/video.csv -------------------------------------------------------------------------------- /src/analysis/data/pretrain_data/corpus_token_bucket_counts/c4_buckets.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/data/pretrain_data/corpus_token_bucket_counts/c4_buckets.csv -------------------------------------------------------------------------------- /src/analysis/data/pretrain_data/corpus_token_bucket_counts/dolma_buckets.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/data/pretrain_data/corpus_token_bucket_counts/dolma_buckets.csv -------------------------------------------------------------------------------- /src/analysis/data/pretrain_data/corpus_token_bucket_counts/rf_buckets.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/data/pretrain_data/corpus_token_bucket_counts/rf_buckets.csv -------------------------------------------------------------------------------- /src/analysis/data/pretrain_data/relevant_url_token_counts.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/data/pretrain_data/relevant_url_token_counts.csv -------------------------------------------------------------------------------- /src/analysis/data/speech_supporting_data/bloomspeech_splithours.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/data/speech_supporting_data/bloomspeech_splithours.csv -------------------------------------------------------------------------------- /src/analysis/data/speech_supporting_data/commonvoice_splithours.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/data/speech_supporting_data/commonvoice_splithours.json -------------------------------------------------------------------------------- /src/analysis/data/speech_supporting_data/fleurs_splithours.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/data/speech_supporting_data/fleurs_splithours.csv -------------------------------------------------------------------------------- /src/analysis/data/speech_supporting_data/languages.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/data/speech_supporting_data/languages.csv -------------------------------------------------------------------------------- /src/analysis/data/speech_supporting_data/multilinguallibrispeech_splithours.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/data/speech_supporting_data/multilinguallibrispeech_splithours.csv -------------------------------------------------------------------------------- /src/analysis/data/speech_supporting_data/yodas_splithours.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/data/speech_supporting_data/yodas_splithours.csv -------------------------------------------------------------------------------- /src/analysis/market_analysis.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/market_analysis.ipynb -------------------------------------------------------------------------------- /src/analysis/multimodal_analysis.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/multimodal_analysis.ipynb -------------------------------------------------------------------------------- /src/analysis/multimodal_data_aggregator.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/multimodal_data_aggregator.ipynb -------------------------------------------------------------------------------- /src/analysis/multimodal_util.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/multimodal_util.py -------------------------------------------------------------------------------- /src/analysis/paywall_domain_analysis.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/paywall_domain_analysis.ipynb -------------------------------------------------------------------------------- /src/analysis/prompt_analysis.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/prompt_analysis.ipynb -------------------------------------------------------------------------------- /src/analysis/prompt_domain_analysis.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/prompt_domain_analysis.py -------------------------------------------------------------------------------- /src/analysis/robots_analysis-tables-confusion-matrices-will.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/robots_analysis-tables-confusion-matrices-will.ipynb -------------------------------------------------------------------------------- /src/analysis/robots_analysis.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/robots_analysis.ipynb -------------------------------------------------------------------------------- /src/analysis/robots_analysis_p2.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/robots_analysis_p2.ipynb -------------------------------------------------------------------------------- /src/analysis/speech_analysis.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/speech_analysis.ipynb -------------------------------------------------------------------------------- /src/analysis/text_ft_plots.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/text_ft_plots.ipynb -------------------------------------------------------------------------------- /src/analysis/video_analysis.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/video_analysis.ipynb -------------------------------------------------------------------------------- /src/analysis/visualization_util.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/analysis/visualization_util.py -------------------------------------------------------------------------------- /src/collection_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/collection_mapper.py -------------------------------------------------------------------------------- /src/configs/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/configs/README.md -------------------------------------------------------------------------------- /src/configs/commercial_licenses.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/configs/commercial_licenses.yaml -------------------------------------------------------------------------------- /src/configs/commercial_licenses_and_terms.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/configs/commercial_licenses_and_terms.yaml -------------------------------------------------------------------------------- /src/configs/commercial_or_unspecified_licenses.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/configs/commercial_or_unspecified_licenses.yaml -------------------------------------------------------------------------------- /src/configs/commercial_or_unspecified_licenses_and_terms.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/configs/commercial_or_unspecified_licenses_and_terms.yaml -------------------------------------------------------------------------------- /src/configs/common_pile_datasets.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/configs/common_pile_datasets.txt -------------------------------------------------------------------------------- /src/configs/common_pile_ultra_permissive.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/configs/common_pile_ultra_permissive.yaml -------------------------------------------------------------------------------- /src/configs/default.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/configs/default.yaml -------------------------------------------------------------------------------- /src/data_bibtex.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/data_bibtex.py -------------------------------------------------------------------------------- /src/data_provenance_card.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/data_provenance_card.py -------------------------------------------------------------------------------- /src/download_and_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/download_and_filter.py -------------------------------------------------------------------------------- /src/downloader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/downloader.py -------------------------------------------------------------------------------- /src/downloaders.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/downloaders.py -------------------------------------------------------------------------------- /src/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/helpers/constants.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/helpers/constants.py -------------------------------------------------------------------------------- /src/helpers/filters.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/helpers/filters.py -------------------------------------------------------------------------------- /src/helpers/io.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/helpers/io.py -------------------------------------------------------------------------------- /src/preparers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/preparers.py -------------------------------------------------------------------------------- /src/scripts/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/scripts/README.md -------------------------------------------------------------------------------- /src/scripts/annotate_text_stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/scripts/annotate_text_stats.py -------------------------------------------------------------------------------- /src/scripts/infer_metadata.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/scripts/infer_metadata.py -------------------------------------------------------------------------------- /src/summary-tables/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/summary-tables/README.md -------------------------------------------------------------------------------- /src/summary-tables/collections-audio.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/summary-tables/collections-audio.ipynb -------------------------------------------------------------------------------- /src/summary-tables/collections-text-v1paper.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/summary-tables/collections-text-v1paper.ipynb -------------------------------------------------------------------------------- /src/summary-tables/collections-text.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/summary-tables/collections-text.ipynb -------------------------------------------------------------------------------- /src/summary-tables/collections-video.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/summary-tables/collections-video.ipynb -------------------------------------------------------------------------------- /src/summary-tables/emoji/CommercialDataCircle.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/summary-tables/emoji/CommercialDataCircle.pdf -------------------------------------------------------------------------------- /src/summary-tables/emoji/NCDataCircle.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/summary-tables/emoji/NCDataCircle.pdf -------------------------------------------------------------------------------- /src/summary-tables/emoji/UnspecifiedDataCircle.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/summary-tables/emoji/UnspecifiedDataCircle.pdf -------------------------------------------------------------------------------- /src/summary-tables/emoji/globe-with-meridians.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/summary-tables/emoji/globe-with-meridians.pdf -------------------------------------------------------------------------------- /src/summary-tables/emoji/greencheck.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/summary-tables/emoji/greencheck.pdf -------------------------------------------------------------------------------- /src/summary-tables/emoji/redcross.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/summary-tables/emoji/redcross.pdf -------------------------------------------------------------------------------- /src/summary-tables/emoji/robot.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/summary-tables/emoji/robot.pdf -------------------------------------------------------------------------------- /src/summary-tables/hf_downloads.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/summary-tables/hf_downloads.csv -------------------------------------------------------------------------------- /src/summary-tables/papers.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/summary-tables/papers.csv -------------------------------------------------------------------------------- /src/summary-tables/refs-licenses.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/summary-tables/refs-licenses.ipynb -------------------------------------------------------------------------------- /src/summary-tables/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/summary-tables/utils.py -------------------------------------------------------------------------------- /src/test_new_collection.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/test_new_collection.py -------------------------------------------------------------------------------- /src/web_analysis/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/web_analysis/README.md -------------------------------------------------------------------------------- /src/web_analysis/data/IP2LOCATION-LITE-DB1.IPV6.BIN: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/web_analysis/data/IP2LOCATION-LITE-DB1.IPV6.BIN -------------------------------------------------------------------------------- /src/web_analysis/data/_top_2000_c4_token_and_urlcounts - top_2000_c4_token_and_urlcounts.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/web_analysis/data/_top_2000_c4_token_and_urlcounts - top_2000_c4_token_and_urlcounts.csv -------------------------------------------------------------------------------- /src/web_analysis/data/failed_responses.json: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /src/web_analysis/data/gpt-response-cache.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/web_analysis/data/gpt-response-cache.json -------------------------------------------------------------------------------- /src/web_analysis/data/gpt-response-failed.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/web_analysis/data/gpt-response-failed.json -------------------------------------------------------------------------------- /src/web_analysis/data/prompt_templates.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/web_analysis/data/prompt_templates.json -------------------------------------------------------------------------------- /src/web_analysis/downloading_web/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/web_analysis/downloading_web/README.md -------------------------------------------------------------------------------- /src/web_analysis/downloading_web/c4_hf_streaming.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/web_analysis/downloading_web/c4_hf_streaming.py -------------------------------------------------------------------------------- /src/web_analysis/downloading_web/dolma_download.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/web_analysis/downloading_web/dolma_download.py -------------------------------------------------------------------------------- /src/web_analysis/downloading_web/refinedweb_download.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/web_analysis/downloading_web/refinedweb_download.py -------------------------------------------------------------------------------- /src/web_analysis/extract_robots.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/web_analysis/extract_robots.py -------------------------------------------------------------------------------- /src/web_analysis/forecasting_util.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/web_analysis/forecasting_util.py -------------------------------------------------------------------------------- /src/web_analysis/gpt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/web_analysis/gpt.py -------------------------------------------------------------------------------- /src/web_analysis/gpt_tos_analysis.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/web_analysis/gpt_tos_analysis.py -------------------------------------------------------------------------------- /src/web_analysis/parse_robots.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/web_analysis/parse_robots.py -------------------------------------------------------------------------------- /src/web_analysis/requirements_gpt_tos_analysis.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/web_analysis/requirements_gpt_tos_analysis.txt -------------------------------------------------------------------------------- /src/web_analysis/requirements_website_geolocation.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | socket 3 | IP2Location 4 | tqdm -------------------------------------------------------------------------------- /src/web_analysis/robots_util.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/web_analysis/robots_util.py -------------------------------------------------------------------------------- /src/web_analysis/test_robots.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/web_analysis/test_robots.py -------------------------------------------------------------------------------- /src/web_analysis/wayback_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/web_analysis/wayback_extraction/file_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/web_analysis/wayback_extraction/file_utils.py -------------------------------------------------------------------------------- /src/web_analysis/wayback_extraction/requirements_wayback.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/web_analysis/wayback_extraction/requirements_wayback.txt -------------------------------------------------------------------------------- /src/web_analysis/wayback_extraction/temporal_pipeline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/web_analysis/wayback_extraction/temporal_pipeline.py -------------------------------------------------------------------------------- /src/web_analysis/wayback_extraction/wayback_cdx.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/web_analysis/wayback_extraction/wayback_cdx.py -------------------------------------------------------------------------------- /src/web_analysis/website_geolocation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Provenance-Initiative/Data-Provenance-Collection/HEAD/src/web_analysis/website_geolocation.py --------------------------------------------------------------------------------