├── .gitattributes ├── .github ├── PULL_REQUEST_TEMPLATE.md ├── actions │ └── test-template │ │ └── action.yml ├── copilot-instructions.md ├── copy-pr-bot.yaml └── workflows │ ├── build-docs.yml │ ├── build-test-publish-wheel.yml │ ├── cherry-pick-release-commit.yml │ ├── cicd-main.yml │ ├── close-inactive-issue-pr.yml │ ├── code-linting.yml │ ├── community-bot.yml │ ├── release-freeze.yml │ ├── release.yml │ └── ruff.yml ├── .gitignore ├── .gitmodules ├── .markdownlint.json ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── CITATION.cff ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── SECURITY.md ├── api-design.md ├── benchmarking ├── Dockerfile ├── README.md ├── __init__.py ├── config.yaml ├── dummy-config.yaml ├── nightly-benchmark.yaml ├── run.py ├── runner │ ├── __init__.py │ ├── datasets.py │ ├── entry.py │ ├── env_capture.py │ ├── path_resolver.py │ ├── process.py │ ├── ray_cluster.py │ ├── session.py │ ├── sinks │ │ ├── __init__.py │ │ ├── gdrive_sink.py │ │ ├── mlflow_sink.py │ │ ├── sink.py │ │ └── slack_sink.py │ └── utils.py ├── scripts │ ├── __init__.py │ ├── common_crawl_benchmark.py │ ├── dedup_removal_benchmark.py │ ├── domain_classification_benchmark.py │ ├── dummy_benchmark.py │ ├── embedding_generation_benchmark.py │ └── fuzzy_dedup_identification_benchmark.py └── tools │ ├── __init__.py │ ├── build_docker.sh │ ├── gen_runscript_vars.py │ └── run.sh ├── codecov.yml ├── docker ├── Dockerfile └── common │ └── install_ffmpeg.sh ├── docs ├── README.md ├── __init__.py ├── _extensions │ ├── __init__.py │ ├── ai_assistant │ │ ├── README.md │ │ ├── __init__.py │ │ ├── assets │ │ │ └── styles │ │ │ │ └── ai-assistant.css │ │ ├── core │ │ │ ├── AIClient.js │ │ │ ├── ResponseProcessor.js │ │ │ └── main.js │ │ ├── integrations │ │ │ └── search-integration.js │ │ └── ui │ │ │ ├── MarkdownProcessor.js │ │ │ └── ResponseRenderer.js │ ├── content_gating │ │ ├── README.md │ │ ├── __init__.py │ │ ├── condition_evaluator.py │ │ ├── conditional_directives.py │ │ └── document_filter.py │ ├── json_output │ │ ├── README.md │ │ ├── __init__.py │ │ ├── config.py │ │ ├── content │ │ │ ├── __init__.py │ │ │ ├── extractor.py │ │ │ ├── metadata.py │ │ │ ├── structured.py │ │ │ └── text.py │ │ ├── core │ │ │ ├── __init__.py │ │ │ ├── builder.py │ │ │ ├── document_discovery.py │ │ │ ├── hierarchy_builder.py │ │ │ ├── json_formatter.py │ │ │ └── json_writer.py │ │ ├── processing │ │ │ ├── __init__.py │ │ │ ├── cache.py │ │ │ └── processor.py │ │ └── utils.py │ ├── myst_codeblock_substitutions.py │ ├── rich_metadata │ │ ├── __init__.py │ │ ├── templates │ │ │ └── layout.html │ │ └── verify_metadata.py │ └── search_assets │ │ ├── __init__.py │ │ ├── enhanced-search.css │ │ ├── main.js │ │ ├── modules │ │ ├── DocumentLoader.js │ │ ├── EventHandler.js │ │ ├── ResultRenderer.js │ │ ├── SearchEngine.js │ │ ├── SearchInterface.js │ │ ├── SearchPageManager.js │ │ └── Utils.js │ │ └── templates │ │ └── search.html ├── _images │ ├── ablation.png │ ├── scaling.png │ └── text-benchmarks.png ├── _templates │ └── autodoc2_index.rst ├── about │ ├── concepts │ │ ├── audio │ │ │ ├── asr-pipeline.md │ │ │ ├── audio-batch.md │ │ │ ├── curation-pipeline.md │ │ │ ├── index.md │ │ │ ├── manifests-ingest.md │ │ │ ├── quality-metrics.md │ │ │ └── text-integration.md │ │ ├── deduplication.md │ │ ├── image │ │ │ ├── data-export-concepts.md │ │ │ ├── data-loading-concepts.md │ │ │ ├── data-processing-concepts.md │ │ │ └── index.md │ │ ├── index.md │ │ ├── text │ │ │ ├── _images │ │ │ │ └── text-processing-diagram.png │ │ │ ├── data-acquisition-concepts.md │ │ │ ├── data-curation-pipeline.md │ │ │ ├── data-loading-concepts.md │ │ │ ├── data-processing-concepts.md │ │ │ └── index.md │ │ └── video │ │ │ ├── _images │ │ │ ├── stages-pipelines-diagram.png │ │ │ └── video-pipeline-diagram.png │ │ │ ├── abstractions.md │ │ │ ├── architecture.md │ │ │ ├── data-flow.md │ │ │ └── index.md │ ├── index.md │ ├── key-features.md │ └── release-notes │ │ ├── index.md │ │ ├── migration-faq.md │ │ └── migration-guide.md ├── admin │ ├── deployment │ │ ├── index.md │ │ ├── requirements.md │ │ └── slurm │ │ │ └── image.md │ ├── index.md │ ├── installation.md │ └── integrations │ │ └── index.md ├── broken_links_needing_review.json ├── conf.py ├── curate-audio │ ├── index.md │ ├── load-data │ │ ├── custom-manifests.md │ │ ├── fleurs-dataset.md │ │ ├── index.md │ │ └── local-files.md │ ├── process-data │ │ ├── asr-inference │ │ │ ├── index.md │ │ │ └── nemo-models.md │ │ ├── audio-analysis │ │ │ ├── duration-calculation.md │ │ │ ├── format-validation.md │ │ │ └── index.md │ │ ├── index.md │ │ ├── quality-assessment │ │ │ ├── duration-filtering.md │ │ │ ├── index.md │ │ │ └── wer-filtering.md │ │ └── text-integration │ │ │ └── index.md │ ├── save-export.md │ └── tutorials │ │ ├── beginner.md │ │ └── index.md ├── curate-images │ ├── index.md │ ├── load-data │ │ ├── index.md │ │ └── tar-archives.md │ ├── process-data │ │ ├── embeddings │ │ │ ├── clip-embedder.md │ │ │ └── index.md │ │ ├── filters │ │ │ ├── aesthetic.md │ │ │ ├── index.md │ │ │ └── nsfw.md │ │ └── index.md │ ├── save-export.md │ └── tutorials │ │ ├── beginner.md │ │ ├── dedup-workflow.md │ │ └── index.md ├── curate-text │ ├── index.md │ ├── load-data │ │ ├── arxiv.md │ │ ├── common-crawl.md │ │ ├── custom.md │ │ ├── index.md │ │ ├── read-existing.md │ │ └── wikipedia.md │ ├── process-data │ │ ├── content-processing │ │ │ ├── add-id.md │ │ │ ├── index.md │ │ │ └── text-cleaning.md │ │ ├── deduplication │ │ │ ├── exact.md │ │ │ ├── fuzzy.md │ │ │ ├── index.md │ │ │ └── semdedup.md │ │ ├── index.md │ │ ├── language-management │ │ │ ├── index.md │ │ │ ├── language.md │ │ │ └── stopwords.md │ │ ├── quality-assessment │ │ │ ├── classifier.md │ │ │ ├── distributed-classifier.md │ │ │ ├── heuristic.md │ │ │ └── index.md │ │ └── specialized-processing │ │ │ ├── code.md │ │ │ └── index.md │ └── tutorials │ │ └── index.md ├── curate-video │ ├── index.md │ ├── load-data │ │ └── index.md │ ├── process-data │ │ ├── captions-preview.md │ │ ├── clipping.md │ │ ├── dedup.md │ │ ├── embeddings.md │ │ ├── filtering.md │ │ ├── frame-extraction.md │ │ ├── index.md │ │ └── transcoding.md │ ├── save-export.md │ └── tutorials │ │ ├── _images │ │ └── dedup-plot.png │ │ ├── beginner.md │ │ ├── index.md │ │ ├── pipeline-customization │ │ ├── add-cust-code.md │ │ ├── add-cust-env.md │ │ ├── add-cust-model.md │ │ ├── add-cust-stage.md │ │ └── index.md │ │ └── split-dedup.md ├── get-started │ ├── audio.md │ ├── image.md │ ├── index.md │ ├── text.md │ └── video.md ├── index.md ├── project.json ├── reference │ ├── index.md │ ├── infrastructure │ │ ├── container-environments.md │ │ ├── execution-backends.md │ │ ├── gpu-processing.md │ │ ├── index.md │ │ ├── memory-management.md │ │ └── resumable-processing.md │ └── related-tools.md └── versions1.json ├── external ├── intern_video2_installation.sh └── intern_video2_multimodal.patch ├── nemo_curator ├── __init__.py ├── backends │ ├── __init__.py │ ├── base.py │ ├── experimental │ │ ├── __init__.py │ │ ├── ray_actor_pool │ │ │ ├── __init__.py │ │ │ ├── adapter.py │ │ │ ├── executor.py │ │ │ ├── raft_adapter.py │ │ │ ├── shuffle_adapter.py │ │ │ └── utils.py │ │ ├── ray_data │ │ │ ├── __init__.py │ │ │ ├── adapter.py │ │ │ ├── executor.py │ │ │ └── utils.py │ │ └── utils.py │ ├── internal │ │ ├── __init__.py │ │ └── raft │ │ │ ├── __init__.py │ │ │ └── ray_comms.py │ ├── utils.py │ └── xenna │ │ ├── __init__.py │ │ ├── adapter.py │ │ └── executor.py ├── config.py ├── core │ ├── __init__.py │ ├── client.py │ ├── constants.py │ └── utils.py ├── metrics │ ├── README.md │ ├── __init__.py │ ├── constants.py │ ├── start_prometheus_grafana.py │ ├── utils.py │ └── xenna_grafana_dashboard.json ├── models │ ├── __init__.py │ ├── aesthetics.py │ ├── base.py │ ├── client │ │ ├── __init__.py │ │ ├── llm_client.py │ │ └── openai_client.py │ ├── clip.py │ ├── cosmos_embed1.py │ ├── internvideo2_mm.py │ ├── nsfw.py │ ├── prompt_formatter.py │ ├── qwen_lm.py │ ├── qwen_vl.py │ └── transnetv2.py ├── package_info.py ├── pipeline │ ├── __init__.py │ └── pipeline.py ├── stages │ ├── __init__.py │ ├── audio │ │ ├── __init__.py │ │ ├── common.py │ │ ├── datasets │ │ │ ├── __init__.py │ │ │ ├── file_utils.py │ │ │ └── fleurs │ │ │ │ ├── __init__.py │ │ │ │ └── create_initial_manifest.py │ │ ├── inference │ │ │ ├── __init__.py │ │ │ └── asr_nemo.py │ │ ├── io │ │ │ ├── __init__.py │ │ │ └── convert.py │ │ └── metrics │ │ │ ├── __init__.py │ │ │ └── get_wer.py │ ├── base.py │ ├── client_partitioning.py │ ├── deduplication │ │ ├── __init__.py │ │ ├── exact │ │ │ ├── __init__.py │ │ │ ├── identification.py │ │ │ └── workflow.py │ │ ├── fuzzy │ │ │ ├── __init__.py │ │ │ ├── buckets_to_edges.py │ │ │ ├── connected_components.py │ │ │ ├── identify_duplicates.py │ │ │ ├── lsh │ │ │ │ ├── __init__.py │ │ │ │ ├── lsh.py │ │ │ │ └── stage.py │ │ │ ├── minhash.py │ │ │ ├── utils.py │ │ │ └── workflow.py │ │ ├── gpu_utils.py │ │ ├── id_generator.py │ │ ├── io_utils.py │ │ ├── semantic │ │ │ ├── __init__.py │ │ │ ├── identify_duplicates.py │ │ │ ├── kmeans.py │ │ │ ├── pairwise.py │ │ │ ├── pairwise_io.py │ │ │ ├── ranking.py │ │ │ ├── utils.py │ │ │ └── workflow.py │ │ └── shuffle_utils │ │ │ ├── __init__.py │ │ │ ├── rapidsmpf_shuffler.py │ │ │ └── stage.py │ ├── file_partitioning.py │ ├── function_decorators.py │ ├── image │ │ ├── __init__.py │ │ ├── deduplication │ │ │ ├── __init__.py │ │ │ └── removal.py │ │ ├── embedders │ │ │ ├── __init__.py │ │ │ └── clip_embedder.py │ │ ├── filters │ │ │ ├── __init__.py │ │ │ ├── aesthetic_filter.py │ │ │ ├── base.py │ │ │ └── nsfw_filter.py │ │ └── io │ │ │ ├── __init__.py │ │ │ ├── convert.py │ │ │ ├── image_reader.py │ │ │ └── image_writer.py │ ├── resources.py │ ├── synthetic │ │ ├── __init__.py │ │ ├── nemotron_cc │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── nemotron_cc.py │ │ │ └── prompts.py │ │ └── qa_multilingual_synthetic.py │ ├── text │ │ ├── __init__.py │ │ ├── classifiers │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── aegis.py │ │ │ ├── aegis_utils.py │ │ │ ├── base.py │ │ │ ├── constants.py │ │ │ ├── content_type.py │ │ │ ├── domain.py │ │ │ ├── fineweb_edu.py │ │ │ ├── prompt_task_complexity.py │ │ │ └── quality.py │ │ ├── deduplication │ │ │ ├── __init__.py │ │ │ ├── removal.py │ │ │ ├── removal_workflow.py │ │ │ └── semantic.py │ │ ├── download │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── arxiv │ │ │ │ ├── __init__.py │ │ │ │ ├── download.py │ │ │ │ ├── extract.py │ │ │ │ ├── iterator.py │ │ │ │ ├── stage.py │ │ │ │ └── url_generation.py │ │ │ ├── base │ │ │ │ ├── __init__.py │ │ │ │ ├── download.py │ │ │ │ ├── extract.py │ │ │ │ ├── iterator.py │ │ │ │ ├── stage.py │ │ │ │ └── url_generation.py │ │ │ ├── common_crawl │ │ │ │ ├── __init__.py │ │ │ │ ├── download.py │ │ │ │ ├── extract.py │ │ │ │ ├── stage.py │ │ │ │ ├── url_generation.py │ │ │ │ └── warc_iterator.py │ │ │ ├── html_extractors │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── justext.py │ │ │ │ ├── resiliparse.py │ │ │ │ ├── trafilatura.py │ │ │ │ └── utils │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── ja_stopwords.py │ │ │ │ │ ├── th_stopwords.py │ │ │ │ │ └── zh_stopwords.py │ │ │ ├── utils.py │ │ │ └── wikipedia │ │ │ │ ├── __init__.py │ │ │ │ ├── download.py │ │ │ │ ├── extract.py │ │ │ │ ├── iterator.py │ │ │ │ ├── stage.py │ │ │ │ └── url_generation.py │ │ ├── embedders │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ └── utils.py │ │ ├── filters │ │ │ ├── __init__.py │ │ │ ├── code.py │ │ │ ├── doc_filter.py │ │ │ ├── fasttext_filter.py │ │ │ └── heuristic_filter.py │ │ ├── io │ │ │ ├── __init__.py │ │ │ ├── reader │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── jsonl.py │ │ │ │ └── parquet.py │ │ │ └── writer │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── jsonl.py │ │ │ │ ├── parquet.py │ │ │ │ └── utils.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── model.py │ │ │ ├── tokenizer.py │ │ │ └── utils.py │ │ ├── modifiers │ │ │ ├── __init__.py │ │ │ ├── c4.py │ │ │ ├── doc_modifier.py │ │ │ ├── fasttext.py │ │ │ ├── line_remover.py │ │ │ ├── markdown_remover.py │ │ │ ├── newline_normalizer.py │ │ │ ├── quotation_remover.py │ │ │ ├── slicer.py │ │ │ ├── unicode_reformatter.py │ │ │ └── url_remover.py │ │ ├── modules │ │ │ ├── __init__.py │ │ │ ├── add_id.py │ │ │ ├── joiner.py │ │ │ ├── modifier.py │ │ │ ├── score_filter.py │ │ │ └── splitter.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── constants.py │ │ │ └── text_utils.py │ └── video │ │ ├── __init__.py │ │ ├── caption │ │ ├── __init__.py │ │ ├── caption_enhancement.py │ │ ├── caption_generation.py │ │ └── caption_preparation.py │ │ ├── clipping │ │ ├── __init__.py │ │ ├── clip_extraction_stages.py │ │ ├── clip_frame_extraction.py │ │ ├── transnetv2_extraction.py │ │ └── video_frame_extraction.py │ │ ├── embedding │ │ ├── __init__.py │ │ ├── cosmos_embed1.py │ │ └── internvideo2.py │ │ ├── filtering │ │ ├── __init__.py │ │ ├── clip_aesthetic_filter.py │ │ ├── motion_filter.py │ │ └── motion_vector_backend.py │ │ ├── io │ │ ├── __init__.py │ │ ├── clip_writer.py │ │ └── video_reader.py │ │ └── preview │ │ ├── __init__.py │ │ └── preview.py ├── tasks │ ├── __init__.py │ ├── audio_batch.py │ ├── document.py │ ├── file_group.py │ ├── image.py │ ├── tasks.py │ ├── utils.py │ └── video.py └── utils │ ├── __init__.py │ ├── client_utils.py │ ├── code_meta.csv │ ├── column_utils.py │ ├── decoder_utils.py │ ├── file_utils.py │ ├── grouping.py │ ├── hf_download_utils.py │ ├── nvcodec_utils.py │ ├── operation_utils.py │ ├── performance_utils.py │ ├── split_large_files.py │ ├── storage_utils.py │ ├── windowing_utils.py │ └── writer_utils.py ├── pyproject.toml ├── requirements-docs.txt ├── tests ├── L0_Unit_Test_GPU.sh ├── __init__.py ├── backends │ ├── __init__.py │ ├── experimental │ │ ├── __init__.py │ │ ├── ray_actor_pool │ │ │ ├── __init__.py │ │ │ └── test_executor.py │ │ ├── ray_data │ │ │ ├── __init__.py │ │ │ └── test_utils.py │ │ └── test_utils.py │ ├── test_integration.py │ ├── test_utils.py │ └── utils.py ├── conftest.py ├── core │ ├── __init__.py │ └── test_get_ray_client.py ├── data │ └── audio │ │ └── armenian │ │ └── fleurs │ │ ├── dev.tar.gz │ │ ├── dev.tsv │ │ └── test_data_reference.json ├── models │ ├── __init__.py │ ├── client │ │ ├── __init__.py │ │ ├── test_llm_client.py │ │ └── test_openai_client.py │ ├── test_aesthetics.py │ ├── test_clip.py │ ├── test_cosmos_embed1.py │ ├── test_internvideo2.py │ ├── test_internvideo2_mm.py │ ├── test_prompt_formatter.py │ ├── test_qwen_lm.py │ ├── test_qwen_vl.py │ └── test_transnetv2.py ├── pipelines │ ├── __init__.py │ └── test_pipelines.py ├── stages │ ├── audio │ │ ├── __init__.py │ │ ├── datasets │ │ │ ├── __init__.py │ │ │ └── test_fleurs_create_initial_manifest.py │ │ ├── inference │ │ │ ├── __init__.py │ │ │ └── test_asr_nemo.py │ │ ├── io │ │ │ ├── __init__.py │ │ │ └── test_convert.py │ │ ├── metrics │ │ │ ├── __init__.py │ │ │ └── test_get_wer.py │ │ └── test_common.py │ ├── common │ │ ├── __init__.py │ │ ├── test_base.py │ │ ├── test_client_partitioning.py │ │ ├── test_file_partitioning.py │ │ └── test_function_decorators.py │ ├── deduplication │ │ ├── __init__.py │ │ ├── exact │ │ │ ├── __init__.py │ │ │ ├── test_identification.py │ │ │ └── test_workflow.py │ │ ├── fuzzy │ │ │ ├── __init__.py │ │ │ ├── test_buckets_to_edges_stage.py │ │ │ ├── test_connected_components_stage.py │ │ │ ├── test_fuzzy_workflow.py │ │ │ ├── test_lsh_stage.py │ │ │ ├── test_minhash.py │ │ │ └── test_minhash_stage.py │ │ ├── semantic │ │ │ ├── __init__.py │ │ │ ├── test_identify_duplicates.py │ │ │ ├── test_kmeans.py │ │ │ ├── test_pairwise.py │ │ │ ├── test_pairwise_io.py │ │ │ ├── test_ranking.py │ │ │ ├── test_utils.py │ │ │ └── test_workflow.py │ │ ├── shuffle_utils │ │ │ ├── __init__.py │ │ │ └── test_shuffle_stage.py │ │ └── test_id_generator.py │ ├── image │ │ ├── __init__.py │ │ ├── dedup │ │ │ ├── __init__.py │ │ │ └── test_dedup_filter.py │ │ ├── embedders │ │ │ ├── __init__.py │ │ │ └── test_clip_embedder.py │ │ ├── filters │ │ │ ├── __init__.py │ │ │ ├── test_aesthetic_filter.py │ │ │ └── test_nsfw_filter.py │ │ └── io │ │ │ ├── __init__.py │ │ │ ├── test_convert.py │ │ │ ├── test_image_reader.py │ │ │ └── test_image_writer.py │ ├── synthetic │ │ ├── __init__.py │ │ ├── nemotron_cc │ │ │ ├── __init__.py │ │ │ ├── test_base.py │ │ │ └── test_nemotron_cc.py │ │ └── test_qa_multilingual_synthetic.py │ ├── text │ │ ├── __init__.py │ │ ├── classifiers │ │ │ ├── __init__.py │ │ │ └── test_classifiers.py │ │ ├── deduplication │ │ │ ├── __init__.py │ │ │ ├── test_removal.py │ │ │ ├── test_removal_workflow.py │ │ │ └── test_semantic.py │ │ ├── download │ │ │ ├── __init__.py │ │ │ ├── arxiv │ │ │ │ ├── __init__.py │ │ │ │ ├── test_download.py │ │ │ │ ├── test_extract.py │ │ │ │ ├── test_iterator.py │ │ │ │ ├── test_stage.py │ │ │ │ └── test_url_generation.py │ │ │ ├── base │ │ │ │ ├── __init__.py │ │ │ │ ├── test_download.py │ │ │ │ ├── test_extract.py │ │ │ │ ├── test_iterator.py │ │ │ │ ├── test_stage.py │ │ │ │ └── test_url_generation.py │ │ │ ├── common_crawl │ │ │ │ ├── __init__.py │ │ │ │ ├── test_download.py │ │ │ │ ├── test_extract.py │ │ │ │ ├── test_stage.py │ │ │ │ ├── test_url_generation.py │ │ │ │ └── test_warc_iterator.py │ │ │ ├── test_html_extractors_implementation.py │ │ │ └── wikipedia │ │ │ │ ├── __init__.py │ │ │ │ ├── test_download.py │ │ │ │ ├── test_extract.py │ │ │ │ ├── test_iterator.py │ │ │ │ ├── test_stage.py │ │ │ │ └── test_url_generation.py │ │ ├── embedders │ │ │ ├── __init__.py │ │ │ ├── test_base.py │ │ │ └── test_utils.py │ │ ├── io │ │ │ ├── __init__.py │ │ │ ├── reader │ │ │ │ ├── __init__.py │ │ │ │ ├── test_integration.py │ │ │ │ ├── test_jsonl.py │ │ │ │ └── test_parquet.py │ │ │ └── writer │ │ │ │ ├── __init__.py │ │ │ │ ├── conftest.py │ │ │ │ ├── test_jsonl.py │ │ │ │ ├── test_parquet.py │ │ │ │ └── test_utils.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── test_model.py │ │ │ ├── test_tokenizer.py │ │ │ └── test_utils.py │ │ └── modules │ │ │ ├── __init__.py │ │ │ ├── test_add_id.py │ │ │ ├── test_filters.py │ │ │ ├── test_joiner.py │ │ │ ├── test_modifiers.py │ │ │ └── test_splitter.py │ └── video │ │ ├── __init__.py │ │ ├── caption │ │ ├── __init__.py │ │ ├── test_caption_enhancement.py │ │ ├── test_caption_generation.py │ │ └── test_caption_preparation.py │ │ ├── clipping │ │ ├── __init__.py │ │ ├── test_clip_frame_extraction.py │ │ ├── test_clip_transcoding_stage.py │ │ ├── test_fixed_stride_extractor_stage.py │ │ ├── test_transnetv2_extraction.py │ │ └── test_video_frame_extraction.py │ │ ├── embedding │ │ ├── __init__.py │ │ └── test_cosmos_embed1.py │ │ ├── filtering │ │ ├── __init__.py │ │ ├── test_clip_aesthetic_filter.py │ │ ├── test_motion_filter.py │ │ └── test_motion_vector_backend.py │ │ ├── io │ │ ├── __init__.py │ │ ├── test_clip_writer.py │ │ └── test_video_reader.py │ │ └── preview │ │ ├── __init__.py │ │ └── test_preview.py ├── tasks │ ├── __init__.py │ ├── test_audio_batch.py │ ├── test_tasks.py │ └── test_video.py ├── test___init__.py ├── test_cudf_placeholder.py └── utils │ ├── __init__.py │ ├── test_client_utils.py │ ├── test_column_utils.py │ ├── test_decoder_utils.py │ ├── test_file_utils.py │ ├── test_grouping.py │ ├── test_nvcodec_utils.py │ ├── test_operation_utils.py │ ├── test_split_large_files.py │ └── test_writer_utils.py ├── tutorials ├── README.md ├── audio │ ├── README.md │ └── fleurs │ │ ├── README.md │ │ ├── pipeline.py │ │ ├── pipeline.yaml │ │ └── run.py ├── image │ ├── README.md │ └── getting-started │ │ ├── README.md │ │ ├── helper.py │ │ ├── image_curation_example.py │ │ └── image_dedup_example.py ├── quickstart.py ├── synthetic │ ├── README.md │ ├── nemotron_cc │ │ ├── example_data │ │ │ └── data.parquet │ │ ├── nemotron_cc_pipelines.py │ │ ├── nemotron_cc_sdg_high_quality_example_pipeline.py │ │ └── nemotron_cc_sdg_low_quality_example_pipeline.py │ └── synthetic_data_generation_example.py ├── text │ ├── README.md │ ├── deduplication │ │ └── semantic │ │ │ ├── semantic_e2e.ipynb │ │ │ └── semantic_step_by_step.ipynb │ ├── distributed-data-classification │ │ ├── README.md │ │ ├── aegis-classification.ipynb │ │ ├── content-type-classification.ipynb │ │ ├── domain-classification.ipynb │ │ ├── fineweb-edu-classification.ipynb │ │ ├── fineweb-mixtral-edu-classification.ipynb │ │ ├── fineweb-nemotron-edu-classification.ipynb │ │ ├── instruction-data-guard-classification.ipynb │ │ ├── multilingual-domain-classification.ipynb │ │ ├── prompt-task-complexity-classification.ipynb │ │ └── quality-classification.ipynb │ ├── download-and-extract │ │ ├── README.md │ │ └── download_extract_tutorial.ipynb │ ├── gliner-pii-redaction │ │ ├── README.md │ │ ├── gliner_pii_redaction.ipynb │ │ └── gliner_pii_redactor.py │ ├── llama-nemotron-data-curation │ │ ├── README.md │ │ ├── filters │ │ │ ├── heuristic_filters.py │ │ │ └── model_filters.py │ │ ├── main.py │ │ └── utils │ │ │ └── jsonl_utils.py │ ├── peft-curation │ │ ├── README.md │ │ ├── main.py │ │ └── stages.py │ └── tinystories │ │ ├── README.md │ │ ├── main.py │ │ └── stages.py └── video │ ├── README.md │ └── getting-started │ ├── README.md │ └── video_split_clip_example.py └── uv.lock /.gitattributes: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/.gitattributes -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/.github/PULL_REQUEST_TEMPLATE.md -------------------------------------------------------------------------------- /.github/actions/test-template/action.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/.github/actions/test-template/action.yml -------------------------------------------------------------------------------- /.github/copilot-instructions.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/.github/copilot-instructions.md -------------------------------------------------------------------------------- /.github/copy-pr-bot.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/.github/copy-pr-bot.yaml -------------------------------------------------------------------------------- /.github/workflows/build-docs.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/.github/workflows/build-docs.yml -------------------------------------------------------------------------------- /.github/workflows/build-test-publish-wheel.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/.github/workflows/build-test-publish-wheel.yml -------------------------------------------------------------------------------- /.github/workflows/cherry-pick-release-commit.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/.github/workflows/cherry-pick-release-commit.yml -------------------------------------------------------------------------------- /.github/workflows/cicd-main.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/.github/workflows/cicd-main.yml -------------------------------------------------------------------------------- /.github/workflows/close-inactive-issue-pr.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/.github/workflows/close-inactive-issue-pr.yml -------------------------------------------------------------------------------- /.github/workflows/code-linting.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/.github/workflows/code-linting.yml -------------------------------------------------------------------------------- /.github/workflows/community-bot.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/.github/workflows/community-bot.yml -------------------------------------------------------------------------------- /.github/workflows/release-freeze.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/.github/workflows/release-freeze.yml -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/.github/workflows/release.yml -------------------------------------------------------------------------------- /.github/workflows/ruff.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/.github/workflows/ruff.yml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/.gitignore -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.markdownlint.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/.markdownlint.json -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/.pre-commit-config.yaml -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/CHANGELOG.md -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/CITATION.cff -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/CONTRIBUTING.md -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/LICENSE -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/Makefile -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/README.md -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/SECURITY.md -------------------------------------------------------------------------------- /api-design.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/api-design.md -------------------------------------------------------------------------------- /benchmarking/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/benchmarking/Dockerfile -------------------------------------------------------------------------------- /benchmarking/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/benchmarking/README.md -------------------------------------------------------------------------------- /benchmarking/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /benchmarking/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/benchmarking/config.yaml -------------------------------------------------------------------------------- /benchmarking/dummy-config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/benchmarking/dummy-config.yaml -------------------------------------------------------------------------------- /benchmarking/nightly-benchmark.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/benchmarking/nightly-benchmark.yaml -------------------------------------------------------------------------------- /benchmarking/run.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/benchmarking/run.py -------------------------------------------------------------------------------- /benchmarking/runner/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /benchmarking/runner/datasets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/benchmarking/runner/datasets.py -------------------------------------------------------------------------------- /benchmarking/runner/entry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/benchmarking/runner/entry.py -------------------------------------------------------------------------------- /benchmarking/runner/env_capture.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/benchmarking/runner/env_capture.py -------------------------------------------------------------------------------- /benchmarking/runner/path_resolver.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/benchmarking/runner/path_resolver.py -------------------------------------------------------------------------------- /benchmarking/runner/process.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/benchmarking/runner/process.py -------------------------------------------------------------------------------- /benchmarking/runner/ray_cluster.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/benchmarking/runner/ray_cluster.py -------------------------------------------------------------------------------- /benchmarking/runner/session.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/benchmarking/runner/session.py -------------------------------------------------------------------------------- /benchmarking/runner/sinks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /benchmarking/runner/sinks/gdrive_sink.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/benchmarking/runner/sinks/gdrive_sink.py -------------------------------------------------------------------------------- /benchmarking/runner/sinks/mlflow_sink.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/benchmarking/runner/sinks/mlflow_sink.py -------------------------------------------------------------------------------- /benchmarking/runner/sinks/sink.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/benchmarking/runner/sinks/sink.py -------------------------------------------------------------------------------- /benchmarking/runner/sinks/slack_sink.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/benchmarking/runner/sinks/slack_sink.py -------------------------------------------------------------------------------- /benchmarking/runner/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/benchmarking/runner/utils.py -------------------------------------------------------------------------------- /benchmarking/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /benchmarking/scripts/common_crawl_benchmark.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/benchmarking/scripts/common_crawl_benchmark.py -------------------------------------------------------------------------------- /benchmarking/scripts/dedup_removal_benchmark.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/benchmarking/scripts/dedup_removal_benchmark.py -------------------------------------------------------------------------------- /benchmarking/scripts/domain_classification_benchmark.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/benchmarking/scripts/domain_classification_benchmark.py -------------------------------------------------------------------------------- /benchmarking/scripts/dummy_benchmark.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/benchmarking/scripts/dummy_benchmark.py -------------------------------------------------------------------------------- /benchmarking/scripts/embedding_generation_benchmark.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/benchmarking/scripts/embedding_generation_benchmark.py -------------------------------------------------------------------------------- /benchmarking/scripts/fuzzy_dedup_identification_benchmark.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/benchmarking/scripts/fuzzy_dedup_identification_benchmark.py -------------------------------------------------------------------------------- /benchmarking/tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /benchmarking/tools/build_docker.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/benchmarking/tools/build_docker.sh -------------------------------------------------------------------------------- /benchmarking/tools/gen_runscript_vars.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/benchmarking/tools/gen_runscript_vars.py -------------------------------------------------------------------------------- /benchmarking/tools/run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/benchmarking/tools/run.sh -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/codecov.yml -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docker/Dockerfile -------------------------------------------------------------------------------- /docker/common/install_ffmpeg.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docker/common/install_ffmpeg.sh -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/README.md -------------------------------------------------------------------------------- /docs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/__init__.py -------------------------------------------------------------------------------- /docs/_extensions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/__init__.py -------------------------------------------------------------------------------- /docs/_extensions/ai_assistant/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/ai_assistant/README.md -------------------------------------------------------------------------------- /docs/_extensions/ai_assistant/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/ai_assistant/__init__.py -------------------------------------------------------------------------------- /docs/_extensions/ai_assistant/assets/styles/ai-assistant.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/ai_assistant/assets/styles/ai-assistant.css -------------------------------------------------------------------------------- /docs/_extensions/ai_assistant/core/AIClient.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/ai_assistant/core/AIClient.js -------------------------------------------------------------------------------- /docs/_extensions/ai_assistant/core/ResponseProcessor.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/ai_assistant/core/ResponseProcessor.js -------------------------------------------------------------------------------- /docs/_extensions/ai_assistant/core/main.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/ai_assistant/core/main.js -------------------------------------------------------------------------------- /docs/_extensions/ai_assistant/integrations/search-integration.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/ai_assistant/integrations/search-integration.js -------------------------------------------------------------------------------- /docs/_extensions/ai_assistant/ui/MarkdownProcessor.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/ai_assistant/ui/MarkdownProcessor.js -------------------------------------------------------------------------------- /docs/_extensions/ai_assistant/ui/ResponseRenderer.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/ai_assistant/ui/ResponseRenderer.js -------------------------------------------------------------------------------- /docs/_extensions/content_gating/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/content_gating/README.md -------------------------------------------------------------------------------- /docs/_extensions/content_gating/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/content_gating/__init__.py -------------------------------------------------------------------------------- /docs/_extensions/content_gating/condition_evaluator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/content_gating/condition_evaluator.py -------------------------------------------------------------------------------- /docs/_extensions/content_gating/conditional_directives.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/content_gating/conditional_directives.py -------------------------------------------------------------------------------- /docs/_extensions/content_gating/document_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/content_gating/document_filter.py -------------------------------------------------------------------------------- /docs/_extensions/json_output/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/json_output/README.md -------------------------------------------------------------------------------- /docs/_extensions/json_output/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/json_output/__init__.py -------------------------------------------------------------------------------- /docs/_extensions/json_output/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/json_output/config.py -------------------------------------------------------------------------------- /docs/_extensions/json_output/content/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/json_output/content/__init__.py -------------------------------------------------------------------------------- /docs/_extensions/json_output/content/extractor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/json_output/content/extractor.py -------------------------------------------------------------------------------- /docs/_extensions/json_output/content/metadata.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/json_output/content/metadata.py -------------------------------------------------------------------------------- /docs/_extensions/json_output/content/structured.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/json_output/content/structured.py -------------------------------------------------------------------------------- /docs/_extensions/json_output/content/text.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/json_output/content/text.py -------------------------------------------------------------------------------- /docs/_extensions/json_output/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/json_output/core/__init__.py -------------------------------------------------------------------------------- /docs/_extensions/json_output/core/builder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/json_output/core/builder.py -------------------------------------------------------------------------------- /docs/_extensions/json_output/core/document_discovery.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/json_output/core/document_discovery.py -------------------------------------------------------------------------------- /docs/_extensions/json_output/core/hierarchy_builder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/json_output/core/hierarchy_builder.py -------------------------------------------------------------------------------- /docs/_extensions/json_output/core/json_formatter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/json_output/core/json_formatter.py -------------------------------------------------------------------------------- /docs/_extensions/json_output/core/json_writer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/json_output/core/json_writer.py -------------------------------------------------------------------------------- /docs/_extensions/json_output/processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/json_output/processing/__init__.py -------------------------------------------------------------------------------- /docs/_extensions/json_output/processing/cache.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/json_output/processing/cache.py -------------------------------------------------------------------------------- /docs/_extensions/json_output/processing/processor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/json_output/processing/processor.py -------------------------------------------------------------------------------- /docs/_extensions/json_output/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/json_output/utils.py -------------------------------------------------------------------------------- /docs/_extensions/myst_codeblock_substitutions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/myst_codeblock_substitutions.py -------------------------------------------------------------------------------- /docs/_extensions/rich_metadata/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/rich_metadata/__init__.py -------------------------------------------------------------------------------- /docs/_extensions/rich_metadata/templates/layout.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/rich_metadata/templates/layout.html -------------------------------------------------------------------------------- /docs/_extensions/rich_metadata/verify_metadata.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/rich_metadata/verify_metadata.py -------------------------------------------------------------------------------- /docs/_extensions/search_assets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/search_assets/__init__.py -------------------------------------------------------------------------------- /docs/_extensions/search_assets/enhanced-search.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/search_assets/enhanced-search.css -------------------------------------------------------------------------------- /docs/_extensions/search_assets/main.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/search_assets/main.js -------------------------------------------------------------------------------- /docs/_extensions/search_assets/modules/DocumentLoader.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/search_assets/modules/DocumentLoader.js -------------------------------------------------------------------------------- /docs/_extensions/search_assets/modules/EventHandler.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/search_assets/modules/EventHandler.js -------------------------------------------------------------------------------- /docs/_extensions/search_assets/modules/ResultRenderer.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/search_assets/modules/ResultRenderer.js -------------------------------------------------------------------------------- /docs/_extensions/search_assets/modules/SearchEngine.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/search_assets/modules/SearchEngine.js -------------------------------------------------------------------------------- /docs/_extensions/search_assets/modules/SearchInterface.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/search_assets/modules/SearchInterface.js -------------------------------------------------------------------------------- /docs/_extensions/search_assets/modules/SearchPageManager.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/search_assets/modules/SearchPageManager.js -------------------------------------------------------------------------------- /docs/_extensions/search_assets/modules/Utils.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/search_assets/modules/Utils.js -------------------------------------------------------------------------------- /docs/_extensions/search_assets/templates/search.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_extensions/search_assets/templates/search.html -------------------------------------------------------------------------------- /docs/_images/ablation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_images/ablation.png -------------------------------------------------------------------------------- /docs/_images/scaling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_images/scaling.png -------------------------------------------------------------------------------- /docs/_images/text-benchmarks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_images/text-benchmarks.png -------------------------------------------------------------------------------- /docs/_templates/autodoc2_index.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/_templates/autodoc2_index.rst -------------------------------------------------------------------------------- /docs/about/concepts/audio/asr-pipeline.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/about/concepts/audio/asr-pipeline.md -------------------------------------------------------------------------------- /docs/about/concepts/audio/audio-batch.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/about/concepts/audio/audio-batch.md -------------------------------------------------------------------------------- /docs/about/concepts/audio/curation-pipeline.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/about/concepts/audio/curation-pipeline.md -------------------------------------------------------------------------------- /docs/about/concepts/audio/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/about/concepts/audio/index.md -------------------------------------------------------------------------------- /docs/about/concepts/audio/manifests-ingest.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/about/concepts/audio/manifests-ingest.md -------------------------------------------------------------------------------- /docs/about/concepts/audio/quality-metrics.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/about/concepts/audio/quality-metrics.md -------------------------------------------------------------------------------- /docs/about/concepts/audio/text-integration.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/about/concepts/audio/text-integration.md -------------------------------------------------------------------------------- /docs/about/concepts/deduplication.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/about/concepts/deduplication.md -------------------------------------------------------------------------------- /docs/about/concepts/image/data-export-concepts.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/about/concepts/image/data-export-concepts.md -------------------------------------------------------------------------------- /docs/about/concepts/image/data-loading-concepts.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/about/concepts/image/data-loading-concepts.md -------------------------------------------------------------------------------- /docs/about/concepts/image/data-processing-concepts.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/about/concepts/image/data-processing-concepts.md -------------------------------------------------------------------------------- /docs/about/concepts/image/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/about/concepts/image/index.md -------------------------------------------------------------------------------- /docs/about/concepts/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/about/concepts/index.md -------------------------------------------------------------------------------- /docs/about/concepts/text/_images/text-processing-diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/about/concepts/text/_images/text-processing-diagram.png -------------------------------------------------------------------------------- /docs/about/concepts/text/data-acquisition-concepts.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/about/concepts/text/data-acquisition-concepts.md -------------------------------------------------------------------------------- /docs/about/concepts/text/data-curation-pipeline.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/about/concepts/text/data-curation-pipeline.md -------------------------------------------------------------------------------- /docs/about/concepts/text/data-loading-concepts.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/about/concepts/text/data-loading-concepts.md -------------------------------------------------------------------------------- /docs/about/concepts/text/data-processing-concepts.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/about/concepts/text/data-processing-concepts.md -------------------------------------------------------------------------------- /docs/about/concepts/text/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/about/concepts/text/index.md -------------------------------------------------------------------------------- /docs/about/concepts/video/_images/stages-pipelines-diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/about/concepts/video/_images/stages-pipelines-diagram.png -------------------------------------------------------------------------------- /docs/about/concepts/video/_images/video-pipeline-diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/about/concepts/video/_images/video-pipeline-diagram.png -------------------------------------------------------------------------------- /docs/about/concepts/video/abstractions.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/about/concepts/video/abstractions.md -------------------------------------------------------------------------------- /docs/about/concepts/video/architecture.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/about/concepts/video/architecture.md -------------------------------------------------------------------------------- /docs/about/concepts/video/data-flow.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/about/concepts/video/data-flow.md -------------------------------------------------------------------------------- /docs/about/concepts/video/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/about/concepts/video/index.md -------------------------------------------------------------------------------- /docs/about/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/about/index.md -------------------------------------------------------------------------------- /docs/about/key-features.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/about/key-features.md -------------------------------------------------------------------------------- /docs/about/release-notes/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/about/release-notes/index.md -------------------------------------------------------------------------------- /docs/about/release-notes/migration-faq.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/about/release-notes/migration-faq.md -------------------------------------------------------------------------------- /docs/about/release-notes/migration-guide.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/about/release-notes/migration-guide.md -------------------------------------------------------------------------------- /docs/admin/deployment/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/admin/deployment/index.md -------------------------------------------------------------------------------- /docs/admin/deployment/requirements.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/admin/deployment/requirements.md -------------------------------------------------------------------------------- /docs/admin/deployment/slurm/image.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/admin/deployment/slurm/image.md -------------------------------------------------------------------------------- /docs/admin/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/admin/index.md -------------------------------------------------------------------------------- /docs/admin/installation.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/admin/installation.md -------------------------------------------------------------------------------- /docs/admin/integrations/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/admin/integrations/index.md -------------------------------------------------------------------------------- /docs/broken_links_needing_review.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/broken_links_needing_review.json -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/conf.py -------------------------------------------------------------------------------- /docs/curate-audio/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-audio/index.md -------------------------------------------------------------------------------- /docs/curate-audio/load-data/custom-manifests.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-audio/load-data/custom-manifests.md -------------------------------------------------------------------------------- /docs/curate-audio/load-data/fleurs-dataset.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-audio/load-data/fleurs-dataset.md -------------------------------------------------------------------------------- /docs/curate-audio/load-data/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-audio/load-data/index.md -------------------------------------------------------------------------------- /docs/curate-audio/load-data/local-files.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-audio/load-data/local-files.md -------------------------------------------------------------------------------- /docs/curate-audio/process-data/asr-inference/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-audio/process-data/asr-inference/index.md -------------------------------------------------------------------------------- /docs/curate-audio/process-data/asr-inference/nemo-models.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-audio/process-data/asr-inference/nemo-models.md -------------------------------------------------------------------------------- /docs/curate-audio/process-data/audio-analysis/duration-calculation.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-audio/process-data/audio-analysis/duration-calculation.md -------------------------------------------------------------------------------- /docs/curate-audio/process-data/audio-analysis/format-validation.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-audio/process-data/audio-analysis/format-validation.md -------------------------------------------------------------------------------- /docs/curate-audio/process-data/audio-analysis/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-audio/process-data/audio-analysis/index.md -------------------------------------------------------------------------------- /docs/curate-audio/process-data/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-audio/process-data/index.md -------------------------------------------------------------------------------- /docs/curate-audio/process-data/quality-assessment/duration-filtering.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-audio/process-data/quality-assessment/duration-filtering.md -------------------------------------------------------------------------------- /docs/curate-audio/process-data/quality-assessment/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-audio/process-data/quality-assessment/index.md -------------------------------------------------------------------------------- /docs/curate-audio/process-data/quality-assessment/wer-filtering.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-audio/process-data/quality-assessment/wer-filtering.md -------------------------------------------------------------------------------- /docs/curate-audio/process-data/text-integration/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-audio/process-data/text-integration/index.md -------------------------------------------------------------------------------- /docs/curate-audio/save-export.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-audio/save-export.md -------------------------------------------------------------------------------- /docs/curate-audio/tutorials/beginner.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-audio/tutorials/beginner.md -------------------------------------------------------------------------------- /docs/curate-audio/tutorials/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-audio/tutorials/index.md -------------------------------------------------------------------------------- /docs/curate-images/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-images/index.md -------------------------------------------------------------------------------- /docs/curate-images/load-data/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-images/load-data/index.md -------------------------------------------------------------------------------- /docs/curate-images/load-data/tar-archives.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-images/load-data/tar-archives.md -------------------------------------------------------------------------------- /docs/curate-images/process-data/embeddings/clip-embedder.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-images/process-data/embeddings/clip-embedder.md -------------------------------------------------------------------------------- /docs/curate-images/process-data/embeddings/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-images/process-data/embeddings/index.md -------------------------------------------------------------------------------- /docs/curate-images/process-data/filters/aesthetic.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-images/process-data/filters/aesthetic.md -------------------------------------------------------------------------------- /docs/curate-images/process-data/filters/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-images/process-data/filters/index.md -------------------------------------------------------------------------------- /docs/curate-images/process-data/filters/nsfw.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-images/process-data/filters/nsfw.md -------------------------------------------------------------------------------- /docs/curate-images/process-data/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-images/process-data/index.md -------------------------------------------------------------------------------- /docs/curate-images/save-export.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-images/save-export.md -------------------------------------------------------------------------------- /docs/curate-images/tutorials/beginner.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-images/tutorials/beginner.md -------------------------------------------------------------------------------- /docs/curate-images/tutorials/dedup-workflow.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-images/tutorials/dedup-workflow.md -------------------------------------------------------------------------------- /docs/curate-images/tutorials/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-images/tutorials/index.md -------------------------------------------------------------------------------- /docs/curate-text/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-text/index.md -------------------------------------------------------------------------------- /docs/curate-text/load-data/arxiv.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-text/load-data/arxiv.md -------------------------------------------------------------------------------- /docs/curate-text/load-data/common-crawl.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-text/load-data/common-crawl.md -------------------------------------------------------------------------------- /docs/curate-text/load-data/custom.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-text/load-data/custom.md -------------------------------------------------------------------------------- /docs/curate-text/load-data/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-text/load-data/index.md -------------------------------------------------------------------------------- /docs/curate-text/load-data/read-existing.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-text/load-data/read-existing.md -------------------------------------------------------------------------------- /docs/curate-text/load-data/wikipedia.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-text/load-data/wikipedia.md -------------------------------------------------------------------------------- /docs/curate-text/process-data/content-processing/add-id.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-text/process-data/content-processing/add-id.md -------------------------------------------------------------------------------- /docs/curate-text/process-data/content-processing/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-text/process-data/content-processing/index.md -------------------------------------------------------------------------------- /docs/curate-text/process-data/content-processing/text-cleaning.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-text/process-data/content-processing/text-cleaning.md -------------------------------------------------------------------------------- /docs/curate-text/process-data/deduplication/exact.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-text/process-data/deduplication/exact.md -------------------------------------------------------------------------------- /docs/curate-text/process-data/deduplication/fuzzy.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-text/process-data/deduplication/fuzzy.md -------------------------------------------------------------------------------- /docs/curate-text/process-data/deduplication/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-text/process-data/deduplication/index.md -------------------------------------------------------------------------------- /docs/curate-text/process-data/deduplication/semdedup.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-text/process-data/deduplication/semdedup.md -------------------------------------------------------------------------------- /docs/curate-text/process-data/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-text/process-data/index.md -------------------------------------------------------------------------------- /docs/curate-text/process-data/language-management/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-text/process-data/language-management/index.md -------------------------------------------------------------------------------- /docs/curate-text/process-data/language-management/language.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-text/process-data/language-management/language.md -------------------------------------------------------------------------------- /docs/curate-text/process-data/language-management/stopwords.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-text/process-data/language-management/stopwords.md -------------------------------------------------------------------------------- /docs/curate-text/process-data/quality-assessment/classifier.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-text/process-data/quality-assessment/classifier.md -------------------------------------------------------------------------------- /docs/curate-text/process-data/quality-assessment/distributed-classifier.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-text/process-data/quality-assessment/distributed-classifier.md -------------------------------------------------------------------------------- /docs/curate-text/process-data/quality-assessment/heuristic.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-text/process-data/quality-assessment/heuristic.md -------------------------------------------------------------------------------- /docs/curate-text/process-data/quality-assessment/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-text/process-data/quality-assessment/index.md -------------------------------------------------------------------------------- /docs/curate-text/process-data/specialized-processing/code.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-text/process-data/specialized-processing/code.md -------------------------------------------------------------------------------- /docs/curate-text/process-data/specialized-processing/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-text/process-data/specialized-processing/index.md -------------------------------------------------------------------------------- /docs/curate-text/tutorials/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-text/tutorials/index.md -------------------------------------------------------------------------------- /docs/curate-video/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-video/index.md -------------------------------------------------------------------------------- /docs/curate-video/load-data/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-video/load-data/index.md -------------------------------------------------------------------------------- /docs/curate-video/process-data/captions-preview.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-video/process-data/captions-preview.md -------------------------------------------------------------------------------- /docs/curate-video/process-data/clipping.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-video/process-data/clipping.md -------------------------------------------------------------------------------- /docs/curate-video/process-data/dedup.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-video/process-data/dedup.md -------------------------------------------------------------------------------- /docs/curate-video/process-data/embeddings.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-video/process-data/embeddings.md -------------------------------------------------------------------------------- /docs/curate-video/process-data/filtering.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-video/process-data/filtering.md -------------------------------------------------------------------------------- /docs/curate-video/process-data/frame-extraction.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-video/process-data/frame-extraction.md -------------------------------------------------------------------------------- /docs/curate-video/process-data/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-video/process-data/index.md -------------------------------------------------------------------------------- /docs/curate-video/process-data/transcoding.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-video/process-data/transcoding.md -------------------------------------------------------------------------------- /docs/curate-video/save-export.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-video/save-export.md -------------------------------------------------------------------------------- /docs/curate-video/tutorials/_images/dedup-plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-video/tutorials/_images/dedup-plot.png -------------------------------------------------------------------------------- /docs/curate-video/tutorials/beginner.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-video/tutorials/beginner.md -------------------------------------------------------------------------------- /docs/curate-video/tutorials/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-video/tutorials/index.md -------------------------------------------------------------------------------- /docs/curate-video/tutorials/pipeline-customization/add-cust-code.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-video/tutorials/pipeline-customization/add-cust-code.md -------------------------------------------------------------------------------- /docs/curate-video/tutorials/pipeline-customization/add-cust-env.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-video/tutorials/pipeline-customization/add-cust-env.md -------------------------------------------------------------------------------- /docs/curate-video/tutorials/pipeline-customization/add-cust-model.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-video/tutorials/pipeline-customization/add-cust-model.md -------------------------------------------------------------------------------- /docs/curate-video/tutorials/pipeline-customization/add-cust-stage.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-video/tutorials/pipeline-customization/add-cust-stage.md -------------------------------------------------------------------------------- /docs/curate-video/tutorials/pipeline-customization/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-video/tutorials/pipeline-customization/index.md -------------------------------------------------------------------------------- /docs/curate-video/tutorials/split-dedup.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/curate-video/tutorials/split-dedup.md -------------------------------------------------------------------------------- /docs/get-started/audio.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/get-started/audio.md -------------------------------------------------------------------------------- /docs/get-started/image.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/get-started/image.md -------------------------------------------------------------------------------- /docs/get-started/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/get-started/index.md -------------------------------------------------------------------------------- /docs/get-started/text.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/get-started/text.md -------------------------------------------------------------------------------- /docs/get-started/video.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/get-started/video.md -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/index.md -------------------------------------------------------------------------------- /docs/project.json: -------------------------------------------------------------------------------- 1 | {"name": "nemo-curator", "version": "25.09"} -------------------------------------------------------------------------------- /docs/reference/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/reference/index.md -------------------------------------------------------------------------------- /docs/reference/infrastructure/container-environments.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/reference/infrastructure/container-environments.md -------------------------------------------------------------------------------- /docs/reference/infrastructure/execution-backends.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/reference/infrastructure/execution-backends.md -------------------------------------------------------------------------------- /docs/reference/infrastructure/gpu-processing.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/reference/infrastructure/gpu-processing.md -------------------------------------------------------------------------------- /docs/reference/infrastructure/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/reference/infrastructure/index.md -------------------------------------------------------------------------------- /docs/reference/infrastructure/memory-management.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/reference/infrastructure/memory-management.md -------------------------------------------------------------------------------- /docs/reference/infrastructure/resumable-processing.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/reference/infrastructure/resumable-processing.md -------------------------------------------------------------------------------- /docs/reference/related-tools.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/reference/related-tools.md -------------------------------------------------------------------------------- /docs/versions1.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/docs/versions1.json -------------------------------------------------------------------------------- /external/intern_video2_installation.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/external/intern_video2_installation.sh -------------------------------------------------------------------------------- /external/intern_video2_multimodal.patch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/external/intern_video2_multimodal.patch -------------------------------------------------------------------------------- /nemo_curator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/__init__.py -------------------------------------------------------------------------------- /nemo_curator/backends/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/backends/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/backends/base.py -------------------------------------------------------------------------------- /nemo_curator/backends/experimental/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/backends/experimental/ray_actor_pool/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/backends/experimental/ray_actor_pool/__init__.py -------------------------------------------------------------------------------- /nemo_curator/backends/experimental/ray_actor_pool/adapter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/backends/experimental/ray_actor_pool/adapter.py -------------------------------------------------------------------------------- /nemo_curator/backends/experimental/ray_actor_pool/executor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/backends/experimental/ray_actor_pool/executor.py -------------------------------------------------------------------------------- /nemo_curator/backends/experimental/ray_actor_pool/raft_adapter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/backends/experimental/ray_actor_pool/raft_adapter.py -------------------------------------------------------------------------------- /nemo_curator/backends/experimental/ray_actor_pool/shuffle_adapter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/backends/experimental/ray_actor_pool/shuffle_adapter.py -------------------------------------------------------------------------------- /nemo_curator/backends/experimental/ray_actor_pool/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/backends/experimental/ray_actor_pool/utils.py -------------------------------------------------------------------------------- /nemo_curator/backends/experimental/ray_data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/backends/experimental/ray_data/__init__.py -------------------------------------------------------------------------------- /nemo_curator/backends/experimental/ray_data/adapter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/backends/experimental/ray_data/adapter.py -------------------------------------------------------------------------------- /nemo_curator/backends/experimental/ray_data/executor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/backends/experimental/ray_data/executor.py -------------------------------------------------------------------------------- /nemo_curator/backends/experimental/ray_data/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/backends/experimental/ray_data/utils.py -------------------------------------------------------------------------------- /nemo_curator/backends/experimental/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/backends/experimental/utils.py -------------------------------------------------------------------------------- /nemo_curator/backends/internal/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/backends/internal/raft/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/backends/internal/raft/ray_comms.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/backends/internal/raft/ray_comms.py -------------------------------------------------------------------------------- /nemo_curator/backends/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/backends/utils.py -------------------------------------------------------------------------------- /nemo_curator/backends/xenna/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/backends/xenna/__init__.py -------------------------------------------------------------------------------- /nemo_curator/backends/xenna/adapter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/backends/xenna/adapter.py -------------------------------------------------------------------------------- /nemo_curator/backends/xenna/executor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/backends/xenna/executor.py -------------------------------------------------------------------------------- /nemo_curator/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/config.py -------------------------------------------------------------------------------- /nemo_curator/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/core/client.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/core/client.py -------------------------------------------------------------------------------- /nemo_curator/core/constants.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/core/constants.py -------------------------------------------------------------------------------- /nemo_curator/core/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/core/utils.py -------------------------------------------------------------------------------- /nemo_curator/metrics/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/metrics/README.md -------------------------------------------------------------------------------- /nemo_curator/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/metrics/constants.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/metrics/constants.py -------------------------------------------------------------------------------- /nemo_curator/metrics/start_prometheus_grafana.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/metrics/start_prometheus_grafana.py -------------------------------------------------------------------------------- /nemo_curator/metrics/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/metrics/utils.py -------------------------------------------------------------------------------- /nemo_curator/metrics/xenna_grafana_dashboard.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/metrics/xenna_grafana_dashboard.json -------------------------------------------------------------------------------- /nemo_curator/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/models/aesthetics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/models/aesthetics.py -------------------------------------------------------------------------------- /nemo_curator/models/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/models/base.py -------------------------------------------------------------------------------- /nemo_curator/models/client/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/models/client/__init__.py -------------------------------------------------------------------------------- /nemo_curator/models/client/llm_client.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/models/client/llm_client.py -------------------------------------------------------------------------------- /nemo_curator/models/client/openai_client.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/models/client/openai_client.py -------------------------------------------------------------------------------- /nemo_curator/models/clip.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/models/clip.py -------------------------------------------------------------------------------- /nemo_curator/models/cosmos_embed1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/models/cosmos_embed1.py -------------------------------------------------------------------------------- /nemo_curator/models/internvideo2_mm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/models/internvideo2_mm.py -------------------------------------------------------------------------------- /nemo_curator/models/nsfw.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/models/nsfw.py -------------------------------------------------------------------------------- /nemo_curator/models/prompt_formatter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/models/prompt_formatter.py -------------------------------------------------------------------------------- /nemo_curator/models/qwen_lm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/models/qwen_lm.py -------------------------------------------------------------------------------- /nemo_curator/models/qwen_vl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/models/qwen_vl.py -------------------------------------------------------------------------------- /nemo_curator/models/transnetv2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/models/transnetv2.py -------------------------------------------------------------------------------- /nemo_curator/package_info.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/package_info.py -------------------------------------------------------------------------------- /nemo_curator/pipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/pipeline/__init__.py -------------------------------------------------------------------------------- /nemo_curator/pipeline/pipeline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/pipeline/pipeline.py -------------------------------------------------------------------------------- /nemo_curator/stages/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/stages/audio/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/stages/audio/common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/audio/common.py -------------------------------------------------------------------------------- /nemo_curator/stages/audio/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/stages/audio/datasets/file_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/audio/datasets/file_utils.py -------------------------------------------------------------------------------- /nemo_curator/stages/audio/datasets/fleurs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/stages/audio/datasets/fleurs/create_initial_manifest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/audio/datasets/fleurs/create_initial_manifest.py -------------------------------------------------------------------------------- /nemo_curator/stages/audio/inference/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/stages/audio/inference/asr_nemo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/audio/inference/asr_nemo.py -------------------------------------------------------------------------------- /nemo_curator/stages/audio/io/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/stages/audio/io/convert.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/audio/io/convert.py -------------------------------------------------------------------------------- /nemo_curator/stages/audio/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/stages/audio/metrics/get_wer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/audio/metrics/get_wer.py -------------------------------------------------------------------------------- /nemo_curator/stages/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/base.py -------------------------------------------------------------------------------- /nemo_curator/stages/client_partitioning.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/client_partitioning.py -------------------------------------------------------------------------------- /nemo_curator/stages/deduplication/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/stages/deduplication/exact/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/deduplication/exact/__init__.py -------------------------------------------------------------------------------- /nemo_curator/stages/deduplication/exact/identification.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/deduplication/exact/identification.py -------------------------------------------------------------------------------- /nemo_curator/stages/deduplication/exact/workflow.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/deduplication/exact/workflow.py -------------------------------------------------------------------------------- /nemo_curator/stages/deduplication/fuzzy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/deduplication/fuzzy/__init__.py -------------------------------------------------------------------------------- /nemo_curator/stages/deduplication/fuzzy/buckets_to_edges.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/deduplication/fuzzy/buckets_to_edges.py -------------------------------------------------------------------------------- /nemo_curator/stages/deduplication/fuzzy/connected_components.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/deduplication/fuzzy/connected_components.py -------------------------------------------------------------------------------- /nemo_curator/stages/deduplication/fuzzy/identify_duplicates.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/deduplication/fuzzy/identify_duplicates.py -------------------------------------------------------------------------------- /nemo_curator/stages/deduplication/fuzzy/lsh/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/stages/deduplication/fuzzy/lsh/lsh.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/deduplication/fuzzy/lsh/lsh.py -------------------------------------------------------------------------------- /nemo_curator/stages/deduplication/fuzzy/lsh/stage.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/deduplication/fuzzy/lsh/stage.py -------------------------------------------------------------------------------- /nemo_curator/stages/deduplication/fuzzy/minhash.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/deduplication/fuzzy/minhash.py -------------------------------------------------------------------------------- /nemo_curator/stages/deduplication/fuzzy/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/deduplication/fuzzy/utils.py -------------------------------------------------------------------------------- /nemo_curator/stages/deduplication/fuzzy/workflow.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/deduplication/fuzzy/workflow.py -------------------------------------------------------------------------------- /nemo_curator/stages/deduplication/gpu_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/deduplication/gpu_utils.py -------------------------------------------------------------------------------- /nemo_curator/stages/deduplication/id_generator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/deduplication/id_generator.py -------------------------------------------------------------------------------- /nemo_curator/stages/deduplication/io_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/deduplication/io_utils.py -------------------------------------------------------------------------------- /nemo_curator/stages/deduplication/semantic/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/deduplication/semantic/__init__.py -------------------------------------------------------------------------------- /nemo_curator/stages/deduplication/semantic/identify_duplicates.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/deduplication/semantic/identify_duplicates.py -------------------------------------------------------------------------------- /nemo_curator/stages/deduplication/semantic/kmeans.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/deduplication/semantic/kmeans.py -------------------------------------------------------------------------------- /nemo_curator/stages/deduplication/semantic/pairwise.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/deduplication/semantic/pairwise.py -------------------------------------------------------------------------------- /nemo_curator/stages/deduplication/semantic/pairwise_io.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/deduplication/semantic/pairwise_io.py -------------------------------------------------------------------------------- /nemo_curator/stages/deduplication/semantic/ranking.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/deduplication/semantic/ranking.py -------------------------------------------------------------------------------- /nemo_curator/stages/deduplication/semantic/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/deduplication/semantic/utils.py -------------------------------------------------------------------------------- /nemo_curator/stages/deduplication/semantic/workflow.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/deduplication/semantic/workflow.py -------------------------------------------------------------------------------- /nemo_curator/stages/deduplication/shuffle_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/stages/deduplication/shuffle_utils/rapidsmpf_shuffler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/deduplication/shuffle_utils/rapidsmpf_shuffler.py -------------------------------------------------------------------------------- /nemo_curator/stages/deduplication/shuffle_utils/stage.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/deduplication/shuffle_utils/stage.py -------------------------------------------------------------------------------- /nemo_curator/stages/file_partitioning.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/file_partitioning.py -------------------------------------------------------------------------------- /nemo_curator/stages/function_decorators.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/function_decorators.py -------------------------------------------------------------------------------- /nemo_curator/stages/image/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/stages/image/deduplication/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/stages/image/deduplication/removal.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/image/deduplication/removal.py -------------------------------------------------------------------------------- /nemo_curator/stages/image/embedders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/image/embedders/__init__.py -------------------------------------------------------------------------------- /nemo_curator/stages/image/embedders/clip_embedder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/image/embedders/clip_embedder.py -------------------------------------------------------------------------------- /nemo_curator/stages/image/filters/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/image/filters/__init__.py -------------------------------------------------------------------------------- /nemo_curator/stages/image/filters/aesthetic_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/image/filters/aesthetic_filter.py -------------------------------------------------------------------------------- /nemo_curator/stages/image/filters/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/image/filters/base.py -------------------------------------------------------------------------------- /nemo_curator/stages/image/filters/nsfw_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/image/filters/nsfw_filter.py -------------------------------------------------------------------------------- /nemo_curator/stages/image/io/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/image/io/__init__.py -------------------------------------------------------------------------------- /nemo_curator/stages/image/io/convert.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/image/io/convert.py -------------------------------------------------------------------------------- /nemo_curator/stages/image/io/image_reader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/image/io/image_reader.py -------------------------------------------------------------------------------- /nemo_curator/stages/image/io/image_writer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/image/io/image_writer.py -------------------------------------------------------------------------------- /nemo_curator/stages/resources.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/resources.py -------------------------------------------------------------------------------- /nemo_curator/stages/synthetic/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/stages/synthetic/nemotron_cc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/stages/synthetic/nemotron_cc/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/synthetic/nemotron_cc/base.py -------------------------------------------------------------------------------- /nemo_curator/stages/synthetic/nemotron_cc/nemotron_cc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/synthetic/nemotron_cc/nemotron_cc.py -------------------------------------------------------------------------------- /nemo_curator/stages/synthetic/nemotron_cc/prompts.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/synthetic/nemotron_cc/prompts.py -------------------------------------------------------------------------------- /nemo_curator/stages/synthetic/qa_multilingual_synthetic.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/synthetic/qa_multilingual_synthetic.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/stages/text/classifiers/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/classifiers/README.md -------------------------------------------------------------------------------- /nemo_curator/stages/text/classifiers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/classifiers/__init__.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/classifiers/aegis.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/classifiers/aegis.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/classifiers/aegis_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/classifiers/aegis_utils.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/classifiers/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/classifiers/base.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/classifiers/constants.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/classifiers/constants.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/classifiers/content_type.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/classifiers/content_type.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/classifiers/domain.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/classifiers/domain.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/classifiers/fineweb_edu.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/classifiers/fineweb_edu.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/classifiers/prompt_task_complexity.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/classifiers/prompt_task_complexity.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/classifiers/quality.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/classifiers/quality.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/deduplication/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/deduplication/__init__.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/deduplication/removal.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/deduplication/removal.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/deduplication/removal_workflow.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/deduplication/removal_workflow.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/deduplication/semantic.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/deduplication/semantic.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/README.md -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/__init__.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/arxiv/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/arxiv/__init__.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/arxiv/download.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/arxiv/download.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/arxiv/extract.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/arxiv/extract.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/arxiv/iterator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/arxiv/iterator.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/arxiv/stage.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/arxiv/stage.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/arxiv/url_generation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/arxiv/url_generation.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/base/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/base/__init__.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/base/download.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/base/download.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/base/extract.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/base/extract.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/base/iterator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/base/iterator.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/base/stage.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/base/stage.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/base/url_generation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/base/url_generation.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/common_crawl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/common_crawl/__init__.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/common_crawl/download.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/common_crawl/download.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/common_crawl/extract.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/common_crawl/extract.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/common_crawl/stage.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/common_crawl/stage.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/common_crawl/url_generation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/common_crawl/url_generation.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/common_crawl/warc_iterator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/common_crawl/warc_iterator.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/html_extractors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/html_extractors/__init__.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/html_extractors/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/html_extractors/base.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/html_extractors/justext.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/html_extractors/justext.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/html_extractors/resiliparse.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/html_extractors/resiliparse.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/html_extractors/trafilatura.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/html_extractors/trafilatura.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/html_extractors/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/html_extractors/utils/__init__.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/html_extractors/utils/ja_stopwords.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/html_extractors/utils/ja_stopwords.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/html_extractors/utils/th_stopwords.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/html_extractors/utils/th_stopwords.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/html_extractors/utils/zh_stopwords.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/html_extractors/utils/zh_stopwords.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/utils.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/wikipedia/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/wikipedia/__init__.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/wikipedia/download.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/wikipedia/download.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/wikipedia/extract.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/wikipedia/extract.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/wikipedia/iterator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/wikipedia/iterator.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/wikipedia/stage.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/wikipedia/stage.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/download/wikipedia/url_generation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/download/wikipedia/url_generation.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/embedders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/embedders/__init__.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/embedders/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/embedders/base.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/embedders/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/embedders/utils.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/filters/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/filters/__init__.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/filters/code.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/filters/code.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/filters/doc_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/filters/doc_filter.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/filters/fasttext_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/filters/fasttext_filter.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/filters/heuristic_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/filters/heuristic_filter.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/io/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/stages/text/io/reader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/io/reader/__init__.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/io/reader/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/io/reader/base.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/io/reader/jsonl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/io/reader/jsonl.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/io/reader/parquet.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/io/reader/parquet.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/io/writer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/io/writer/__init__.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/io/writer/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/io/writer/base.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/io/writer/jsonl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/io/writer/jsonl.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/io/writer/parquet.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/io/writer/parquet.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/io/writer/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/io/writer/utils.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/stages/text/models/model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/models/model.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/models/tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/models/tokenizer.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/models/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/models/utils.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/modifiers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/modifiers/__init__.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/modifiers/c4.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/modifiers/c4.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/modifiers/doc_modifier.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/modifiers/doc_modifier.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/modifiers/fasttext.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/modifiers/fasttext.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/modifiers/line_remover.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/modifiers/line_remover.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/modifiers/markdown_remover.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/modifiers/markdown_remover.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/modifiers/newline_normalizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/modifiers/newline_normalizer.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/modifiers/quotation_remover.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/modifiers/quotation_remover.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/modifiers/slicer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/modifiers/slicer.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/modifiers/unicode_reformatter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/modifiers/unicode_reformatter.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/modifiers/url_remover.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/modifiers/url_remover.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/modules/__init__.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/modules/add_id.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/modules/add_id.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/modules/joiner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/modules/joiner.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/modules/modifier.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/modules/modifier.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/modules/score_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/modules/score_filter.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/modules/splitter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/modules/splitter.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/stages/text/utils/constants.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/utils/constants.py -------------------------------------------------------------------------------- /nemo_curator/stages/text/utils/text_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/text/utils/text_utils.py -------------------------------------------------------------------------------- /nemo_curator/stages/video/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/stages/video/caption/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/stages/video/caption/caption_enhancement.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/video/caption/caption_enhancement.py -------------------------------------------------------------------------------- /nemo_curator/stages/video/caption/caption_generation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/video/caption/caption_generation.py -------------------------------------------------------------------------------- /nemo_curator/stages/video/caption/caption_preparation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/video/caption/caption_preparation.py -------------------------------------------------------------------------------- /nemo_curator/stages/video/clipping/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/stages/video/clipping/clip_extraction_stages.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/video/clipping/clip_extraction_stages.py -------------------------------------------------------------------------------- /nemo_curator/stages/video/clipping/clip_frame_extraction.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/video/clipping/clip_frame_extraction.py -------------------------------------------------------------------------------- /nemo_curator/stages/video/clipping/transnetv2_extraction.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/video/clipping/transnetv2_extraction.py -------------------------------------------------------------------------------- /nemo_curator/stages/video/clipping/video_frame_extraction.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/video/clipping/video_frame_extraction.py -------------------------------------------------------------------------------- /nemo_curator/stages/video/embedding/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/stages/video/embedding/cosmos_embed1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/video/embedding/cosmos_embed1.py -------------------------------------------------------------------------------- /nemo_curator/stages/video/embedding/internvideo2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/video/embedding/internvideo2.py -------------------------------------------------------------------------------- /nemo_curator/stages/video/filtering/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/stages/video/filtering/clip_aesthetic_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/video/filtering/clip_aesthetic_filter.py -------------------------------------------------------------------------------- /nemo_curator/stages/video/filtering/motion_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/video/filtering/motion_filter.py -------------------------------------------------------------------------------- /nemo_curator/stages/video/filtering/motion_vector_backend.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/video/filtering/motion_vector_backend.py -------------------------------------------------------------------------------- /nemo_curator/stages/video/io/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/stages/video/io/clip_writer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/video/io/clip_writer.py -------------------------------------------------------------------------------- /nemo_curator/stages/video/io/video_reader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/video/io/video_reader.py -------------------------------------------------------------------------------- /nemo_curator/stages/video/preview/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/stages/video/preview/preview.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/stages/video/preview/preview.py -------------------------------------------------------------------------------- /nemo_curator/tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/tasks/__init__.py -------------------------------------------------------------------------------- /nemo_curator/tasks/audio_batch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/tasks/audio_batch.py -------------------------------------------------------------------------------- /nemo_curator/tasks/document.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/tasks/document.py -------------------------------------------------------------------------------- /nemo_curator/tasks/file_group.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/tasks/file_group.py -------------------------------------------------------------------------------- /nemo_curator/tasks/image.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/tasks/image.py -------------------------------------------------------------------------------- /nemo_curator/tasks/tasks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/tasks/tasks.py -------------------------------------------------------------------------------- /nemo_curator/tasks/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/tasks/utils.py -------------------------------------------------------------------------------- /nemo_curator/tasks/video.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/tasks/video.py -------------------------------------------------------------------------------- /nemo_curator/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nemo_curator/utils/client_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/utils/client_utils.py -------------------------------------------------------------------------------- /nemo_curator/utils/code_meta.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/utils/code_meta.csv -------------------------------------------------------------------------------- /nemo_curator/utils/column_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/utils/column_utils.py -------------------------------------------------------------------------------- /nemo_curator/utils/decoder_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/utils/decoder_utils.py -------------------------------------------------------------------------------- /nemo_curator/utils/file_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/utils/file_utils.py -------------------------------------------------------------------------------- /nemo_curator/utils/grouping.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/utils/grouping.py -------------------------------------------------------------------------------- /nemo_curator/utils/hf_download_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/utils/hf_download_utils.py -------------------------------------------------------------------------------- /nemo_curator/utils/nvcodec_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/utils/nvcodec_utils.py -------------------------------------------------------------------------------- /nemo_curator/utils/operation_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/utils/operation_utils.py -------------------------------------------------------------------------------- /nemo_curator/utils/performance_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/utils/performance_utils.py -------------------------------------------------------------------------------- /nemo_curator/utils/split_large_files.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/utils/split_large_files.py -------------------------------------------------------------------------------- /nemo_curator/utils/storage_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/utils/storage_utils.py -------------------------------------------------------------------------------- /nemo_curator/utils/windowing_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/utils/windowing_utils.py -------------------------------------------------------------------------------- /nemo_curator/utils/writer_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/nemo_curator/utils/writer_utils.py -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/pyproject.toml -------------------------------------------------------------------------------- /requirements-docs.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/requirements-docs.txt -------------------------------------------------------------------------------- /tests/L0_Unit_Test_GPU.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/L0_Unit_Test_GPU.sh -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/backends/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/backends/experimental/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/backends/experimental/ray_actor_pool/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/backends/experimental/ray_actor_pool/test_executor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/backends/experimental/ray_actor_pool/test_executor.py -------------------------------------------------------------------------------- /tests/backends/experimental/ray_data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/backends/experimental/ray_data/test_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/backends/experimental/ray_data/test_utils.py -------------------------------------------------------------------------------- /tests/backends/experimental/test_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/backends/experimental/test_utils.py -------------------------------------------------------------------------------- /tests/backends/test_integration.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/backends/test_integration.py -------------------------------------------------------------------------------- /tests/backends/test_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/backends/test_utils.py -------------------------------------------------------------------------------- /tests/backends/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/backends/utils.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/conftest.py -------------------------------------------------------------------------------- /tests/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/core/test_get_ray_client.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/core/test_get_ray_client.py -------------------------------------------------------------------------------- /tests/data/audio/armenian/fleurs/dev.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/data/audio/armenian/fleurs/dev.tar.gz -------------------------------------------------------------------------------- /tests/data/audio/armenian/fleurs/dev.tsv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/data/audio/armenian/fleurs/dev.tsv -------------------------------------------------------------------------------- /tests/data/audio/armenian/fleurs/test_data_reference.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/data/audio/armenian/fleurs/test_data_reference.json -------------------------------------------------------------------------------- /tests/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/models/client/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/models/client/test_llm_client.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/models/client/test_llm_client.py -------------------------------------------------------------------------------- /tests/models/client/test_openai_client.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/models/client/test_openai_client.py -------------------------------------------------------------------------------- /tests/models/test_aesthetics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/models/test_aesthetics.py -------------------------------------------------------------------------------- /tests/models/test_clip.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/models/test_clip.py -------------------------------------------------------------------------------- /tests/models/test_cosmos_embed1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/models/test_cosmos_embed1.py -------------------------------------------------------------------------------- /tests/models/test_internvideo2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/models/test_internvideo2.py -------------------------------------------------------------------------------- /tests/models/test_internvideo2_mm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/models/test_internvideo2_mm.py -------------------------------------------------------------------------------- /tests/models/test_prompt_formatter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/models/test_prompt_formatter.py -------------------------------------------------------------------------------- /tests/models/test_qwen_lm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/models/test_qwen_lm.py -------------------------------------------------------------------------------- /tests/models/test_qwen_vl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/models/test_qwen_vl.py -------------------------------------------------------------------------------- /tests/models/test_transnetv2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/models/test_transnetv2.py -------------------------------------------------------------------------------- /tests/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/pipelines/test_pipelines.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/pipelines/test_pipelines.py -------------------------------------------------------------------------------- /tests/stages/audio/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/audio/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/audio/datasets/test_fleurs_create_initial_manifest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/audio/datasets/test_fleurs_create_initial_manifest.py -------------------------------------------------------------------------------- /tests/stages/audio/inference/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/audio/inference/test_asr_nemo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/audio/inference/test_asr_nemo.py -------------------------------------------------------------------------------- /tests/stages/audio/io/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/audio/io/test_convert.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/audio/io/test_convert.py -------------------------------------------------------------------------------- /tests/stages/audio/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/audio/metrics/test_get_wer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/audio/metrics/test_get_wer.py -------------------------------------------------------------------------------- /tests/stages/audio/test_common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/audio/test_common.py -------------------------------------------------------------------------------- /tests/stages/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/common/test_base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/common/test_base.py -------------------------------------------------------------------------------- /tests/stages/common/test_client_partitioning.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/common/test_client_partitioning.py -------------------------------------------------------------------------------- /tests/stages/common/test_file_partitioning.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/common/test_file_partitioning.py -------------------------------------------------------------------------------- /tests/stages/common/test_function_decorators.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/common/test_function_decorators.py -------------------------------------------------------------------------------- /tests/stages/deduplication/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/deduplication/exact/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/deduplication/exact/test_identification.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/deduplication/exact/test_identification.py -------------------------------------------------------------------------------- /tests/stages/deduplication/exact/test_workflow.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/deduplication/exact/test_workflow.py -------------------------------------------------------------------------------- /tests/stages/deduplication/fuzzy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/deduplication/fuzzy/test_buckets_to_edges_stage.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/deduplication/fuzzy/test_buckets_to_edges_stage.py -------------------------------------------------------------------------------- /tests/stages/deduplication/fuzzy/test_connected_components_stage.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/deduplication/fuzzy/test_connected_components_stage.py -------------------------------------------------------------------------------- /tests/stages/deduplication/fuzzy/test_fuzzy_workflow.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/deduplication/fuzzy/test_fuzzy_workflow.py -------------------------------------------------------------------------------- /tests/stages/deduplication/fuzzy/test_lsh_stage.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/deduplication/fuzzy/test_lsh_stage.py -------------------------------------------------------------------------------- /tests/stages/deduplication/fuzzy/test_minhash.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/deduplication/fuzzy/test_minhash.py -------------------------------------------------------------------------------- /tests/stages/deduplication/fuzzy/test_minhash_stage.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/deduplication/fuzzy/test_minhash_stage.py -------------------------------------------------------------------------------- /tests/stages/deduplication/semantic/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/deduplication/semantic/test_identify_duplicates.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/deduplication/semantic/test_identify_duplicates.py -------------------------------------------------------------------------------- /tests/stages/deduplication/semantic/test_kmeans.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/deduplication/semantic/test_kmeans.py -------------------------------------------------------------------------------- /tests/stages/deduplication/semantic/test_pairwise.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/deduplication/semantic/test_pairwise.py -------------------------------------------------------------------------------- /tests/stages/deduplication/semantic/test_pairwise_io.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/deduplication/semantic/test_pairwise_io.py -------------------------------------------------------------------------------- /tests/stages/deduplication/semantic/test_ranking.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/deduplication/semantic/test_ranking.py -------------------------------------------------------------------------------- /tests/stages/deduplication/semantic/test_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/deduplication/semantic/test_utils.py -------------------------------------------------------------------------------- /tests/stages/deduplication/semantic/test_workflow.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/deduplication/semantic/test_workflow.py -------------------------------------------------------------------------------- /tests/stages/deduplication/shuffle_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/deduplication/shuffle_utils/test_shuffle_stage.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/deduplication/shuffle_utils/test_shuffle_stage.py -------------------------------------------------------------------------------- /tests/stages/deduplication/test_id_generator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/deduplication/test_id_generator.py -------------------------------------------------------------------------------- /tests/stages/image/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/image/dedup/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/image/dedup/test_dedup_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/image/dedup/test_dedup_filter.py -------------------------------------------------------------------------------- /tests/stages/image/embedders/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/image/embedders/test_clip_embedder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/image/embedders/test_clip_embedder.py -------------------------------------------------------------------------------- /tests/stages/image/filters/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/image/filters/test_aesthetic_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/image/filters/test_aesthetic_filter.py -------------------------------------------------------------------------------- /tests/stages/image/filters/test_nsfw_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/image/filters/test_nsfw_filter.py -------------------------------------------------------------------------------- /tests/stages/image/io/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/image/io/test_convert.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/image/io/test_convert.py -------------------------------------------------------------------------------- /tests/stages/image/io/test_image_reader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/image/io/test_image_reader.py -------------------------------------------------------------------------------- /tests/stages/image/io/test_image_writer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/image/io/test_image_writer.py -------------------------------------------------------------------------------- /tests/stages/synthetic/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/synthetic/nemotron_cc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/synthetic/nemotron_cc/test_base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/synthetic/nemotron_cc/test_base.py -------------------------------------------------------------------------------- /tests/stages/synthetic/nemotron_cc/test_nemotron_cc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/synthetic/nemotron_cc/test_nemotron_cc.py -------------------------------------------------------------------------------- /tests/stages/synthetic/test_qa_multilingual_synthetic.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/synthetic/test_qa_multilingual_synthetic.py -------------------------------------------------------------------------------- /tests/stages/text/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/text/classifiers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/text/classifiers/test_classifiers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/classifiers/test_classifiers.py -------------------------------------------------------------------------------- /tests/stages/text/deduplication/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/text/deduplication/test_removal.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/deduplication/test_removal.py -------------------------------------------------------------------------------- /tests/stages/text/deduplication/test_removal_workflow.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/deduplication/test_removal_workflow.py -------------------------------------------------------------------------------- /tests/stages/text/deduplication/test_semantic.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/deduplication/test_semantic.py -------------------------------------------------------------------------------- /tests/stages/text/download/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/text/download/arxiv/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/text/download/arxiv/test_download.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/download/arxiv/test_download.py -------------------------------------------------------------------------------- /tests/stages/text/download/arxiv/test_extract.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/download/arxiv/test_extract.py -------------------------------------------------------------------------------- /tests/stages/text/download/arxiv/test_iterator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/download/arxiv/test_iterator.py -------------------------------------------------------------------------------- /tests/stages/text/download/arxiv/test_stage.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/download/arxiv/test_stage.py -------------------------------------------------------------------------------- /tests/stages/text/download/arxiv/test_url_generation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/download/arxiv/test_url_generation.py -------------------------------------------------------------------------------- /tests/stages/text/download/base/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/text/download/base/test_download.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/download/base/test_download.py -------------------------------------------------------------------------------- /tests/stages/text/download/base/test_extract.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/download/base/test_extract.py -------------------------------------------------------------------------------- /tests/stages/text/download/base/test_iterator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/download/base/test_iterator.py -------------------------------------------------------------------------------- /tests/stages/text/download/base/test_stage.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/download/base/test_stage.py -------------------------------------------------------------------------------- /tests/stages/text/download/base/test_url_generation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/download/base/test_url_generation.py -------------------------------------------------------------------------------- /tests/stages/text/download/common_crawl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/text/download/common_crawl/test_download.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/download/common_crawl/test_download.py -------------------------------------------------------------------------------- /tests/stages/text/download/common_crawl/test_extract.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/download/common_crawl/test_extract.py -------------------------------------------------------------------------------- /tests/stages/text/download/common_crawl/test_stage.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/download/common_crawl/test_stage.py -------------------------------------------------------------------------------- /tests/stages/text/download/common_crawl/test_url_generation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/download/common_crawl/test_url_generation.py -------------------------------------------------------------------------------- /tests/stages/text/download/common_crawl/test_warc_iterator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/download/common_crawl/test_warc_iterator.py -------------------------------------------------------------------------------- /tests/stages/text/download/test_html_extractors_implementation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/download/test_html_extractors_implementation.py -------------------------------------------------------------------------------- /tests/stages/text/download/wikipedia/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/text/download/wikipedia/test_download.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/download/wikipedia/test_download.py -------------------------------------------------------------------------------- /tests/stages/text/download/wikipedia/test_extract.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/download/wikipedia/test_extract.py -------------------------------------------------------------------------------- /tests/stages/text/download/wikipedia/test_iterator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/download/wikipedia/test_iterator.py -------------------------------------------------------------------------------- /tests/stages/text/download/wikipedia/test_stage.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/download/wikipedia/test_stage.py -------------------------------------------------------------------------------- /tests/stages/text/download/wikipedia/test_url_generation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/download/wikipedia/test_url_generation.py -------------------------------------------------------------------------------- /tests/stages/text/embedders/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/text/embedders/test_base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/embedders/test_base.py -------------------------------------------------------------------------------- /tests/stages/text/embedders/test_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/embedders/test_utils.py -------------------------------------------------------------------------------- /tests/stages/text/io/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/text/io/reader/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/text/io/reader/test_integration.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/io/reader/test_integration.py -------------------------------------------------------------------------------- /tests/stages/text/io/reader/test_jsonl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/io/reader/test_jsonl.py -------------------------------------------------------------------------------- /tests/stages/text/io/reader/test_parquet.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/io/reader/test_parquet.py -------------------------------------------------------------------------------- /tests/stages/text/io/writer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/text/io/writer/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/io/writer/conftest.py -------------------------------------------------------------------------------- /tests/stages/text/io/writer/test_jsonl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/io/writer/test_jsonl.py -------------------------------------------------------------------------------- /tests/stages/text/io/writer/test_parquet.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/io/writer/test_parquet.py -------------------------------------------------------------------------------- /tests/stages/text/io/writer/test_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/io/writer/test_utils.py -------------------------------------------------------------------------------- /tests/stages/text/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/text/models/test_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/models/test_model.py -------------------------------------------------------------------------------- /tests/stages/text/models/test_tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/models/test_tokenizer.py -------------------------------------------------------------------------------- /tests/stages/text/models/test_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/models/test_utils.py -------------------------------------------------------------------------------- /tests/stages/text/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/text/modules/test_add_id.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/modules/test_add_id.py -------------------------------------------------------------------------------- /tests/stages/text/modules/test_filters.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/modules/test_filters.py -------------------------------------------------------------------------------- /tests/stages/text/modules/test_joiner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/modules/test_joiner.py -------------------------------------------------------------------------------- /tests/stages/text/modules/test_modifiers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/modules/test_modifiers.py -------------------------------------------------------------------------------- /tests/stages/text/modules/test_splitter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/text/modules/test_splitter.py -------------------------------------------------------------------------------- /tests/stages/video/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/video/caption/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/video/caption/test_caption_enhancement.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/video/caption/test_caption_enhancement.py -------------------------------------------------------------------------------- /tests/stages/video/caption/test_caption_generation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/video/caption/test_caption_generation.py -------------------------------------------------------------------------------- /tests/stages/video/caption/test_caption_preparation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/video/caption/test_caption_preparation.py -------------------------------------------------------------------------------- /tests/stages/video/clipping/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/video/clipping/test_clip_frame_extraction.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/video/clipping/test_clip_frame_extraction.py -------------------------------------------------------------------------------- /tests/stages/video/clipping/test_clip_transcoding_stage.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/video/clipping/test_clip_transcoding_stage.py -------------------------------------------------------------------------------- /tests/stages/video/clipping/test_fixed_stride_extractor_stage.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/video/clipping/test_fixed_stride_extractor_stage.py -------------------------------------------------------------------------------- /tests/stages/video/clipping/test_transnetv2_extraction.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/video/clipping/test_transnetv2_extraction.py -------------------------------------------------------------------------------- /tests/stages/video/clipping/test_video_frame_extraction.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/video/clipping/test_video_frame_extraction.py -------------------------------------------------------------------------------- /tests/stages/video/embedding/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/video/embedding/test_cosmos_embed1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/video/embedding/test_cosmos_embed1.py -------------------------------------------------------------------------------- /tests/stages/video/filtering/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/video/filtering/test_clip_aesthetic_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/video/filtering/test_clip_aesthetic_filter.py -------------------------------------------------------------------------------- /tests/stages/video/filtering/test_motion_filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/video/filtering/test_motion_filter.py -------------------------------------------------------------------------------- /tests/stages/video/filtering/test_motion_vector_backend.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/video/filtering/test_motion_vector_backend.py -------------------------------------------------------------------------------- /tests/stages/video/io/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/video/io/test_clip_writer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/video/io/test_clip_writer.py -------------------------------------------------------------------------------- /tests/stages/video/io/test_video_reader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/video/io/test_video_reader.py -------------------------------------------------------------------------------- /tests/stages/video/preview/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stages/video/preview/test_preview.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/stages/video/preview/test_preview.py -------------------------------------------------------------------------------- /tests/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/tasks/test_audio_batch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/tasks/test_audio_batch.py -------------------------------------------------------------------------------- /tests/tasks/test_tasks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/tasks/test_tasks.py -------------------------------------------------------------------------------- /tests/tasks/test_video.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/tasks/test_video.py -------------------------------------------------------------------------------- /tests/test___init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/test___init__.py -------------------------------------------------------------------------------- /tests/test_cudf_placeholder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/test_cudf_placeholder.py -------------------------------------------------------------------------------- /tests/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/utils/test_client_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/utils/test_client_utils.py -------------------------------------------------------------------------------- /tests/utils/test_column_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/utils/test_column_utils.py -------------------------------------------------------------------------------- /tests/utils/test_decoder_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/utils/test_decoder_utils.py -------------------------------------------------------------------------------- /tests/utils/test_file_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/utils/test_file_utils.py -------------------------------------------------------------------------------- /tests/utils/test_grouping.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/utils/test_grouping.py -------------------------------------------------------------------------------- /tests/utils/test_nvcodec_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/utils/test_nvcodec_utils.py -------------------------------------------------------------------------------- /tests/utils/test_operation_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/utils/test_operation_utils.py -------------------------------------------------------------------------------- /tests/utils/test_split_large_files.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/utils/test_split_large_files.py -------------------------------------------------------------------------------- /tests/utils/test_writer_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tests/utils/test_writer_utils.py -------------------------------------------------------------------------------- /tutorials/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/README.md -------------------------------------------------------------------------------- /tutorials/audio/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/audio/README.md -------------------------------------------------------------------------------- /tutorials/audio/fleurs/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/audio/fleurs/README.md -------------------------------------------------------------------------------- /tutorials/audio/fleurs/pipeline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/audio/fleurs/pipeline.py -------------------------------------------------------------------------------- /tutorials/audio/fleurs/pipeline.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/audio/fleurs/pipeline.yaml -------------------------------------------------------------------------------- /tutorials/audio/fleurs/run.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/audio/fleurs/run.py -------------------------------------------------------------------------------- /tutorials/image/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/image/README.md -------------------------------------------------------------------------------- /tutorials/image/getting-started/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/image/getting-started/README.md -------------------------------------------------------------------------------- /tutorials/image/getting-started/helper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/image/getting-started/helper.py -------------------------------------------------------------------------------- /tutorials/image/getting-started/image_curation_example.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/image/getting-started/image_curation_example.py -------------------------------------------------------------------------------- /tutorials/image/getting-started/image_dedup_example.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/image/getting-started/image_dedup_example.py -------------------------------------------------------------------------------- /tutorials/quickstart.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/quickstart.py -------------------------------------------------------------------------------- /tutorials/synthetic/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/synthetic/README.md -------------------------------------------------------------------------------- /tutorials/synthetic/nemotron_cc/example_data/data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/synthetic/nemotron_cc/example_data/data.parquet -------------------------------------------------------------------------------- /tutorials/synthetic/nemotron_cc/nemotron_cc_pipelines.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/synthetic/nemotron_cc/nemotron_cc_pipelines.py -------------------------------------------------------------------------------- /tutorials/synthetic/nemotron_cc/nemotron_cc_sdg_high_quality_example_pipeline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/synthetic/nemotron_cc/nemotron_cc_sdg_high_quality_example_pipeline.py -------------------------------------------------------------------------------- /tutorials/synthetic/nemotron_cc/nemotron_cc_sdg_low_quality_example_pipeline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/synthetic/nemotron_cc/nemotron_cc_sdg_low_quality_example_pipeline.py -------------------------------------------------------------------------------- /tutorials/synthetic/synthetic_data_generation_example.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/synthetic/synthetic_data_generation_example.py -------------------------------------------------------------------------------- /tutorials/text/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/text/README.md -------------------------------------------------------------------------------- /tutorials/text/deduplication/semantic/semantic_e2e.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/text/deduplication/semantic/semantic_e2e.ipynb -------------------------------------------------------------------------------- /tutorials/text/deduplication/semantic/semantic_step_by_step.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/text/deduplication/semantic/semantic_step_by_step.ipynb -------------------------------------------------------------------------------- /tutorials/text/distributed-data-classification/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/text/distributed-data-classification/README.md -------------------------------------------------------------------------------- /tutorials/text/distributed-data-classification/aegis-classification.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/text/distributed-data-classification/aegis-classification.ipynb -------------------------------------------------------------------------------- /tutorials/text/distributed-data-classification/content-type-classification.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/text/distributed-data-classification/content-type-classification.ipynb -------------------------------------------------------------------------------- /tutorials/text/distributed-data-classification/domain-classification.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/text/distributed-data-classification/domain-classification.ipynb -------------------------------------------------------------------------------- /tutorials/text/distributed-data-classification/fineweb-edu-classification.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/text/distributed-data-classification/fineweb-edu-classification.ipynb -------------------------------------------------------------------------------- /tutorials/text/distributed-data-classification/fineweb-mixtral-edu-classification.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/text/distributed-data-classification/fineweb-mixtral-edu-classification.ipynb -------------------------------------------------------------------------------- /tutorials/text/distributed-data-classification/fineweb-nemotron-edu-classification.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/text/distributed-data-classification/fineweb-nemotron-edu-classification.ipynb -------------------------------------------------------------------------------- /tutorials/text/distributed-data-classification/instruction-data-guard-classification.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/text/distributed-data-classification/instruction-data-guard-classification.ipynb -------------------------------------------------------------------------------- /tutorials/text/distributed-data-classification/multilingual-domain-classification.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/text/distributed-data-classification/multilingual-domain-classification.ipynb -------------------------------------------------------------------------------- /tutorials/text/distributed-data-classification/prompt-task-complexity-classification.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/text/distributed-data-classification/prompt-task-complexity-classification.ipynb -------------------------------------------------------------------------------- /tutorials/text/distributed-data-classification/quality-classification.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/text/distributed-data-classification/quality-classification.ipynb -------------------------------------------------------------------------------- /tutorials/text/download-and-extract/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/text/download-and-extract/README.md -------------------------------------------------------------------------------- /tutorials/text/download-and-extract/download_extract_tutorial.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/text/download-and-extract/download_extract_tutorial.ipynb -------------------------------------------------------------------------------- /tutorials/text/gliner-pii-redaction/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/text/gliner-pii-redaction/README.md -------------------------------------------------------------------------------- /tutorials/text/gliner-pii-redaction/gliner_pii_redaction.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/text/gliner-pii-redaction/gliner_pii_redaction.ipynb -------------------------------------------------------------------------------- /tutorials/text/gliner-pii-redaction/gliner_pii_redactor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/text/gliner-pii-redaction/gliner_pii_redactor.py -------------------------------------------------------------------------------- /tutorials/text/llama-nemotron-data-curation/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/text/llama-nemotron-data-curation/README.md -------------------------------------------------------------------------------- /tutorials/text/llama-nemotron-data-curation/filters/heuristic_filters.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/text/llama-nemotron-data-curation/filters/heuristic_filters.py -------------------------------------------------------------------------------- /tutorials/text/llama-nemotron-data-curation/filters/model_filters.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/text/llama-nemotron-data-curation/filters/model_filters.py -------------------------------------------------------------------------------- /tutorials/text/llama-nemotron-data-curation/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/text/llama-nemotron-data-curation/main.py -------------------------------------------------------------------------------- /tutorials/text/llama-nemotron-data-curation/utils/jsonl_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/text/llama-nemotron-data-curation/utils/jsonl_utils.py -------------------------------------------------------------------------------- /tutorials/text/peft-curation/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/text/peft-curation/README.md -------------------------------------------------------------------------------- /tutorials/text/peft-curation/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/text/peft-curation/main.py -------------------------------------------------------------------------------- /tutorials/text/peft-curation/stages.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/text/peft-curation/stages.py -------------------------------------------------------------------------------- /tutorials/text/tinystories/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/text/tinystories/README.md -------------------------------------------------------------------------------- /tutorials/text/tinystories/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/text/tinystories/main.py -------------------------------------------------------------------------------- /tutorials/text/tinystories/stages.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/text/tinystories/stages.py -------------------------------------------------------------------------------- /tutorials/video/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/video/README.md -------------------------------------------------------------------------------- /tutorials/video/getting-started/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/video/getting-started/README.md -------------------------------------------------------------------------------- /tutorials/video/getting-started/video_split_clip_example.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/tutorials/video/getting-started/video_split_clip_example.py -------------------------------------------------------------------------------- /uv.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/HEAD/uv.lock --------------------------------------------------------------------------------