├── .devcontainer
    ├── Dockerfile
    ├── README.md
    └── devcontainer.json
├── .github
    ├── CODEOWNERS
    ├── ISSUE_TEMPLATE
    │   ├── bug_report_form.yml
    │   ├── config.yml
    │   ├── documentation_request_correction.yml
    │   ├── documentation_request_new.yml
    │   └── feature_request_form.yml
    ├── PULL_REQUEST_TEMPLATE.md
    ├── copy-pr-bot.yaml
    └── workflows
    │   ├── build-docs.yml
    │   ├── conda-publish.yml
    │   ├── docker-build.yml
    │   ├── docker-nightly-publish.yml
    │   ├── docker-release-publish.yml
    │   ├── pre-commit.yml
    │   ├── pypi-nightly-publish.yml
    │   └── test-library-mode.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CHANGELOG.md
├── CITATION.md
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── Dockerfile
├── LICENSE
├── README.md
├── SECURITY.md
├── api
    ├── LICENSE
    ├── MANIFEST.in
    ├── README.md
    ├── api_tests
    │   ├── __init__.py
    │   ├── import_checks.py
    │   ├── interface
    │   │   ├── __init__.py
    │   │   ├── test_extract.py
    │   │   ├── test_interface.py
    │   │   ├── test_mutate.py
    │   │   ├── test_transform.py
    │   │   └── test_utility.py
    │   ├── internal
    │   │   ├── __init__.py
    │   │   ├── extract
    │   │   │   ├── __init__.py
    │   │   │   ├── audio
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── test_audio_extraction.py
    │   │   │   ├── docx
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── test_docx_extractor.py
    │   │   │   └── image
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── test_chart_extractor.py
    │   │   │   │   ├── test_image_extractor.py
    │   │   │   │   ├── test_infographic_extractor.py
    │   │   │   │   └── test_table_extractor.py
    │   │   ├── mutate
    │   │   │   ├── __init__.py
    │   │   │   ├── test_deduplicate_images.py
    │   │   │   └── test_filter_images.py
    │   │   └── test_enums.py
    │   ├── primitives
    │   │   ├── __init__.py
    │   │   ├── nim
    │   │   │   ├── __init__.py
    │   │   │   └── model_interface
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── test_decorators.py
    │   │   │   │   ├── test_helpers.py
    │   │   │   │   ├── test_nemoretriever_parse.py
    │   │   │   │   ├── test_paddle.py
    │   │   │   │   ├── test_parakeet.py
    │   │   │   │   ├── test_text_embedding.py
    │   │   │   │   ├── test_vlm.py
    │   │   │   │   ├── test_yolox_interface_base.py
    │   │   │   │   ├── test_yolox_interface_graphic_elements.py
    │   │   │   │   ├── test_yolox_interface_page_elements.py
    │   │   │   │   ├── test_yolox_interface_table_structure.py
    │   │   │   │   └── test_yolox_utilities.py
    │   │   ├── test_ingest_control_message.py
    │   │   ├── test_ingest_control_message_task.py
    │   │   └── tracing
    │   │   │   ├── __init__.py
    │   │   │   ├── test_latency.py
    │   │   │   └── test_tagging.py
    │   ├── smoke_test.sh
    │   ├── util
    │   │   ├── __init__.py
    │   │   ├── converters
    │   │   │   ├── __init__.py
    │   │   │   ├── multimodal_test_raw_results.json
    │   │   │   ├── test_bytetools.py
    │   │   │   ├── test_containers.py
    │   │   │   ├── test_datetools.py
    │   │   │   ├── test_formats.py
    │   │   │   └── test_type_mappings.py
    │   │   ├── detectors
    │   │   │   ├── __init__.py
    │   │   │   └── test_language.py
    │   │   ├── exception_handlers
    │   │   │   ├── __init__.py
    │   │   │   ├── test_converters.py
    │   │   │   ├── test_decorators.py
    │   │   │   ├── test_detectors.py
    │   │   │   ├── test_pdf.py
    │   │   │   └── test_schemas.py
    │   │   ├── image_processing
    │   │   │   ├── __init__.py
    │   │   │   ├── test_clustering.py
    │   │   │   └── test_transforms.py
    │   │   ├── logging
    │   │   │   ├── __init__.py
    │   │   │   └── test_configuration.py
    │   │   ├── message_brokers
    │   │   │   ├── __init__.py
    │   │   │   ├── redis
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── test_redis_client.py
    │   │   │   └── simple_message_broker
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── test_ordered_message_queue.py
    │   │   │   │   ├── test_simple_client.py
    │   │   │   │   └── test_simple_message_broker.py
    │   │   ├── metadata
    │   │   │   ├── __init__.py
    │   │   │   └── test_metadata_aggregators.py
    │   │   └── schema
    │   │   │   ├── __init__.py
    │   │   │   └── test_schema_validator.py
    │   └── utilities_for_test.py
    ├── pyproject.toml
    └── src
    │   ├── nv_ingest_api
    │       ├── __init__.py
    │       ├── interface
    │       │   ├── __init__.py
    │       │   ├── extract.py
    │       │   ├── mutate.py
    │       │   ├── store.py
    │       │   ├── transform.py
    │       │   └── utility.py
    │       ├── internal
    │       │   ├── __init__.py
    │       │   ├── enums
    │       │   │   ├── __init__.py
    │       │   │   └── common.py
    │       │   ├── extract
    │       │   │   ├── __init__.py
    │       │   │   ├── audio
    │       │   │   │   ├── __init__.py
    │       │   │   │   └── audio_extraction.py
    │       │   │   ├── docx
    │       │   │   │   ├── __init__.py
    │       │   │   │   ├── docx_extractor.py
    │       │   │   │   └── engines
    │       │   │   │   │   ├── __init__.py
    │       │   │   │   │   └── docxreader_helpers
    │       │   │   │   │       ├── __init__.py
    │       │   │   │   │       ├── docx_helper.py
    │       │   │   │   │       └── docxreader.py
    │       │   │   ├── image
    │       │   │   │   ├── __init__.py
    │       │   │   │   ├── chart_extractor.py
    │       │   │   │   ├── image_extractor.py
    │       │   │   │   ├── image_helpers
    │       │   │   │   │   ├── __init__.py
    │       │   │   │   │   └── common.py
    │       │   │   │   ├── infographic_extractor.py
    │       │   │   │   └── table_extractor.py
    │       │   │   ├── pdf
    │       │   │   │   ├── __init__.py
    │       │   │   │   ├── engines
    │       │   │   │   │   ├── __init__.py
    │       │   │   │   │   ├── adobe.py
    │       │   │   │   │   ├── llama.py
    │       │   │   │   │   ├── nemoretriever.py
    │       │   │   │   │   ├── pdf_helpers
    │       │   │   │   │   │   └── __init__.py
    │       │   │   │   │   ├── pdfium.py
    │       │   │   │   │   ├── tika.py
    │       │   │   │   │   └── unstructured_io.py
    │       │   │   │   └── pdf_extractor.py
    │       │   │   └── pptx
    │       │   │   │   ├── __init__.py
    │       │   │   │   ├── engines
    │       │   │   │       ├── __init__.py
    │       │   │   │       └── pptx_helper.py
    │       │   │   │   └── pptx_extractor.py
    │       │   ├── mutate
    │       │   │   ├── __init__.py
    │       │   │   ├── deduplicate.py
    │       │   │   └── filter.py
    │       │   ├── primitives
    │       │   │   ├── __init__.py
    │       │   │   ├── control_message_task.py
    │       │   │   ├── ingest_control_message.py
    │       │   │   ├── nim
    │       │   │   │   ├── __init__.py
    │       │   │   │   ├── default_values.py
    │       │   │   │   ├── model_interface
    │       │   │   │   │   ├── __init__.py
    │       │   │   │   │   ├── cached.py
    │       │   │   │   │   ├── decorators.py
    │       │   │   │   │   ├── deplot.py
    │       │   │   │   │   ├── helpers.py
    │       │   │   │   │   ├── nemoretriever_parse.py
    │       │   │   │   │   ├── paddle.py
    │       │   │   │   │   ├── parakeet.py
    │       │   │   │   │   ├── text_embedding.py
    │       │   │   │   │   ├── vlm.py
    │       │   │   │   │   └── yolox.py
    │       │   │   │   ├── nim_client.py
    │       │   │   │   └── nim_model_interface.py
    │       │   │   └── tracing
    │       │   │   │   ├── __init__.py
    │       │   │   │   ├── latency.py
    │       │   │   │   ├── logging.py
    │       │   │   │   └── tagging.py
    │       │   ├── schemas
    │       │   │   ├── __init__.py
    │       │   │   ├── extract
    │       │   │   │   ├── __init__.py
    │       │   │   │   ├── extract_audio_schema.py
    │       │   │   │   ├── extract_chart_schema.py
    │       │   │   │   ├── extract_docx_schema.py
    │       │   │   │   ├── extract_image_schema.py
    │       │   │   │   ├── extract_infographic_schema.py
    │       │   │   │   ├── extract_pdf_schema.py
    │       │   │   │   ├── extract_pptx_schema.py
    │       │   │   │   └── extract_table_schema.py
    │       │   │   ├── message_brokers
    │       │   │   │   ├── __init__.py
    │       │   │   │   ├── message_broker_client_schema.py
    │       │   │   │   ├── request_schema.py
    │       │   │   │   └── response_schema.py
    │       │   │   ├── meta
    │       │   │   │   ├── __init__.py
    │       │   │   │   ├── base_model_noext.py
    │       │   │   │   ├── ingest_job_schema.py
    │       │   │   │   └── metadata_schema.py
    │       │   │   ├── mutate
    │       │   │   │   ├── __init__.py
    │       │   │   │   └── mutate_image_dedup_schema.py
    │       │   │   ├── store
    │       │   │   │   ├── __init__.py
    │       │   │   │   ├── store_embedding_schema.py
    │       │   │   │   └── store_image_schema.py
    │       │   │   └── transform
    │       │   │   │   ├── __init__.py
    │       │   │   │   ├── transform_image_caption_schema.py
    │       │   │   │   ├── transform_image_filter_schema.py
    │       │   │   │   ├── transform_text_embedding_schema.py
    │       │   │   │   └── transform_text_splitter_schema.py
    │       │   ├── store
    │       │   │   ├── __init__.py
    │       │   │   ├── embed_text_upload.py
    │       │   │   └── image_upload.py
    │       │   └── transform
    │       │   │   ├── __init__.py
    │       │   │   ├── caption_image.py
    │       │   │   ├── embed_text.py
    │       │   │   └── split_text.py
    │       └── util
    │       │   ├── __init__.py
    │       │   ├── control_message
    │       │       ├── __init__.py
    │       │       └── validators.py
    │       │   ├── converters
    │       │       ├── __init__.py
    │       │       ├── bytetools.py
    │       │       ├── containers.py
    │       │       ├── datetools.py
    │       │       ├── dftools.py
    │       │       ├── formats.py
    │       │       └── type_mappings.py
    │       │   ├── detectors
    │       │       ├── __init__.py
    │       │       └── language.py
    │       │   ├── exception_handlers
    │       │       ├── __init__.py
    │       │       ├── converters.py
    │       │       ├── decorators.py
    │       │       ├── detectors.py
    │       │       ├── pdf.py
    │       │       └── schemas.py
    │       │   ├── image_processing
    │       │       ├── __init__.py
    │       │       ├── clustering.py
    │       │       ├── processing.py
    │       │       ├── table_and_chart.py
    │       │       └── transforms.py
    │       │   ├── logging
    │       │       ├── __init__.py
    │       │       └── configuration.py
    │       │   ├── message_brokers
    │       │       ├── __init__.py
    │       │       └── simple_message_broker
    │       │       │   ├── __init__.py
    │       │       │   ├── broker.py
    │       │       │   ├── ordered_message_queue.py
    │       │       │   └── simple_client.py
    │       │   ├── metadata
    │       │       ├── __init__.py
    │       │       └── aggregators.py
    │       │   ├── multi_processing
    │       │       ├── __init__.py
    │       │       └── mp_pool_singleton.py
    │       │   ├── nim
    │       │       └── __init__.py
    │       │   ├── pdf
    │       │       ├── __init__.py
    │       │       └── pdfium.py
    │       │   ├── schema
    │       │       ├── __init__.py
    │       │       └── schema_validator.py
    │       │   ├── service_clients
    │       │       ├── __init__.py
    │       │       ├── client_base.py
    │       │       ├── kafka
    │       │       │   └── __init__.py
    │       │       ├── redis
    │       │       │   ├── __init__.py
    │       │       │   └── redis_client.py
    │       │       └── rest
    │       │       │   ├── __init__.py
    │       │       │   └── rest_client.py
    │       │   └── string_processing
    │       │       └── __init__.py
    │   └── version.py
├── ci
    ├── data
    │   ├── pdf_20_chart_bbox.csv
    │   ├── pdf_20_chart_text_output.csv
    │   ├── pdf_20_table_bbox.csv
    │   └── pdf_20_table_paddleOCR.csv
    └── scripts
    │   ├── bo20_validate.py
    │   └── build_pip_packages.sh
├── client
    ├── LICENSE
    ├── MANIFEST.in
    ├── README.md
    ├── client_examples
    │   ├── README.md
    │   ├── docker
    │   │   ├── Dockerfile.client
    │   │   ├── entrypoint.sh
    │   │   └── start-jupyter.sh
    │   └── examples
    │   │   ├── cli_client_usage.ipynb
    │   │   └── python_client_usage.ipynb
    ├── client_tests
    │   ├── __init__.py
    │   ├── cli
    │   │   ├── __init__.py
    │   │   ├── test_nv_ingest_cli.py
    │   │   └── util
    │   │   │   ├── __init__.py
    │   │   │   ├── test_click.py
    │   │   │   ├── test_processing.py
    │   │   │   └── test_system.py
    │   ├── client
    │   │   ├── __init__.py
    │   │   ├── test_client.py
    │   │   ├── test_interface.py
    │   │   └── test_rest_client.py
    │   ├── primitives
    │   │   ├── __init__.py
    │   │   ├── jobs
    │   │   │   ├── __init__.py
    │   │   │   ├── test_job_spec.py
    │   │   │   └── test_job_state.py
    │   │   └── tasks
    │   │   │   ├── __init__.py
    │   │   │   ├── test_audio_extraction.py
    │   │   │   ├── test_caption.py
    │   │   │   ├── test_dedup.py
    │   │   │   ├── test_embed.py
    │   │   │   ├── test_extract.py
    │   │   │   ├── test_filter.py
    │   │   │   ├── test_split.py
    │   │   │   ├── test_store.py
    │   │   │   ├── test_store_embed.py
    │   │   │   ├── test_task_base.py
    │   │   │   └── test_task_factory.py
    │   └── util
    │   │   ├── file_processing
    │   │       ├── __init__.py
    │   │       └── test_extract.py
    │   │   ├── test_dataset.py
    │   │   ├── test_milvus_util.py
    │   │   └── test_util.py
    ├── pyproject.toml
    └── src
    │   ├── nv_ingest_client
    │       ├── __init__.py
    │       ├── cli
    │       │   ├── __init__.py
    │       │   └── util
    │       │   │   ├── __init__.py
    │       │   │   ├── click.py
    │       │   │   ├── processing.py
    │       │   │   ├── system.py
    │       │   │   └── tasks.py
    │       ├── client
    │       │   ├── __init__.py
    │       │   ├── client.py
    │       │   └── interface.py
    │       ├── nv_ingest_cli.py
    │       ├── primitives
    │       │   ├── __init__.py
    │       │   ├── exceptions.py
    │       │   ├── jobs
    │       │   │   ├── __init__.py
    │       │   │   ├── job_spec.py
    │       │   │   └── job_state.py
    │       │   └── tasks
    │       │   │   ├── __init__.py
    │       │   │   ├── audio_extraction.py
    │       │   │   ├── caption.py
    │       │   │   ├── chart_extraction.py
    │       │   │   ├── dedup.py
    │       │   │   ├── embed.py
    │       │   │   ├── extract.py
    │       │   │   ├── filter.py
    │       │   │   ├── infographic_extraction.py
    │       │   │   ├── split.py
    │       │   │   ├── store.py
    │       │   │   ├── table_extraction.py
    │       │   │   ├── task_base.py
    │       │   │   ├── task_factory.py
    │       │   │   ├── transform.py
    │       │   │   └── vdb_upload.py
    │       └── util
    │       │   ├── __init__.py
    │       │   ├── dataset.py
    │       │   ├── file_processing
    │       │       ├── __init__.py
    │       │       └── extract.py
    │       │   ├── milvus.py
    │       │   ├── process_json_files.py
    │       │   ├── processing.py
    │       │   ├── util.py
    │       │   └── zipkin.py
    │   └── version.py
├── conda
    ├── build_conda_packages.sh
    ├── environments
    │   ├── nv_ingest_api_environment.yml
    │   ├── nv_ingest_client_environment.yml
    │   └── nv_ingest_environment.yml
    ├── packages
    │   ├── nv_ingest
    │   │   └── meta.yaml
    │   ├── nv_ingest_api
    │   │   └── meta.yaml
    │   └── nv_ingest_client
    │   │   └── meta.yaml
    └── scripts
    │   └── helper_functions.sh
├── config
    ├── otel-collector-config.yaml
    └── prometheus.yaml
├── data
    ├── chart.png
    ├── charts_with_page_num_fixed.csv
    ├── embedded_table.pdf
    ├── functional_validation.json
    ├── functional_validation.pdf
    ├── multimodal_test.bmp
    ├── multimodal_test.docx
    ├── multimodal_test.jpeg
    ├── multimodal_test.json
    ├── multimodal_test.pdf
    ├── multimodal_test.png
    ├── multimodal_test.pptx
    ├── multimodal_test.svg
    ├── multimodal_test.tiff
    ├── multimodal_test.wav
    ├── table.png
    ├── table_queries_cleaned_235.csv
    ├── table_test.pdf
    ├── test-page-form.pdf
    ├── test-shapes.pdf
    ├── test.pdf
    ├── text_query_answer_gt_page.csv
    ├── woods_frost.docx
    └── woods_frost.pdf
├── deploy
    └── pdf-blueprint.ipynb
├── docker-compose.yaml
├── docker
    └── scripts
    │   ├── entrypoint.sh
    │   ├── entrypoint_devcontainer.sh
    │   ├── entrypoint_source_ext.sh
    │   └── post_build_triggers.py
├── docs
    ├── Dockerfile
    ├── Makefile
    ├── docs
    │   ├── assets
    │   │   └── css
    │   │   │   ├── color-schemes.css
    │   │   │   ├── custom-material.css
    │   │   │   ├── fonts.css
    │   │   │   └── jupyter-themes.css
    │   ├── extraction
    │   │   ├── audio.md
    │   │   ├── content-metadata.md
    │   │   ├── contributing.md
    │   │   ├── data-store.md
    │   │   ├── environment-config.md
    │   │   ├── example_processed_docs
    │   │   │   └── text
    │   │   │   │   └── multimodal_test.pdf.metadata.json
    │   │   ├── faq.md
    │   │   ├── helm.md
    │   │   ├── images
    │   │   │   ├── audio.png
    │   │   │   ├── generate_personal_key.png
    │   │   │   ├── image_viewer_example.png
    │   │   │   ├── overview-extraction.png
    │   │   │   ├── overview-retriever.png
    │   │   │   ├── preview-image.png
    │   │   │   ├── prometheus.png
    │   │   │   ├── test.pdf.png
    │   │   │   └── zipkin.png
    │   │   ├── nemoretriever-parse.md
    │   │   ├── ngc-api-key.md
    │   │   ├── notebooks.md
    │   │   ├── nv-ingest-python-api.md
    │   │   ├── nv-ingest_cli.md
    │   │   ├── overview.md
    │   │   ├── prerequisites.md
    │   │   ├── quickstart-guide.md
    │   │   ├── quickstart-library-mode.md
    │   │   ├── releasenotes-nv-ingest.md
    │   │   ├── support-matrix.md
    │   │   └── telemetry.md
    │   └── overview.md
    ├── mkdocs.yml
    ├── overrides
    │   ├── .icons
    │   │   └── nvidia
    │   │   │   └── nvidia-logo.svg
    │   └── main.html
    ├── requirements.txt
    ├── scripts
    │   └── generate_openapi_docs.py
    └── sphinx_docs
    │   └── source
    │       ├── conf.py
    │       ├── index.rst
    │       └── openapi.rst
├── evaluation
    ├── bo767_recall.ipynb
    └── digital_corpora_download.ipynb
├── examples
    ├── langchain_multimodal_rag.ipynb
    ├── launch_libmode_and_run_ingestor.py
    ├── launch_libmode_service.py
    ├── llama_index_multimodal_rag.ipynb
    ├── metadata_and_filtered_search.ipynb
    └── reindex_example.ipynb
├── helm
    ├── .helmignore
    ├── CHANGELOG.md
    ├── Chart.lock
    ├── Chart.yaml
    ├── LICENSE
    ├── README.md
    ├── README.md.gotmpl
    ├── mig
    │   └── nv-ingest-mig-config.yaml
    ├── templates
    │   ├── NOTES.txt
    │   ├── _helpers.tpl
    │   ├── configmap.yaml
    │   ├── deployment.yaml
    │   ├── hpa.yaml
    │   ├── ingress.yaml
    │   ├── secrets.yaml
    │   ├── service.yaml
    │   └── serviceaccount.yaml
    ├── time-slicing
    │   └── time-slicing-config.yaml
    ├── update_helm_readme.sh
    └── values.yaml
├── print_env.sh
├── pyproject.toml
├── pytest.ini
├── setup.cfg
├── setup.py
├── skaffold
    ├── README.md
    ├── nv-ingest.skaffold.yaml
    └── sensitive
    │   └── .gitignore
├── src
    ├── ingest_pipeline_config.json
    ├── microservice_entrypoint.py
    ├── nv_ingest
    │   ├── __init__.py
    │   ├── api
    │   │   ├── __init__.py
    │   │   ├── main.py
    │   │   └── v1
    │   │   │   ├── __init__.py
    │   │   │   ├── health.py
    │   │   │   └── ingest.py
    │   └── framework
    │   │   ├── __init__.py
    │   │   ├── orchestration
    │   │       ├── __init__.py
    │   │       └── morpheus
    │   │       │   ├── __init__.py
    │   │       │   ├── modules
    │   │       │       ├── __init__.py
    │   │       │       ├── injectors
    │   │       │       │   ├── __init__.py
    │   │       │       │   ├── metadata_injector.py
    │   │       │       │   └── task_injection.py
    │   │       │       ├── sinks
    │   │       │       │   ├── __init__.py
    │   │       │       │   ├── message_broker_task_sink.py
    │   │       │       │   └── vdb_task_sink.py
    │   │       │       ├── sources
    │   │       │       │   ├── __init__.py
    │   │       │       │   └── message_broker_task_source.py
    │   │       │       ├── storages
    │   │       │       │   ├── __init__.py
    │   │       │       │   └── image_storage.py
    │   │       │       ├── telemetry
    │   │       │       │   ├── __init__.py
    │   │       │       │   ├── job_counter.py
    │   │       │       │   ├── otel_meter.py
    │   │       │       │   └── otel_tracer.py
    │   │       │       └── transforms
    │   │       │       │   ├── __init__.py
    │   │       │       │   └── text_splitter.py
    │   │       │   ├── stages
    │   │       │       ├── __init__.py
    │   │       │       ├── extractors
    │   │       │       │   ├── __init__.py
    │   │       │       │   ├── audio_extraction_stage.py
    │   │       │       │   ├── chart_extraction_stage.py
    │   │       │       │   ├── docx_extractor_stage.py
    │   │       │       │   ├── image_extractor_stage.py
    │   │       │       │   ├── infographic_extraction_stage.py
    │   │       │       │   ├── pdf_extractor_stage.py
    │   │       │       │   ├── pptx_extractor_stage.py
    │   │       │       │   └── table_extraction_stage.py
    │   │       │       ├── meta
    │   │       │       │   ├── __init__.py
    │   │       │       │   ├── linear_module_source_stage_cpu.py
    │   │       │       │   └── multiprocessing_stage.py
    │   │       │       ├── mutate
    │   │       │       │   ├── __init__.py
    │   │       │       │   ├── image_dedup.py
    │   │       │       │   └── image_filter.py
    │   │       │       ├── store
    │   │       │       │   ├── __init__.py
    │   │       │       │   ├── embedding_storage_stage.py
    │   │       │       │   └── image_storage_stage.py
    │   │       │       └── transforms
    │   │       │       │   ├── __init__.py
    │   │       │       │   ├── embed_text_stage.py
    │   │       │       │   └── image_caption_extraction.py
    │   │       │   └── util
    │   │       │       ├── __init__.py
    │   │       │       ├── modules
    │   │       │           ├── __init__.py
    │   │       │           └── config_validator.py
    │   │       │       └── pipeline
    │   │       │           ├── __init__.py
    │   │       │           ├── logging.py
    │   │       │           ├── pipeline_builders.py
    │   │       │           ├── pipeline_runners.py
    │   │       │           └── stage_builders.py
    │   │   ├── schemas
    │   │       ├── __init__.py
    │   │       ├── framework_ingest_config_schema.py
    │   │       ├── framework_job_counter_schema.py
    │   │       ├── framework_message_broker_sink_schema.py
    │   │       ├── framework_message_broker_source_schema.py
    │   │       ├── framework_message_wrapper_schema.py
    │   │       ├── framework_metadata_injector_schema.py
    │   │       ├── framework_otel_meter_schema.py
    │   │       ├── framework_otel_tracer_schema.py
    │   │       ├── framework_processing_job_schema.py
    │   │       ├── framework_task_injection_schema.py
    │   │       └── framework_vdb_task_sink_schema.py
    │   │   └── util
    │   │       ├── __init__.py
    │   │       ├── flow_control
    │   │           ├── __init__.py
    │   │           └── filter_by_task.py
    │   │       ├── service
    │   │           ├── __init__.py
    │   │           ├── impl
    │   │           │   ├── __init__.py
    │   │           │   └── ingest
    │   │           │   │   ├── __init__.py
    │   │           │   │   └── redis_ingest_service.py
    │   │           └── meta
    │   │           │   ├── __init__.py
    │   │           │   └── ingest
    │   │           │       ├── __init__.py
    │   │           │       └── ingest_service_meta.py
    │   │       └── telemetry
    │   │           ├── __init__.py
    │   │           └── global_stats.py
    └── util
    │   ├── centroid_testing.py
    │   ├── gen_dataset.py
    │   ├── image_model_validation
    │       ├── __init__.py
    │       ├── cached.py
    │       ├── deplot.py
    │       ├── paddle.py
    │       └── util.py
    │   ├── image_viewer.py
    │   ├── mp_pool_test.py
    │   ├── ray_pool_test.py
    │   └── trt_converters.py
└── tests
    ├── __init__.py
    ├── functional
        ├── __init__.py
        └── test_ingest_pipeline.py
    ├── import_checks.py
    ├── integration
        ├── conftest.py
        ├── test_examples.py
        ├── test_extract_audio.py
        ├── test_extract_docx.py
        ├── test_extract_images.py
        ├── test_extract_pdf.py
        ├── test_extract_pptx.py
        └── utilities_for_test.py
    ├── service_tests
        ├── __init__.py
        ├── modules
        │   ├── __init__.py
        │   ├── injectors
        │   │   ├── __init__.py
        │   │   ├── test_metadata_injection.py
        │   │   └── test_task_injector.py
        │   ├── sinks
        │   │   └── __init__.py
        │   ├── sources
        │   │   ├── __init__.py
        │   │   └── test_message_broker_task_source.py
        │   ├── storages
        │   │   ├── __init__.py
        │   │   └── test_image_storage.py
        │   └── telemetry
        │   │   ├── __init__.py
        │   │   └── test_otel_tracer.py
        ├── schemas
        │   ├── __init__.py
        │   ├── test_audio_extractor_schema.py
        │   ├── test_chart_extractor_schema.py
        │   ├── test_image_caption_extraction_schema.py
        │   ├── test_image_dedup_schema.py
        │   ├── test_image_extrator_schema.py
        │   ├── test_image_filter_schema.py
        │   ├── test_ingest_metadata.py
        │   ├── test_injection_schema.py
        │   ├── test_job_counter_schema.py
        │   ├── test_metadata_injector_schema.py
        │   ├── test_metadata_schema.py
        │   ├── test_otel_meter_schema.py
        │   ├── test_otel_tracer_schema.py
        │   ├── test_redis_client_schema.py
        │   ├── test_redis_task_sink_schema.py
        │   ├── test_redis_task_source_schema.py
        │   ├── test_table_extractor_schema.py
        │   └── test_text_splitter_schema.py
        ├── stages
        │   └── __init__.py
        └── util
        │   ├── __init__.py
        │   ├── flow_control
        │       ├── __init__.py
        │       └── test_filter_by_task.py
        │   ├── modules
        │       ├── __init__.py
        │       └── test_config_validator.py
        │   └── telemetry
        │       ├── __init__.py
        │       └── test_global_stats.py
    └── utilities_for_test.py


/.devcontainer/Dockerfile:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | # syntax=docker/dockerfile:1.3
 5 | 
 6 | ARG BASE_IMG=nvcr.io/nvidia/cuda
 7 | ARG BASE_IMG_TAG=12.4.1-base-ubuntu22.04
 8 | 
 9 | # Base stage: NVIDIA CUDA image selected by the BASE_IMG / BASE_IMG_TAG build args
10 | FROM $BASE_IMG:$BASE_IMG_TAG AS base
11 | 
12 | ARG RELEASE_TYPE="dev"
13 | ARG VERSION=""
14 | ARG VERSION_REV="0"
15 | 
16 | # Install necessary dependencies using apt-get, then remove the apt cache and
17 | # package lists in the same layer so they are not baked into the image
18 | RUN apt-get update && apt-get install -y \
19 |       wget \
20 |       bzip2 \
21 |       ca-certificates \
22 |       curl \
23 |       libgl1-mesa-glx \
24 |       vim \
25 |       git \
26 |     && apt-get clean \
27 |     && rm -rf /var/lib/apt/lists/*
28 | 
29 | # Download the Miniforge installer to /tmp, install conda into /opt/conda, then delete the installer
30 | RUN wget "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" -O /tmp/miniforge.sh \
31 |     && bash /tmp/miniforge.sh -b -p /opt/conda \
32 |     && rm /tmp/miniforge.sh
33 | 
34 | # Add conda to the PATH
35 | ENV PATH=/opt/conda/bin:$PATH
36 | 
37 | # Install Mamba, a faster alternative to conda, within the base environment
38 | RUN conda install -y mamba -n base -c conda-forge
39 | 
40 | COPY conda/environments/nv_ingest_environment.yml /workspace/nv_ingest_environment.yml
41 | 
42 | # Create nv_ingest base environment
43 | RUN mamba env create -f /workspace/nv_ingest_environment.yml \
44 |     && conda clean --all --yes
45 | 
46 | # Set default shell to bash
47 | SHELL ["/bin/bash", "-c"]
48 | 
49 | # Activate the environment by default in interactive bash shells. ~/.bashrc is not
50 | # read by the non-interactive `bash -c` used for RUN, so later RUN steps are unaffected.
51 | # NOTE(review): assumes nv_ingest_environment.yml names the environment nv_ingest_runtime - confirm.
52 | RUN echo "source activate nv_ingest_runtime" >> ~/.bashrc
53 | 


--------------------------------------------------------------------------------
/.devcontainer/README.md:
--------------------------------------------------------------------------------
 1 | <!--
 2 | SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 | SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | Licensed under the Apache License, Version 2.0 (the "License");
 6 | you may not use this file except in compliance with the License.
 7 | You may obtain a copy of the License at
 8 | 
 9 | http://www.apache.org/licenses/LICENSE-2.0
10 | 
11 | Unless required by applicable law or agreed to in writing, software
12 | distributed under the License is distributed on an "AS IS" BASIS,
13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | See the License for the specific language governing permissions and
15 | limitations under the License.
16 | -->
17 | 
18 | # NV-Ingest Devcontainer
19 | 
20 | The nv-ingest devcontainer is provided as a quick-to-set-up development and exploration environment for use with [Visual Studio Code](https://code.visualstudio.com) (Code). The devcontainer is a lightweight container that mounts a Conda environment with cached packages, avoiding long Conda download times on subsequent launches. It provides a simple framework for adding developer-centric [scripts](#development-scripts), and incorporates some helpful Code plugins.
21 | 
22 | > [!Note]
23 | > NV-Ingest is also known as NVIDIA Ingest and NeMo Retriever extraction.
24 | 
25 | More information about devcontainers can be found at [`containers.dev`](https://containers.dev/).
26 | 
27 | ## Getting Started
28 | 
29 | To get started, simply open the nv-ingest repository root folder within Code. A window should appear at the bottom-right corner of the editor asking if you would like to reopen the workspace inside of the dev container. After clicking the confirmation dialog, the container will first build, then launch, then remote-attach.
30 | 
31 | If the window does not appear, or you would like to rebuild the container, press Ctrl+Shift+P (Cmd+Shift+P on macOS) and search for `Dev Containers: Rebuild and Reopen in Container`. Press Enter, and the container will first build, then launch, then remote-attach.
32 | 


--------------------------------------------------------------------------------
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
 1 | # Code owners for the nv-ingest repository.
 2 | # NOTE(review): GitHub uses the LAST matching pattern in this file, so the
 3 | # catch-all `*` rule at the bottom currently takes precedence over every
 4 | # path-specific rule above it. Confirm that this is the intended policy.
 5 | 
 6 | # Documentation and examples
 7 | docs/                    @NVIDIA/nv-ingest-docs
 8 | README.md                @NVIDIA/nv-ingest-docs
 9 | examples/                @NVIDIA/nv-ingest-docs
10 | 
11 | # Devops
12 | .devcontainer/           @NVIDIA/nv-ingest-ops
13 | .github/                 @NVIDIA/nv-ingest-ops
14 | ci/                      @NVIDIA/nv-ingest-ops
15 | 
16 | # Global owners (required for all PRs)
17 | *                        @NVIDIA/nv-ingest-maintainers
18 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
 1 | # GitHub info on config.yml
 2 | # https://docs.github.com/en/communities/using-templates-to-encourage-useful-issues-and-pull-requests/configuring-issue-templates-for-your-repository#configuring-the-template-chooser
 3 | # Set to 'false' if you only want the templates to be used.
 4 | blank_issues_enabled: true
 5 | 
 6 | # When using discussions instead of Question issue templates,
 7 | # link that below to have it show up in the 'Submit Issue' page
 8 | contact_links:
 9 |   - name: Ask a Question
10 |     url: https://github.com/nvidia/nv-ingest/discussions
11 |     about: Please ask any questions here.
12 | 


--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
 1 | ## Description
 2 | <!-- Provide a standalone description of changes in this PR. -->
 3 | <!-- Reference any issues closed by this PR with "closes #1234". -->
 4 | <!-- Note: The pull request title will be included in the CHANGELOG. -->
 5 | 
 6 | ## Checklist
 7 | - [ ] I am familiar with the [Contributing Guidelines](https://github.com/NVIDIA/nv-ingest/blob/main/CONTRIBUTING.md).
 8 | - [ ] New or existing tests cover these changes.
 9 | - [ ] The documentation is up to date with these changes.
10 | - [ ] If this PR adjusts docker-compose.yaml environment variables, the same changes are mirrored in the Helm values.yaml file.
11 | 


--------------------------------------------------------------------------------
/.github/copy-pr-bot.yaml:
--------------------------------------------------------------------------------
1 | # Configuration file for `copy-pr-bot` GitHub App
2 | # https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/
3 | 
4 | enabled: true
5 | 


--------------------------------------------------------------------------------
/.github/workflows/conda-publish.yml:
--------------------------------------------------------------------------------
 1 | name: Nv-Ingest Conda Package Publish
 2 | 
 3 | # Trigger nightly (cron), on pushes to main, and on manual dispatch
 4 | on:
 5 |   schedule:
 6 |     # Runs every day at 11:30PM (UTC)
 7 |     - cron: "30 23 * * *"
 8 |   push:
 9 |     branches:
10 |       - main
11 |   workflow_dispatch:
12 |     inputs:
13 |       CONDA_CHANNEL:
14 |         description: 'The RapidsAI Conda channel the package should be pushed to'
15 |         required: true
16 |         type: choice
17 |         options:
18 |           - dev
19 |           - main
20 |       VERSION:
21 |         description: 'Version string for the release (e.g., 1.0.0)'
22 |         required: true
23 | 
24 | jobs:
25 |   build:
26 |     runs-on: linux-large-disk
27 |     container:
28 |       image: rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.10
29 |     steps:
30 |       - name: Checkout code
31 |         uses: actions/checkout@v4
32 |         with:
33 |           ref: main
34 | 
35 |       - name: Determine CONDA_CHANNEL
36 |         run: |
37 |           echo "Github event_name: ${{ github.event_name }}"
38 |           if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
39 |             echo "Setting Conda channel to ${{ github.event.inputs.CONDA_CHANNEL }}"
40 |             echo "CONDA_CHANNEL=${{ github.event.inputs.CONDA_CHANNEL }}" >> $GITHUB_ENV
41 |           else
42 |             echo "CONDA_CHANNEL=dev" >> $GITHUB_ENV
43 |           fi
44 | 
45 |       # Build the Conda packages
46 |       - name: Build Conda Packages
47 |         run: |
48 |           echo "Github event_name: ${{ github.event_name }}"
49 |           if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
50 |             echo "Building conda package for ${{ github.event.inputs.VERSION }}"
51 |             RELEASE_VERSION="${{ github.event.inputs.VERSION }}" ./conda/build_conda_packages.sh
52 |           else
53 |             ./conda/build_conda_packages.sh
54 |           fi
55 | 
56 |       # Publish nv-ingest conda packages
57 |       - name: Publish conda package
58 |         run: anaconda -t "${{ secrets.NVIDIA_CONDA_TOKEN }}" upload --force --label $CONDA_CHANNEL ./conda/output_conda_channel/linux-64/*.conda
59 | 


--------------------------------------------------------------------------------
/.github/workflows/docker-build.yml:
--------------------------------------------------------------------------------
 1 | name: Build NV-Ingest Runtime Image
 2 | 
 3 | # Trigger for pull requests and pushing to main
 4 | on:
 5 |   pull_request:
 6 |     types:
 7 |       - opened
 8 |       - synchronize
 9 |       - reopened
10 |   push:
11 |     branches:
12 |       - main
13 | 
14 |   # Allows you to run this workflow manually from the Actions tab
15 |   workflow_dispatch:
16 | 
17 | jobs:
18 |   build:
19 |     runs-on: linux-large-disk
20 | 
21 |     steps:
22 |       - name: Checkout code
23 |         uses: actions/checkout@v4
24 |         with:
25 |           ref: main
26 | 
27 |       - name: Get current date (yyyy.mm.dd)
28 |         run: echo "CURRENT_DATE=$(date +'%Y.%m.%d')" >> $GITHUB_ENV
29 | 
30 |       # Set up Docker Buildx, useful for building multi-platform images
31 |       - name: Set up Docker Buildx
32 |         uses: docker/setup-buildx-action@v3
33 | 
34 |       # Build the Docker image using the Dockerfile
35 |       - name: Build Docker image
36 |         run: |
37 |           docker build --target runtime --build-arg GIT_COMMIT=${GITHUB_SHA} -t nv-ingest:latest .
38 | 
39 |       - name: Run Pytest inside Docker container
40 |         run: |
41 |           docker run nv-ingest:latest pytest -rs -m "not integration" --cov nv_ingest --cov nv_ingest_client --cov nv_ingest_api --cov-report term --cov-report xml:coverage.xml tests/service_tests client/client_tests api/api_tests
42 | 
43 |       - name: Upload test report
44 |         uses: actions/upload-artifact@v4
45 |         with:
46 |           name: pytest-report
47 |           path: coverage.xml
48 | 


--------------------------------------------------------------------------------
/.github/workflows/docker-nightly-publish.yml:
--------------------------------------------------------------------------------
 1 | name: Nv-Ingest Nightly Container Publish
 2 | 
 3 | # Trigger nightly (cron), on pushes to main, and on manual dispatch
 4 | on:
 5 |   schedule:
 6 |     # Runs every day at 11:30PM (UTC)
 7 |     - cron: "30 23 * * *"
 8 |   push:
 9 |     branches:
10 |       - main
11 |   # Allows you to run this workflow manually from the Actions tab
12 |   workflow_dispatch:
13 | 
14 | jobs:
15 |   build:
16 |     runs-on: linux-large-disk
17 | 
18 |     steps:
19 |       - name: Checkout code
20 |         uses: actions/checkout@v4
21 |         with:
22 |           ref: main
23 | 
24 |       - name: Get current date (yyyy.mm.dd)
25 |         run: echo "CURRENT_DATE=$(date +'%Y.%m.%d')" >> $GITHUB_ENV
26 | 
27 |       # Set up Docker Buildx, useful for building multi-platform images
28 |       - name: Set up Docker Buildx
29 |         uses: docker/setup-buildx-action@v3
30 | 
31 |       # Build the Docker image using the Dockerfile
32 |       - name: Build Docker image
33 |         run: |
34 |           docker build --target runtime --build-arg GIT_COMMIT=${GITHUB_SHA} -t ${{ secrets.DOCKER_REGISTRY }}/nv-ingest:${{ env.CURRENT_DATE }} .
35 | 
36 |       # Login to NGC
37 |       - name: Log in to NGC Registry
38 |         run: echo "${{ secrets.DOCKER_PASSWORD }}" | docker login nvcr.io --username "\$oauthtoken" --password-stdin
39 | 
40 |       # Push the container to NGC
41 |       - name: Upload nv-ingest container
42 |         run: docker push ${{ secrets.DOCKER_REGISTRY }}/nv-ingest:${{ env.CURRENT_DATE }}
43 | 


--------------------------------------------------------------------------------
/.github/workflows/docker-release-publish.yml:
--------------------------------------------------------------------------------
 1 | name: Build and Push Docker Image
 2 | 
 3 | on:
 4 |   create:
 5 |     branches:
 6 |       - release/*
 7 | 
 8 | jobs:
 9 |   build:
10 |     runs-on: linux-large-disk
11 |     if: startsWith(github.ref, 'refs/heads/release/')  # `create` has no branch filter; skip tags and non-release branches
12 |     steps:
13 |       - name: Checkout code
14 |         uses: actions/checkout@v4
15 | 
16 |       # Extract branch name after "release/"
17 |       - name: Extract branch name
18 |         id: extract_branch
19 |         run: |
20 |           BRANCH_NAME=${GITHUB_REF#refs/heads/release/}
21 |           echo "SHORT_BRANCH_NAME=${BRANCH_NAME}" >> $GITHUB_ENV
22 | 
23 |       # Set up Docker Buildx, useful for building multi-platform images
24 |       - name: Set up Docker Buildx
25 |         uses: docker/setup-buildx-action@v3
26 | 
27 |       # Build the Docker image using the Dockerfile
28 |       - name: Build Docker image
29 |         run: |
30 |           docker build --target runtime --build-arg GIT_COMMIT=${GITHUB_SHA} -t ${{ secrets.DOCKER_REGISTRY }}/nv-ingest:${{ env.SHORT_BRANCH_NAME }} .
31 | 
32 |       # Login to NGC
33 |       - name: Log in to NGC Registry
34 |         run: echo "${{ secrets.DOCKER_PASSWORD }}" | docker login nvcr.io --username "\$oauthtoken" --password-stdin
35 | 
36 |       # Push the container to NGC
37 |       - name: Upload nv-ingest container
38 |         run: docker push ${{ secrets.DOCKER_REGISTRY }}/nv-ingest:${{ env.SHORT_BRANCH_NAME }}
39 | 


--------------------------------------------------------------------------------
/.github/workflows/pre-commit.yml:
--------------------------------------------------------------------------------
 1 | name: nv-ingest pre-commit
 2 | 
 3 | on:
 4 |     pull_request:
 5 |     push:
 6 |       branches: [main]
 7 | 
 8 | jobs:
 9 |   lint:
10 |     runs-on: ubuntu-latest
11 |     steps:
12 |       - name: Check out repository code
13 |         uses: actions/checkout@v4
14 |         with:
15 |           ref: main
16 |       - uses: actions/setup-python@v3
17 |       - uses: pre-commit/action@v3.0.1
18 | 


--------------------------------------------------------------------------------
/.github/workflows/pypi-nightly-publish.yml:
--------------------------------------------------------------------------------
 1 | name: Nv-Ingest Nightly PyPi Wheel Publish
 2 | 
 3 | # Trigger nightly (cron) and on manual dispatch
 4 | on:
 5 |   schedule:
 6 |     # Runs every day at 11:30PM (UTC)
 7 |     - cron: "30 23 * * *"
 8 |   workflow_dispatch:
 9 | 
10 | jobs:
11 |   build:
12 |     runs-on: linux-large-disk
13 |     container:
14 |       image: rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.10
15 |     steps:
16 |       - name: Checkout code
17 |         uses: actions/checkout@v4
18 |         with:
19 |           ref: main
20 | 
21 |       - name: Install build dependencies
22 |         run: |
23 |           pip install build twine
24 | 
25 |       - name: Build nv-ingest-api wheel
26 |         run: |
27 |           cd api && python -m build
28 | 
29 |       - name: Build nv-ingest-client wheel
30 |         run: |
31 |           cd client && python -m build
32 | 
33 |       - name: Publish wheels to Artifactory
34 |         env:
35 |           ARTIFACTORY_URL: ${{ secrets.ARTIFACTORY_URL }}
36 |           ARTIFACTORY_USERNAME: ${{ secrets.ARTIFACTORY_USERNAME }}
37 |           ARTIFACTORY_PASSWORD: ${{ secrets.ARTIFACTORY_PASSWORD }}
38 |         run: |
39 |           twine upload --repository-url $ARTIFACTORY_URL -u $ARTIFACTORY_USERNAME -p $ARTIFACTORY_PASSWORD api/dist/* \
40 |           && twine upload --repository-url $ARTIFACTORY_URL -u $ARTIFACTORY_USERNAME -p $ARTIFACTORY_PASSWORD client/dist/*
41 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 |   - repo: https://github.com/pre-commit/pre-commit-hooks
 3 |     rev: v5.0.0
 4 |     hooks:
 5 |     -   id: trailing-whitespace
 6 |         exclude: '(^(docs|data)/|\.md$)'
 7 |     -   id: end-of-file-fixer
 8 |     -   id: check-added-large-files
 9 |         args: ["--maxkb=1500"]  # one token, no space after "--", so the hook parses it as the size limit
10 |     -   id: check-ast
11 |     -   id: debug-statements
12 | 
13 |   - repo: https://github.com/psf/black
14 |     rev: 24.10.0
15 |     hooks:
16 |       - id: black
17 |         args: ["--line-length=120"]
18 | 
19 |   - repo: https://github.com/PyCQA/flake8
20 |     rev: 7.1.1
21 |     hooks:
22 |       - id: flake8
23 |         args: ["--max-line-length=120", "--extend-ignore=E203,E266,F403,F405"]
24 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # NVIDIA Ingest 24.08.0
 2 | 
 3 | ## New Features
 4 | 
 5 | - ...
 6 | 
 7 | ## Improvements
 8 | 
 9 | - ...
10 | 
11 | ## Bug Fixes
12 | 
13 | - ...
14 | 


--------------------------------------------------------------------------------
/CITATION.md:
--------------------------------------------------------------------------------
 1 | # Citation Guide
 2 | 
 3 | ## To Cite NVIDIA Ingest
 4 | If you use NVIDIA Ingest in a publication, please use citations in the following format (BibTeX entry for LaTeX):
 5 | ```tex
 6 | @Manual{,
 7 |   title = {NVIDIA Ingest: An accelerated pipeline for document ingestion},
 8 |   author = {NVIDIA Ingest Development Team},
 9 |   year = {2024},
10 |   url = {https://github.com/NVIDIA/nv-ingest},
11 | }
12 | ```
13 | 
14 | 
15 | ## Sample Citations:
16 | 
17 | Using [RAPIDS](https://rapids.ai/) citations for reference.
18 | 
19 | ### Bringing UMAP Closer to the Speed of Light <br> with GPU Acceleration
20 | ```tex
21 | @misc{
22 |       nolet2020bringing,
23 |       title={Bringing UMAP Closer to the Speed of Light with GPU Acceleration},
24 |       author={Corey J. Nolet, Victor Lafargue, Edward Raff, Thejaswi Nanditale, Tim Oates, John Zedlewski, and Joshua Patterson},
25 |       year={2020},
26 |       eprint={2008.00325},
27 |       archivePrefix={arXiv},
28 |       primaryClass={cs.LG}
29 | }
30 | ```
31 | 
32 | ### Machine Learning in Python: <br> Main developments and technology trends in data science, machine learning, and artificial intelligence
33 | ```tex
34 | @article{
35 |   raschka2020machine,
36 |   title={Machine Learning in Python: Main developments and technology trends in data science, machine learning, and artificial intelligence},
37 |   author={Raschka, Sebastian and Patterson, Joshua and Nolet, Corey},
38 |   journal={Information},
39 |   volume={11},
40 |   number={4},
41 |   pages={193},
42 |   year={2020},
43 |   publisher={Multidisciplinary Digital Publishing Institute}
44 | }
45 | ```
46 | 


--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
 1 | ## Security
 2 | 
 3 | NVIDIA is dedicated to the security and trust of our software products and services, including all source code repositories managed through our organization.
 4 | 
 5 | If you need to report a security issue, please use the appropriate contact points outlined below. **Please do not report security vulnerabilities through GitHub.**
 6 | 
 7 | ## Reporting Potential Security Vulnerability in an NVIDIA Product
 8 | 
 9 | To report a potential security vulnerability in any NVIDIA product:
10 | - Web: [Security Vulnerability Submission Form](https://www.nvidia.com/object/submit-security-vulnerability.html)
11 | - E-Mail: psirt@nvidia.com
12 |     - We encourage you to use the following PGP key for secure email communication: [NVIDIA public PGP Key for communication](https://www.nvidia.com/en-us/security/pgp-key)
13 |     - Please include the following information:
14 |         - Product/Driver name and version/branch that contains the vulnerability
15 |         - Type of vulnerability (code execution, denial of service, buffer overflow, etc.)
16 |         - Instructions to reproduce the vulnerability
17 |         - Proof-of-concept or exploit code
18 |         - Potential impact of the vulnerability, including how an attacker could exploit the vulnerability
19 | 
20 | While NVIDIA currently does not have a bug bounty program, we do offer acknowledgement when an externally reported security issue is addressed under our coordinated vulnerability disclosure policy. Please visit our [Product Security Incident Response Team (PSIRT)](https://www.nvidia.com/en-us/security/psirt-policies/) policies page for more information.
21 | 
22 | ## NVIDIA Product Security
23 | 
24 | For all security-related concerns, please visit NVIDIA's Product Security portal at https://www.nvidia.com/en-us/security
25 | 


--------------------------------------------------------------------------------
/api/MANIFEST.in:
--------------------------------------------------------------------------------
1 | exclude *.egg-info
2 | 
3 | include README.md
4 | include LICENSE
5 | recursive-include src *
6 | global-exclude __pycache__
7 | global-exclude *.pyc
8 | 


--------------------------------------------------------------------------------
/api/README.md:
--------------------------------------------------------------------------------
 1 | # nv-ingest-api
 2 | 
 3 | Provides a common set of
 4 | 
 5 | - Pythonic Objects
 6 | - Common Functions
 7 | - Utilities
 8 | - Core Logic
 9 | 
10 | Implemented in pure Python that can be imported and used directly or used as part of future frameworks and runtimes.
11 | 


--------------------------------------------------------------------------------
/api/api_tests/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 
5 | from .utilities_for_test import *
6 | 


--------------------------------------------------------------------------------
/api/api_tests/import_checks.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | 
 6 | def check_morpheus_import():
 7 |     try:
 8 |         import morpheus
 9 | 
10 |         _ = morpheus._version
11 | 
12 |         return True
13 |     except Exception as e:
14 |         print(f"\nError: {e}\n", flush=True)
15 |         return False
16 | 
17 | 
18 | def check_cuda_driver():
19 |     try:
20 |         import cupy
21 | 
22 |         import cudf
23 | 
24 |         _ = cupy.cuda.runtime.driverGetVersion()
25 |         _ = cudf.DataFrame({"a": [1, 2, 3]})
26 |         return True
27 |     except Exception as e:
28 |         print(f"\nError: {e}\n", flush=True)
29 |         return False
30 | 
31 | 
32 | def check_adobe_import():
33 |     try:
34 |         pass  # NOTE(review): nothing is imported here, so ImportError cannot be raised
35 |         # and this function always returns True - confirm the import was removed intentionally.
36 |         return True
37 |     except ImportError:
38 |         return False
39 | 
40 | 
41 | ADOBE_IMPORT_OK = check_adobe_import()
42 | CUDA_DRIVER_OK = check_cuda_driver()
43 | MORPHEUS_IMPORT_OK = check_morpheus_import()
44 | 


--------------------------------------------------------------------------------
/api/api_tests/interface/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/api_tests/internal/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/api/api_tests/internal/__init__.py


--------------------------------------------------------------------------------
/api/api_tests/internal/extract/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/api/api_tests/internal/extract/__init__.py


--------------------------------------------------------------------------------
/api/api_tests/internal/extract/audio/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/api_tests/internal/extract/docx/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/api_tests/internal/extract/image/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/api_tests/internal/extract/image/test_image_extractor.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/api_tests/internal/extract/image/test_infographic_extractor.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/api_tests/internal/extract/image/test_table_extractor.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/api_tests/internal/mutate/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/api/api_tests/internal/mutate/__init__.py


--------------------------------------------------------------------------------
/api/api_tests/primitives/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/api_tests/primitives/nim/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/api/api_tests/primitives/nim/__init__.py


--------------------------------------------------------------------------------
/api/api_tests/primitives/nim/model_interface/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/api_tests/primitives/tracing/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/api_tests/util/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/api_tests/util/converters/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/api_tests/util/converters/test_type_mappings.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | import pytest
 6 | 
 7 | from nv_ingest_api.internal.enums.common import ContentTypeEnum, DocumentTypeEnum
 8 | from nv_ingest_api.util.converters.type_mappings import doc_type_to_content_type
 9 | 
10 | 
11 | @pytest.mark.parametrize(
12 |     "doc_type, expected_content_type",
13 |     [
14 |         (DocumentTypeEnum.BMP, ContentTypeEnum.IMAGE),
15 |         (DocumentTypeEnum.DOCX, ContentTypeEnum.STRUCTURED),
16 |         (DocumentTypeEnum.HTML, ContentTypeEnum.TEXT),
17 |         (DocumentTypeEnum.JPEG, ContentTypeEnum.IMAGE),
18 |         (DocumentTypeEnum.PDF, ContentTypeEnum.STRUCTURED),
19 |         (DocumentTypeEnum.PNG, ContentTypeEnum.IMAGE),
20 |         (DocumentTypeEnum.PPTX, ContentTypeEnum.STRUCTURED),
21 |         (DocumentTypeEnum.SVG, ContentTypeEnum.IMAGE),
22 |         (DocumentTypeEnum.TXT, ContentTypeEnum.TEXT),
23 |     ],
24 | )
25 | def test_doc_type_to_content_type_valid(doc_type, expected_content_type):
26 |     """
27 |     Test doc_type_to_content_type function with valid document types.
28 |     """
29 |     assert (
30 |         doc_type_to_content_type(doc_type) == expected_content_type
31 |     ), f"doc_type {doc_type} should map to content type {expected_content_type}"
32 | 
33 | 
34 | def test_doc_type_to_content_type_invalid():
35 |     """
36 |     Test doc_type_to_content_type function with an invalid document type.
37 |     """
38 |     invalid_doc_type = "invalid_doc_type"  # Assume this is not a valid DocumentTypeEnum value
39 |     with pytest.raises(KeyError):
40 |         doc_type_to_content_type(invalid_doc_type)
41 | 


--------------------------------------------------------------------------------
/api/api_tests/util/detectors/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/api_tests/util/detectors/test_language.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | import pytest
 6 | from langdetect import DetectorFactory
 7 | 
 8 | from nv_ingest_api.util.detectors.language import LanguageEnum
 9 | from nv_ingest_api.util.detectors.language import detect_language
10 | 
11 | # Ensure langdetect produces consistent results
12 | DetectorFactory.seed = 0
13 | 
14 | 
15 | @pytest.mark.parametrize(
16 |     "text, expected_language",
17 |     [
18 |         ("This is an English text.", LanguageEnum.EN),
19 |         ("Este es un texto en español.", LanguageEnum.ES),
20 |         # Add more examples as needed
21 |     ],
22 | )
23 | def test_detect_language_known_languages(text, expected_language):
24 |     """
25 |     Test detect_language function with text in known languages.
26 |     """
27 |     assert detect_language(text) == expected_language
28 | 
29 | 
30 | def test_detect_language_unknown_language():
31 |     """
32 |     Test that detect_language returns LanguageEnum.UNKNOWN for text with no identifiable language.
33 |     """
34 |     unknown_text = "1234"  # Digits only; presumably gives the detector no language features to work with
35 |     assert detect_language(unknown_text) == LanguageEnum.UNKNOWN
36 | 
37 | 
38 | @pytest.mark.parametrize(
39 |     "invalid_input",
40 |     [
41 |         123,  # Non-string input
42 |         None,  # NoneType
43 |     ],
44 | )
45 | def test_detect_language_invalid_input(invalid_input):
46 |     """
47 |     Test detect_language function with invalid inputs.
48 |     """
49 |     # Non-string inputs are expected to raise TypeError rather than be mapped to LanguageEnum.UNKNOWN
50 |     with pytest.raises(TypeError):
51 |         detect_language(invalid_input)
52 | 


--------------------------------------------------------------------------------
/api/api_tests/util/exception_handlers/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/api_tests/util/exception_handlers/test_converters.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | from datetime import datetime
 6 | from datetime import timedelta
 7 | from datetime import timezone
 8 | 
 9 | from nv_ingest_api.util.converters.datetools import datetools_exception_handler
10 | from nv_ingest_api.util.converters.datetools import remove_tz
11 | 
12 | 
# Helper callables wrapped by the decorator under test.
# They are deliberately NOT named ``test_*``: pytest collects every module-level
# ``test_*`` function as a test case, and the success helper returns a value,
# which pytest reports as ``PytestReturnNotNoneWarning``.
@datetools_exception_handler
def _func_raises_exception():
    """Always raise, so the decorator's exception path is exercised."""
    raise ValueError("Test exception")


@datetools_exception_handler
def _func_success():
    """Return a sentinel, so the decorator's pass-through path is exercised."""
    return "Success"


def test_datetools_exception_handler_with_exception():
    """
    Test the decorator with a function that raises an exception,
    checking that the returned date is within a few minutes of the current time.
    """
    start_time = remove_tz(datetime.now(timezone.utc))

    result = _func_raises_exception()

    # The decorator's return value is parsed as an ISO-8601 string for comparison
    result_datetime = datetime.fromisoformat(result)

    end_time = remove_tz(datetime.now(timezone.utc))

    # Allow a generous tolerance around the window in which the call happened
    time_delta = timedelta(minutes=5)

    assert (
        start_time - time_delta
    ) <= result_datetime, "The returned datetime should be within a few minutes of the current time"
    assert result_datetime <= (
        end_time + time_delta
    ), "The returned datetime should be within a few minutes of the current time"


def test_datetools_exception_handler_without_exception():
    """
    Test the decorator with a function that does not raise an exception.
    """
    result = _func_success()
    assert result == "Success", "Decorator should not interfere with the function's normal execution"
55 | 


--------------------------------------------------------------------------------
/api/api_tests/util/exception_handlers/test_detectors.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | from langdetect.lang_detect_exception import LangDetectException
 6 | 
 7 | from nv_ingest_api.util.exception_handlers.detectors import langdetect_exception_handler
 8 | 
 9 | 
10 | # Sample function to be decorated
def sample_func(text):
    """Stand-in detector: ignores ``text`` and always reports the same label."""
    label = "detected_language"
    return label
13 | 
14 | 
15 | # Sample function that raises LangDetectException
def sample_func_raises_exception(text):
    """
    Stand-in detector that always fails with ``LangDetectException``.

    ``LangDetectException`` is constructed with an error code followed by the
    message; passing only the message would fail with ``TypeError`` before the
    intended exception could be raised.
    """
    # Imported locally so this helper stays self-contained.
    from langdetect.lang_detect_exception import ErrorCode

    raise LangDetectException(ErrorCode.CantDetectError, "No features in text.")
18 | 
19 | 
20 | # Apply the decorator to test functions
@langdetect_exception_handler
def decorated_sample_func(text):
    """Call ``sample_func`` through the langdetect exception handler."""
    detected = sample_func(text)
    return detected
24 | 
25 | 
@langdetect_exception_handler
def decorated_func_raises_exception(text):
    """Call the always-failing helper through the langdetect exception handler."""
    outcome = sample_func_raises_exception(text)
    return outcome
29 | 
30 | 
def test_langdetect_exception_handler_success():
    """
    When the wrapped function does not raise, its return value must come back unchanged.
    """
    detected = decorated_sample_func("Test text")
    failure_note = "The function should return the detected language."
    assert detected == "detected_language", failure_note
37 | 


--------------------------------------------------------------------------------
/api/api_tests/util/exception_handlers/test_pdf.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | import re
 6 | from unittest.mock import patch
 7 | 
 8 | import pytest
 9 | 
10 | from nv_ingest_api.internal.enums.common import StatusEnum, TaskTypeEnum
11 | from nv_ingest_api.util.exception_handlers.pdf import pdfium_exception_handler, create_exception_tag
12 | 
13 | MODULE_UNDER_TEST = "nv_ingest_api.util.exception_handlers.pdf"
14 | 
15 | 
@pdfium_exception_handler(descriptor="pdfium Error")
def sample_func():
    """Always raise, so the decorator's exception path is exercised.

    The function name and the message text both appear in the log line asserted
    by ``test_pdfium_exception_handler``.
    """
    raise Exception("Sample error")
19 | 
20 | 
@pytest.fixture
def mock_logger():
    """Replace the ``logger`` of the module under test with a mock for one test."""
    logger_patch = patch(f"{MODULE_UNDER_TEST}.logger")
    fake_logger = logger_patch.start()
    try:
        yield fake_logger
    finally:
        # Restore the real logger even if the test body fails
        logger_patch.stop()
25 | 
26 | 
def test_pdfium_exception_handler(mock_logger):
    """The handler turns the exception into an empty list and logs one warning."""
    outcome = sample_func()
    expected_log_line = "pdfium Error:sample_func error:Sample error"

    assert outcome == [], "The function should return an empty list on exception."
    mock_logger.warning.assert_called_once_with(expected_log_line)
31 | 
32 | 
def test_create_exception_tag_with_source_id():
    """With a source id, the tag's first entry is ``(None, {... "error_metadata": ...})``."""
    source_id, error_message = "test_id", "test_error"

    tag = create_exception_tag(error_message, source_id=source_id)

    expected_metadata = dict(
        task=TaskTypeEnum.EXTRACT,
        status=StatusEnum.ERROR,
        source_id=source_id,
        error_msg=error_message,
    )

    # Assuming validate_schema function works as intended or is mocked accordingly
    first_entry = tag[0]
    assert first_entry[0] is None
    assert first_entry[1]["error_metadata"] == expected_metadata
48 | 
49 | 
def test_create_exception_tag_without_source_id():
    """Without a source id, schema validation must fail with the exact message below."""
    expected_text = (
        "1 validation error for MetadataSchema\n"
        "error_metadata.source_id\n  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]"  # noqa: W505, E501
    )
    expected_pattern = re.escape(expected_text)

    with pytest.raises(ValueError, match=expected_pattern):
        create_exception_tag("test_error")
61 | 


--------------------------------------------------------------------------------
/api/api_tests/util/exception_handlers/test_schemas.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | from unittest.mock import patch
 6 | 
 7 | import pytest
 8 | from pydantic import BaseModel
 9 | 
10 | from nv_ingest_api.util.exception_handlers.schemas import schema_exception_handler
11 | 
12 | MODULE_UNDER_TEST = "nv_ingest_api.util.exception_handlers.schemas"
13 | 
14 | 
class SimpleModel(BaseModel):
    # Minimal model with one required field; constructing it without ``name``
    # is how ``function_fail`` provokes a validation error.
    name: str
17 | 
18 | 
@schema_exception_handler
def function_success():
    """Return a sentinel, so the decorator's pass-through path is exercised."""
    return "Success"
22 | 
23 | 
@schema_exception_handler
def function_fail():
    """Construct ``SimpleModel`` with no arguments so validation fails inside the decorator."""
    # Intentionally missing the 'name' field to trigger a ValidationError
    SimpleModel()
28 | 
29 | 
def test_schema_exception_handler_success():
    """
    A wrapped function that does not raise must return its value untouched.
    """
    returned = function_success()
    failure_note = "The function should successfully return 'Success'."
    assert returned == "Success", failure_note
36 | 
37 | 
@patch(f"{MODULE_UNDER_TEST}.logger")
def test_schema_exception_handler_with_validation_error(mock_logger):
    """
    A validation failure must be logged once and re-raised as ValueError with the same text.
    """
    expected_error_message = "Invalid configuration: name: Field required"

    with pytest.raises(ValueError) as exc_info:
        function_fail()

    # The message is logged exactly once at error level
    mock_logger.error.assert_called_once_with(expected_error_message)

    # The raised ValueError carries the same message
    raised_text = str(exc_info.value)
    assert raised_text == expected_error_message, "A ValueError with the correct message should be raised."
52 | 


--------------------------------------------------------------------------------
/api/api_tests/util/image_processing/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/api_tests/util/image_processing/test_clustering.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | from nv_ingest_api.util.image_processing.clustering import (
 6 |     boxes_are_close_or_overlap,
 7 |     group_bounding_boxes,
 8 |     combine_groups_into_bboxes,
 9 |     remove_superset_bboxes,
10 | )
11 | 
12 | 
def test_boxes_are_close_or_overlap():
    """The same pair of boxes is rejected at threshold 1 and accepted at threshold 5."""
    # The boxes are 5 units apart on each axis (10 -> 15)
    first, second = [0, 0, 10, 10], [15, 15, 25, 25]

    assert not boxes_are_close_or_overlap(first, second, threshold=1)
    assert boxes_are_close_or_overlap(first, second, threshold=5)
18 | 
19 | 
def test_group_bounding_boxes():
    """Boxes that touch are grouped; a distant box ends up in a group of its own."""
    boxes = [[0, 0, 10, 10], [10, 10, 20, 20], [100, 100, 110, 110]]
    # The first and second boxes meet at (10, 10) and should group together;
    # the third box is far away and should form its own group
    groups = group_bounding_boxes(boxes, threshold=2)
    assert len(groups) == 2
    assert sorted(groups[0]) == [0, 1]
    assert sorted(groups[1]) == [2]
27 | 
28 | 
def test_combine_groups_into_bboxes():
    """Each group yields one box; the two-box group spans both of its members."""
    source_boxes = [[0, 0, 1, 1], [2, 2, 3, 3], [1, 1, 2, 2]]
    index_groups = [[0], [1, 2]]

    merged = combine_groups_into_bboxes(source_boxes, index_groups)

    assert len(merged) == 2
    # Boxes 1 and 2 together cover (1, 1) through (3, 3)
    assert merged[1] == [1, 1, 3, 3]
35 | 
36 | 
def test_remove_superset_bboxes():
    """The box that contains the others is dropped; the contained boxes are kept."""
    bboxes = [[0, 0, 10, 10], [2, 2, 4, 4], [3, 3, 5, 5]]
    # The first box contains both smaller boxes, so it is the superset expected
    # to be removed; the two smaller boxes overlap but neither contains the other
    out = remove_superset_bboxes(bboxes)
    assert len(out) == 2
    assert [0, 0, 10, 10] not in out
43 | 


--------------------------------------------------------------------------------
/api/api_tests/util/logging/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/api_tests/util/message_brokers/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/api_tests/util/message_brokers/redis/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/api_tests/util/message_brokers/simple_message_broker/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/api_tests/util/metadata/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/api_tests/util/schema/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["setuptools", "wheel"]
 3 | build-backend = "setuptools.build_meta"
 4 | 
 5 | [project]
 6 | name = "nv-ingest-api"
 7 | description = "Python module with core document ingestion functions."
 8 | dynamic = ["version"]  # Declare attrs that will be generated at build time
 9 | readme = "README.md"
10 | authors = [
11 |     {name = "Jeremy Dyer", email = "jdyer@nvidia.com"}
12 | ]
13 | license = {file = "LICENSE"}
classifiers = [
    "Programming Language :: Python :: 3",
    # Keep in sync with the Apache-2.0 SPDX identifier used in the source headers.
    "License :: OSI Approved :: Apache Software License",
    "Operating System :: OS Independent",
]
19 | dependencies = [
20 |     "pandas>=2.0",
21 |     "pydantic>2.0.0",
22 |     "pydantic-settings>2.0.0",
23 | ]
24 | 
25 | [project.urls]
26 | homepage = "https://github.com/NVIDIA/nv-ingest"
27 | repository = "https://github.com/NVIDIA/nv-ingest"
28 | documentation = "https://docs.nvidia.com/nv-ingest"
29 | 
30 | [tool.setuptools.packages.find]
31 | where = ["src"]
32 | 
33 | [tool.setuptools.dynamic]
34 | version = {attr = "version.get_version"}
35 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/api/src/nv_ingest_api/internal/__init__.py


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/enums/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/extract/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/extract/audio/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/extract/docx/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 
5 | # Copyright (c) 2024, NVIDIA CORPORATION.
6 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/extract/docx/engines/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/api/src/nv_ingest_api/internal/extract/docx/engines/__init__.py


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/extract/image/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/extract/image/image_helpers/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/extract/pdf/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/extract/pdf/engines/__init__.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | from .adobe import adobe_extractor
 6 | from .llama import llama_parse_extractor
 7 | from .nemoretriever import nemoretriever_parse_extractor
 8 | from .pdfium import pdfium_extractor
 9 | from .tika import tika_extractor
10 | from .unstructured_io import unstructured_io_extractor
11 | 
12 | __all__ = [
13 |     "adobe_extractor",
14 |     "llama_parse_extractor",
15 |     "nemoretriever_parse_extractor",
16 |     "pdfium_extractor",
17 |     "tika_extractor",
18 |     "unstructured_io_extractor",
19 | ]
20 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/extract/pptx/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 
5 | # Copyright (c) 2024, NVIDIA CORPORATION.
6 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/extract/pptx/engines/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/api/src/nv_ingest_api/internal/extract/pptx/engines/__init__.py


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/mutate/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/primitives/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/api/src/nv_ingest_api/internal/primitives/__init__.py


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/primitives/control_message_task.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | from uuid import UUID
 6 | 
 7 | from pydantic import BaseModel, Field, ConfigDict
 8 | from typing import Any, Dict, Union
 9 | 
10 | 
class ControlMessageTask(BaseModel):
    # Pydantic model for one task entry: a type label, an identifier and a
    # free-form properties mapping.
    # Keys other than the declared fields are rejected at validation time.
    model_config = ConfigDict(extra="forbid")

    type: str
    # Either a plain string or a UUID instance is accepted.
    id: Union[str, UUID]
    # Arbitrary key/value payload; every instance gets its own empty dict by default.
    properties: Dict[str, Any] = Field(default_factory=dict)
17 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/primitives/nim/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 
5 | from .nim_client import NimClient
6 | from .nim_model_interface import ModelInterface
7 | 
8 | __all__ = ["NimClient", "ModelInterface"]
9 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/primitives/nim/default_values.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | # Copyright (c) 2024, NVIDIA CORPORATION.
 6 | 
 7 | 
# Default YOLOX settings, kept as plain module-level constants.
# NOTE(review): the per-constant notes below are inferred from the names only;
# confirm against the code that consumes them.
YOLOX_MAX_BATCH_SIZE = 8
YOLOX_MAX_WIDTH = 1536  # presumably pixels — TODO confirm
YOLOX_MAX_HEIGHT = 1536  # presumably pixels — TODO confirm
YOLOX_NUM_CLASSES = 3
YOLOX_CONF_THRESHOLD = 0.01
YOLOX_IOU_THRESHOLD = 0.5
YOLOX_MIN_SCORE = 0.1
YOLOX_FINAL_SCORE = 0.48
16 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/primitives/nim/model_interface/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/primitives/nim/model_interface/decorators.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | import logging
 6 | from functools import wraps
 7 | from multiprocessing import Lock
 8 | from multiprocessing import Manager
 9 | 
10 | logger = logging.getLogger(__name__)
11 | 
# Module-level state shared by every function decorated with ``multiprocessing_cache``.
# These are multiprocessing primitives, so the sharing is meant to work across
# processes, not only across threads.
# NOTE: the manager is created as a side effect of importing this module.
manager = Manager()
global_cache = manager.dict()  # single dict proxy used by all decorated functions
lock = Lock()  # held while the cache and the call counters are read or written
17 | 
def multiprocessing_cache(max_calls):
    """
    A decorator that caches results in the module-level ``global_cache``, which is
    shared by every decorated function.

    Each decorated function keeps its own call counter. When that counter exceeds
    `max_calls`, the whole shared cache is cleared (entries of other decorated
    functions included) and the counter is reset to zero.

    Args:
        max_calls (int): The number of calls after which the cache is cleared.

    Returns:
        function: The decorated function with global cache and invalidation logic.
    """

    def decorator(func):
        call_count = manager.Value("i", 0)  # Shared integer for call counting

        # Identify the function by module and qualified name. The cache is shared
        # by all decorated functions, so the bare ``__name__`` would let two
        # same-named functions (different modules or classes) return each other's
        # cached results for equal arguments.
        func_id = f"{func.__module__}.{getattr(func, '__qualname__', func.__name__)}"

        @wraps(func)
        def wrapper(*args, **kwargs):
            # frozenset makes the keyword part hashable and order-independent
            key = (func_id, args, frozenset(kwargs.items()))

            with lock:
                call_count.value += 1

                if call_count.value > max_calls:
                    global_cache.clear()
                    call_count.value = 0

                if key in global_cache:
                    return global_cache[key]

            # The lock is released while the wrapped function runs, so concurrent
            # callers that miss the cache may each compute the same value.
            result = func(*args, **kwargs)

            with lock:
                global_cache[key] = result

            return result

        return wrapper

    return decorator
57 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/primitives/tracing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/api/src/nv_ingest_api/internal/primitives/tracing/__init__.py


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/schemas/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/schemas/extract/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/schemas/message_brokers/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | 
 6 | from typing import Optional, Literal
 7 | 
 8 | from pydantic import Field, BaseModel
 9 | from typing_extensions import Annotated
10 | 
11 | 
class MessageBrokerClientSchema(BaseModel):
    # Connection settings for a message broker client.
    host: str = "redis"
    # Must be strictly between 0 and 65536.
    port: Annotated[int, Field(gt=0, lt=65536)] = 6379

    # Update this for new broker types
    client_type: Literal["redis", "simple"] = "redis"  # Restrict to 'redis' or 'simple'

    # Extra broker settings; every instance gets its own empty dict by default.
    broker_params: Optional[dict] = Field(default_factory=dict)

    # Each of the following accepts None or a non-negative integer.
    connection_timeout: Optional[Annotated[int, Field(ge=0)]] = 300
    max_backoff: Optional[Annotated[int, Field(ge=0)]] = 300
    max_retries: Optional[Annotated[int, Field(ge=0)]] = 0
24 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/schemas/message_brokers/request_schema.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | 
 6 | import logging
 7 | from typing import Optional
 8 | 
 9 | from pydantic import ConfigDict, BaseModel
10 | from pydantic import Field
11 | 
12 | logger = logging.getLogger(__name__)
13 | 
14 | 
15 | # Define schemas for request validation
class PushRequestSchema(BaseModel):
    # Payload of a push request; keys other than the declared fields are rejected.
    model_config = ConfigDict(extra="forbid")

    command: str
    # Queue name and message must both be non-empty strings.
    queue_name: str = Field(..., min_length=1)
    message: str = Field(..., min_length=1)
    # Optional timeout for blocking push
    timeout: Optional[float] = 100
22 | 
23 | 
class PopRequestSchema(BaseModel):
    # Payload of a pop request; keys other than the declared fields are rejected.
    model_config = ConfigDict(extra="forbid")

    command: str
    # The queue name must be a non-empty string.
    queue_name: str = Field(..., min_length=1)
    # Optional timeout for blocking pop
    timeout: Optional[float] = 100
29 | 
30 | 
class SizeRequestSchema(BaseModel):
    # Payload of a size request; keys other than the declared fields are rejected.
    model_config = ConfigDict(extra="forbid")

    command: str
    # The queue name must be a non-empty string.
    queue_name: str = Field(..., min_length=1)
35 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/schemas/message_brokers/response_schema.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | # NOTE: This code is duplicated from the ingest service:
 6 | # src/nv_ingest_client/schemas/response_schema.py
 7 | # Eventually we should move all client wrappers for the message broker into a shared library that both the ingest
 8 | # service and the client can use.
 9 | 
10 | from typing import Optional, Union
11 | from pydantic import BaseModel
12 | 
13 | 
class ResponseSchema(BaseModel):
    # Response envelope: only ``response_code`` is required, every other field has a default.
    response_code: int
    response_reason: Optional[str] = "OK"
    # Body may be a string, a dict, or absent.
    response: Union[str, dict, None] = None
    trace_id: Optional[str] = None  # Unique trace ID
    transaction_id: Optional[str] = None  # Unique transaction ID
20 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/schemas/meta/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/schemas/meta/base_model_noext.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | 
 6 | from pydantic import ConfigDict, BaseModel
 7 | 
 8 | 
 9 | # Define a base class with extra fields forbidden
class BaseModelNoExt(BaseModel):
    # Shared base for models that must reject any key not declared as a field.
    model_config = ConfigDict(extra="forbid")
12 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/schemas/mutate/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | 
 6 | import logging
 7 | 
 8 | from pydantic import ConfigDict, BaseModel
 9 | from pydantic import StrictBool
10 | 
11 | logger = logging.getLogger(__name__)
12 | 
13 | 
class ImageDedupSchema(BaseModel):
    # Settings for image deduplication; keys other than the declared fields are rejected.
    model_config = ConfigDict(extra="forbid")

    # StrictBool: only real booleans pass validation.
    raise_on_failure: StrictBool = False
17 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/schemas/store/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/schemas/store/store_embedding_schema.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | # Copyright (c) 2022-2024, NVIDIA CORPORATION.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | 
19 | import logging
20 | 
21 | from pydantic import ConfigDict, BaseModel
22 | 
23 | logger = logging.getLogger(__name__)
24 | 
25 | 
class EmbeddingStorageSchema(BaseModel):
    # Pydantic model holding the options for the embedding storage step
    # (role inferred from the class name and module path).

    # Plain `bool`: pydantic's default (lax) mode also coerces values such as 1 or "true".
    # NOTE(review): presumably selects raising on error versus recording the failure - confirm in the consumer.
    raise_on_failure: bool = False
    # extra="forbid": unknown keys in the input cause a validation error instead of being ignored.
    model_config = ConfigDict(extra="forbid")
29 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/schemas/store/store_image_schema.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | # Copyright (c) 2022-2024, NVIDIA CORPORATION.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | 
19 | import logging
20 | 
21 | from pydantic import ConfigDict, BaseModel
22 | 
23 | logger = logging.getLogger(__name__)
24 | 
25 | 
class ImageStorageModuleSchema(BaseModel):
    # Pydantic model holding the options for the image storage step
    # (role inferred from the class name and module path).

    # NOTE(review): `structured` and `images` look like per-content-type on/off switches
    # (both enabled by default) - confirm against the code that reads them.
    structured: bool = True
    images: bool = True
    # NOTE(review): presumably selects raising on error versus recording the failure - confirm in the consumer.
    raise_on_failure: bool = False
    # extra="forbid": unknown keys in the input cause a validation error instead of being ignored.
    model_config = ConfigDict(extra="forbid")
31 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/schemas/transform/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | 
 6 | from pydantic import ConfigDict, BaseModel
 7 | 
 8 | 
class ImageCaptionExtractionSchema(BaseModel):
    # Pydantic model holding the options for image caption extraction
    # (role inferred from the class name and module path).

    # The default is the literal placeholder string "api_key", not a usable credential.
    api_key: str = "api_key"
    # Default chat-completions URL; it names the same model as `model_name` below.
    endpoint_url: str = "https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-11b-vision-instruct/chat/completions"
    # Default instruction text. NOTE(review): presumably sent together with each image - confirm in the caller.
    prompt: str = "Caption the content of this image:"
    model_name: str = "meta/llama-3.2-11b-vision-instruct"
    # NOTE(review): presumably selects raising on error versus recording the failure - confirm in the consumer.
    raise_on_failure: bool = False
    # extra="forbid": unknown keys in the input cause a validation error instead of being ignored.
    model_config = ConfigDict(extra="forbid")
16 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | 
 6 | import logging
 7 | 
 8 | from pydantic import ConfigDict, BaseModel
 9 | from pydantic import StrictBool
10 | 
11 | logger = logging.getLogger(__name__)
12 | 
13 | 
class ImageFilterSchema(BaseModel):
    # Pydantic model holding the options for the image filter step
    # (role inferred from the class name and module path).

    # Both flags are StrictBool: only genuine booleans validate; 1 or "true" are rejected, not coerced.
    # NOTE(review): presumably selects raising on error versus recording the failure - confirm in the consumer.
    raise_on_failure: StrictBool = False
    # NOTE(review): looks like a switch forcing a CPU code path - confirm against the filter implementation.
    cpu_only: StrictBool = False
    # extra="forbid": unknown keys in the input cause a validation error instead of being ignored.
    model_config = ConfigDict(extra="forbid")
18 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | 
 6 | import logging
 7 | 
 8 | from pydantic import ConfigDict, BaseModel
 9 | 
10 | from nv_ingest_api.util.logging.configuration import LogLevel
11 | 
12 | logger = logging.getLogger(__name__)
13 | 
14 | 
class TextEmbeddingSchema(BaseModel):
    # Pydantic model holding the options for the text embedding step
    # (role inferred from the class name and module path).

    # The default is the literal placeholder string "api_key", not a usable credential.
    api_key: str = "api_key"
    # NOTE(review): presumably the number of inputs sent per embedding request - confirm in the caller.
    batch_size: int = 4
    embedding_model: str = "nvidia/nv-embedqa-e5-v5"
    # Default endpoint points at a host named "embedding" on port 8000.
    embedding_nim_endpoint: str = "http://embedding:8000/v1"
    encoding_format: str = "float"
    # Validated against the LogLevel enum, so only its member values are accepted.
    # NOTE(review): presumably applied to the httpx logger - confirm in the caller.
    httpx_log_level: LogLevel = LogLevel.WARNING
    input_type: str = "passage"
    # NOTE(review): presumably selects raising on error versus recording the failure - confirm in the consumer.
    raise_on_failure: bool = False
    # NOTE(review): free-form string here; the set of values the service accepts is not visible - confirm.
    truncate: str = "END"
    # extra="forbid": unknown keys in the input cause a validation error instead of being ignored.
    model_config = ConfigDict(extra="forbid")
26 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | from pydantic import Field, BaseModel, field_validator
 6 | 
 7 | from typing import Optional
 8 | 
 9 | from typing_extensions import Annotated
10 | 
11 | 
class TextSplitterSchema(BaseModel):
    # Pydantic model holding the options for the text splitter step
    # (role inferred from the class name and module path).

    # NOTE(review): presumably a tokenizer identifier; None leaves the choice to the consumer - confirm.
    tokenizer: Optional[str] = None
    # Must be strictly positive.
    chunk_size: Annotated[int, Field(gt=0)] = 1024
    # Must be non-negative and strictly smaller than chunk_size (see check_chunk_overlap).
    # validate_default=True makes that cross-field check run even when the caller relies on the
    # default overlap; without it, e.g. chunk_size=100 with the default overlap of 150 was accepted.
    chunk_overlap: Annotated[int, Field(ge=0, validate_default=True)] = 150
    # NOTE(review): presumably selects raising on error versus recording the failure - confirm in the consumer.
    raise_on_failure: bool = False

    @field_validator("chunk_overlap")
    @classmethod
    def check_chunk_overlap(cls, v, values, **kwargs):
        """
        Reject an overlap that is not strictly smaller than the chunk size.

        Parameters
        ----------
        v : int
            The candidate ``chunk_overlap`` value.
        values : pydantic.ValidationInfo
            Validation context; ``values.data`` holds the fields validated so far.

        Returns
        -------
        int
            ``v`` unchanged when it is acceptable.

        Raises
        ------
        ValueError
            If ``v`` is greater than or equal to the validated ``chunk_size``.
        """
        # chunk_size is declared before chunk_overlap, so it is already in values.data
        # unless its own validation failed, in which case this check is skipped.
        if v is not None and "chunk_size" in values.data and v >= values.data["chunk_size"]:
            raise ValueError("chunk_overlap must be less than chunk_size")
        return v
23 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/store/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/internal/transform/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/util/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/api/src/nv_ingest_api/util/__init__.py


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/util/control_message/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/api/src/nv_ingest_api/util/control_message/__init__.py


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/util/control_message/validators.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage
 6 | 
 7 | 
 8 | def cm_ensure_payload_not_null(control_message: IngestControlMessage):
 9 |     """
10 |     Ensures that the payload of a IngestControlMessage is not None.
11 | 
12 |     Parameters
13 |     ----------
14 |     control_message : IngestControlMessage
15 |         The IngestControlMessage to check.
16 | 
17 |     Raises
18 |     ------
19 |     ValueError
20 |         If the payload is None.
21 |     """
22 | 
23 |     if control_message.payload() is None:
24 |         raise ValueError("Payload cannot be None")
25 | 
26 | 
def cm_set_failure(control_message: IngestControlMessage, reason: str) -> IngestControlMessage:
    """
    Mark an IngestControlMessage as failed.

    Two metadata entries are written, in this order: ``cm_failed`` (set to True)
    and ``cm_failed_reason`` (set to ``reason``).

    Parameters
    ----------
    control_message : IngestControlMessage
        The message to annotate; it is modified in place.
    reason : str
        Description of why the message failed.

    Returns
    -------
    IngestControlMessage
        The same ``control_message`` object that was passed in.
    """

    failure_metadata = (
        ("cm_failed", True),
        ("cm_failed_reason", reason),
    )
    for key, value in failure_metadata:
        control_message.set_metadata(key, value)

    return control_message
48 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/util/converters/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/api/src/nv_ingest_api/util/converters/__init__.py


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/util/converters/bytetools.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | 
 6 | import base64
 7 | 
 8 | 
 9 | def bytesfromhex(hex_input):
10 |     """
11 |     Function to convert hex to bytes.
12 | 
13 |     Parameters
14 |     ----------
15 |     hex_input : hex
16 |         Hex string to store bytes in cuDF.
17 | 
18 |     Returns
19 |     -------
20 |     bytes
21 |         Hex encoded object converted to bytes.
22 |     """
23 | 
24 |     return bytes.fromhex(hex_input)
25 | 
26 | 
def hexfrombytes(bytes_input):
    """
    Encode raw bytes as a hexadecimal string.

    Parameters
    ----------
    bytes_input : bytes
        Raw bytes of the object.

    Returns
    -------
    str
        Lowercase hexadecimal representation, two digits per byte.
    """

    hex_text = bytes_input.hex()
    return hex_text
43 | 
44 | 
def bytesfrombase64(base64_input):
    """
    Decode base64-encoded data into raw bytes.

    Parameters
    ----------
    base64_input : str or bytes
        Base64-encoded data, as accepted by ``base64.b64decode``.

    Returns
    -------
    bytes
        The decoded bytes.
    """

    decoded = base64.b64decode(base64_input)
    return decoded
61 | 
62 | 
def base64frombytes(bytes_input, encoding="utf-8"):
    """
    Encode raw bytes as a base64 string.

    Parameters
    ----------
    bytes_input : bytes
        Raw bytes of the object.
    encoding : str, optional
        Codec used to turn the base64 bytes into ``str`` (default: "utf-8").

    Returns
    -------
    str
        Base64 text representing ``bytes_input``.
    """

    encoded = base64.b64encode(bytes_input)
    return encoded.decode(encoding)
79 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/util/converters/type_mappings.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | from nv_ingest_api.internal.schemas.meta.ingest_job_schema import DocumentTypeEnum
 5 | from nv_ingest_api.internal.enums.common import ContentTypeEnum
 6 | 
# Lookup table from a document type to the content category used for it.
# Image formats map to IMAGE, audio formats to AUDIO, HTML/TXT to TEXT, and
# DOCX/PDF/PPTX to STRUCTURED.
# NOTE(review): whether every DocumentTypeEnum member has an entry cannot be
# checked from here; an unmapped member raises KeyError in doc_type_to_content_type.
DOC_TO_CONTENT_MAP = {
    DocumentTypeEnum.BMP: ContentTypeEnum.IMAGE,
    DocumentTypeEnum.DOCX: ContentTypeEnum.STRUCTURED,
    DocumentTypeEnum.HTML: ContentTypeEnum.TEXT,
    DocumentTypeEnum.JPEG: ContentTypeEnum.IMAGE,
    DocumentTypeEnum.MP3: ContentTypeEnum.AUDIO,
    DocumentTypeEnum.PDF: ContentTypeEnum.STRUCTURED,
    DocumentTypeEnum.PNG: ContentTypeEnum.IMAGE,
    DocumentTypeEnum.PPTX: ContentTypeEnum.STRUCTURED,
    DocumentTypeEnum.SVG: ContentTypeEnum.IMAGE,
    DocumentTypeEnum.TIFF: ContentTypeEnum.IMAGE,
    DocumentTypeEnum.TXT: ContentTypeEnum.TEXT,
    DocumentTypeEnum.WAV: ContentTypeEnum.AUDIO,
}
21 | 
22 | 
def doc_type_to_content_type(doc_type: DocumentTypeEnum) -> ContentTypeEnum:
    """
    Look up the content type associated with a document type.

    Parameters
    ----------
    doc_type : DocumentTypeEnum
        The document type to translate.

    Returns
    -------
    ContentTypeEnum
        The entry stored for ``doc_type`` in ``DOC_TO_CONTENT_MAP``.

    Raises
    ------
    KeyError
        If ``doc_type`` has no entry in ``DOC_TO_CONTENT_MAP``.
    """
    content_type = DOC_TO_CONTENT_MAP[doc_type]
    return content_type
28 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/util/detectors/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 
5 | # Copyright (c) 2024, NVIDIA CORPORATION.
6 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/util/detectors/language.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | 
 6 | import langdetect
 7 | 
 8 | from nv_ingest_api.internal.enums.common import LanguageEnum
 9 | from nv_ingest_api.util.exception_handlers.detectors import langdetect_exception_handler
10 | 
11 | 
@langdetect_exception_handler
def detect_language(text):
    """
    Identify the language a piece of text is written in.

    Parameters
    ----------
    text : str
        The text to analyse.

    Returns
    -------
    LanguageEnum
        The member matching the detected language code, or
        ``LanguageEnum.UNKNOWN`` when the code is not a known value or
        langdetect raises ``LangDetectException``.
    """

    try:
        detected_code = langdetect.detect(text)
        # Member names are the upper-cased code with "-" replaced by "_".
        member_name = detected_code.upper().replace("-", "_")
        return LanguageEnum[member_name] if LanguageEnum.has_value(detected_code) else LanguageEnum.UNKNOWN
    except langdetect.lang_detect_exception.LangDetectException:
        return LanguageEnum.UNKNOWN
39 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/util/exception_handlers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/api/src/nv_ingest_api/util/exception_handlers/__init__.py


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/util/image_processing/__init__.py:
--------------------------------------------------------------------------------
1 | from .transforms import scale_image_to_encoding_size
2 | 
3 | __all__ = [
4 |     "scale_image_to_encoding_size",
5 | ]
6 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/util/logging/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/api/src/nv_ingest_api/util/logging/__init__.py


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/util/logging/configuration.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | 
 6 | import logging
 7 | import sys
 8 | from enum import Enum
 9 | 
10 | 
class LogLevel(str, Enum):
    # String-valued enumeration of log level names. Each value equals its member name
    # and matches the name of a level constant in the standard `logging` module.
    # Because the enum mixes in `str`, a member compares equal to its plain-string value.
    DEBUG = "DEBUG"
    INFO = "INFO"
    WARNING = "WARNING"
    ERROR = "ERROR"
    CRITICAL = "CRITICAL"
17 | 
18 | 
def configure_logging(logger, level_name):
    """
    Configure root logging to write to stdout and set the level of ``logger``.

    Parameters
    ----------
    logger : logging.Logger
        The logger whose level is set.
    level_name : str
        The name of the logging level (e.g., "DEBUG", "INFO"). It must be the
        name of an integer level constant in the ``logging`` module.

    Raises
    ------
    ValueError
        If ``level_name`` does not name a logging level.

    Notes
    -----
    ``logging.basicConfig`` does nothing when the root logger already has
    handlers, so only the first effective call installs the stdout handler.
    """

    numeric_level = getattr(logging, level_name, None)
    # bool is a subclass of int; without this guard module flags such as
    # logging.raiseExceptions would be accepted as a level.
    if isinstance(numeric_level, bool) or not isinstance(numeric_level, int):
        raise ValueError(f"Invalid log level: {level_name}")

    # The original created a StreamHandler(sys.stdout) and discarded it, so
    # basicConfig fell back to stderr. Pass the stream explicitly instead.
    logging.basicConfig(
        level=numeric_level,
        format="%(asctime)s - %(levelname)s - %(message)s",
        stream=sys.stdout,
    )
    logger.setLevel(numeric_level)
32 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/util/message_brokers/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | from .broker import SimpleMessageBroker
 6 | from .broker import ResponseSchema
 7 | from .simple_client import SimpleClient
 8 | 
 9 | __all__ = ["SimpleMessageBroker", "SimpleClient", "ResponseSchema"]
10 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/util/metadata/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 
5 | # Copyright (c) 2024, NVIDIA CORPORATION.
6 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/util/multi_processing/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 
5 | 
6 | from .mp_pool_singleton import ProcessWorkerPoolSingleton
7 | 
8 | __all__ = ["ProcessWorkerPoolSingleton"]
9 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/util/nim/__init__.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | from typing import Tuple, Optional
 6 | 
 7 | from nv_ingest_api.internal.primitives.nim.nim_client import NimClient
 8 | from nv_ingest_api.internal.primitives.nim.nim_model_interface import ModelInterface
 9 | 
10 | __all__ = ["create_inference_client"]
11 | 
12 | 
def create_inference_client(
    endpoints: Tuple[str, str],
    model_interface: ModelInterface,
    auth_token: Optional[str] = None,
    infer_protocol: Optional[str] = None,
    timeout: float = 120.0,
    max_retries: int = 5,
) -> NimClient:
    """
    Create a NimClient for interfacing with a model inference server.

    Parameters
    ----------
    endpoints : tuple
        A tuple containing the gRPC and HTTP endpoints, in that order.
    model_interface : ModelInterface
        The model interface implementation to use.
    auth_token : str, optional
        Authorization token for HTTP requests (default: None).
    infer_protocol : str, optional
        The protocol to use ("grpc" or "http"). If not specified, it is inferred from the
        endpoints: gRPC when the gRPC endpoint is non-blank, otherwise HTTP when the HTTP
        endpoint is non-blank.
    timeout : float, optional
        Passed through to NimClient unchanged (default: 120.0).
    max_retries : int, optional
        Passed through to NimClient unchanged (default: 5).

    Returns
    -------
    NimClient
        The initialized NimClient.

    Raises
    ------
    ValueError
        If an invalid infer_protocol is specified, or if none is specified and neither
        endpoint is usable.
    """

    grpc_endpoint, http_endpoint = endpoints

    if infer_protocol is None:
        # Treat whitespace-only endpoints as missing for BOTH protocols; previously only
        # the gRPC endpoint was stripped, so a blank HTTP endpoint selected "http".
        if grpc_endpoint and grpc_endpoint.strip():
            infer_protocol = "grpc"
        elif http_endpoint and http_endpoint.strip():
            infer_protocol = "http"

    if infer_protocol not in ("grpc", "http"):
        raise ValueError("Invalid infer_protocol specified. Must be 'grpc' or 'http'.")

    return NimClient(model_interface, infer_protocol, endpoints, auth_token, timeout, max_retries)
57 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/util/pdf/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/util/schema/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/api/src/nv_ingest_api/util/schema/__init__.py


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/util/schema/schema_validator.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | from nv_ingest_api.util.exception_handlers.schemas import schema_exception_handler
 6 | 
 7 | 
 8 | @schema_exception_handler
 9 | def validate_schema(metadata, Schema):
10 |     return Schema(**metadata)
11 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/util/service_clients/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/util/service_clients/kafka/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/util/service_clients/redis/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/api/src/nv_ingest_api/util/service_clients/redis/__init__.py


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/util/service_clients/rest/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/api/src/nv_ingest_api/util/service_clients/rest/__init__.py


--------------------------------------------------------------------------------
/api/src/nv_ingest_api/util/string_processing/__init__.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | import logging
 6 | import re
 7 | 
 8 | logger = logging.getLogger(__name__)
 9 | 
# Module-level defaults; nothing in this module reads them.
# NOTE(review): the names suggest generation parameters (token limit, temperature, top-p)
# for a "deplot" model - confirm where they are consumed.
DEPLOT_MAX_TOKENS = 128
DEPLOT_TEMPERATURE = 1.0
DEPLOT_TOP_P = 1.0
13 | 
14 | 
def remove_url_endpoints(url) -> str:
    """Drop the API path from a URL that includes a full endpoint.

    Some configurations provide the full endpoint in the URL, e.g.
    http://deplot:8000/v1/chat/completions. Everything from the first
    occurrence of "/v1" onwards is removed so that another path (such as a
    health endpoint) can be appended to what remains. A URL without "/v1"
    is returned unchanged.

    Args:
        url str: Incoming URL

    Returns:
        str: The part of the URL before the first "/v1" (any scheme prefix is kept)
    """
    base, _separator, _endpoint_path = url.partition("/v1")
    return base
32 | 
33 | 
def generate_url(url) -> str:
    """Make sure an endpoint carries an HTTP(S) scheme.

    A value that already starts with `http://` or `https://` is returned as
    provided by the user. Anything else is assumed to be a plain HTTP
    endpoint and gets `http://` prepended.

    Args:
        url str: Endpoint where the Rest service is running

    Returns:
        str: The endpoint, starting with `http://` or `https://`
    """
    has_scheme = re.match(r"^https?://", url) is not None
    return url if has_scheme else f"http://{url}"
52 | 


--------------------------------------------------------------------------------
/api/src/version.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | 
 6 | import datetime
 7 | import os
 8 | import re
 9 | 
10 | 
def get_version():
    """
    Build the package version string from environment variables.

    Environment variables read:

    - ``NV_INGEST_RELEASE_TYPE``: "dev" (default) or "release".
    - ``NV_INGEST_VERSION``: base version in ``YYYY.M.D`` form; today's date when unset or empty.
    - ``NV_INGEST_REV``: integer revision; treated as 0 when unset or empty.

    Returns
    -------
    str
        ``<version>.dev<rev>`` for dev builds (``rev`` falls back to today's date as
        ``YYYYMMDD`` when it is 0), ``<version>.post<rev>`` for releases with a positive
        revision, and ``<version>`` for releases otherwise.

    Raises
    ------
    ValueError
        If the version does not have the expected form, the release type is unknown,
        the revision is not an integer, or a dev build is given a negative revision.
    """
    release_type = os.getenv("NV_INGEST_RELEASE_TYPE", "dev")
    version = os.getenv("NV_INGEST_VERSION")
    # An exported-but-empty variable is treated like an unset one, mirroring the
    # handling of NV_INGEST_VERSION below.
    rev = os.getenv("NV_INGEST_REV") or "0"

    # One timestamp so the date-based version and revision cannot straddle midnight.
    now = datetime.datetime.now()

    if not version:
        version = now.strftime("%Y.%m.%d")

    # Ensure the version is PEP 440 compatible. fullmatch is used because `$` with
    # re.match also accepts a trailing newline, which would leak into the result.
    if not re.fullmatch(r"\d{4}\.\d{1,2}\.\d{1,2}", version):
        raise ValueError(f"Version '{version}' is not PEP 440 compatible")

    if release_type not in ("dev", "release"):
        raise ValueError(f"Invalid release type: {release_type}")

    # Parse once and interpolate the parsed number, so surrounding whitespace or
    # leading zeros in the raw text never end up in the version string.
    rev_number = int(rev)

    if release_type == "dev":
        if rev_number < 0:
            raise ValueError(f"Revision '{rev}' must not be negative")
        # If rev is not specified and defaults to 0 create a more meaningful development
        # identifier that is PEP 440 compliant.
        dev_id = now.strftime("%Y%m%d") if rev_number == 0 else str(rev_number)
        return f"{version}.dev{dev_id}"

    return f"{version}.post{rev_number}" if rev_number > 0 else version
37 | 


--------------------------------------------------------------------------------
/ci/scripts/build_pip_packages.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Function to display usage
 4 | usage() {
 5 |     echo "Usage: $0 --type <dev|release> --lib <api|client|service>"
 6 |     exit 1
 7 | }
 8 | 
 9 | # Get the directory of the current script
# Absolute directory containing this script; package paths are resolved relative to it.
SCRIPT_DIR=$(dirname "$(realpath "$0")")

# Parse options: every recognised flag consumes the argument that follows it.
while (( $# > 0 )); do
    flag="$1"
    value="$2"
    case "$flag" in
        --type) TYPE="$value" ;;
        --lib)  LIBRARY="$value" ;;
        *)      usage ;;
    esac
    # Two single shifts (not `shift 2`) so a trailing flag without a value still ends the loop.
    shift
    shift
done

# Both options are mandatory
if [[ -z "$TYPE" || -z "$LIBRARY" ]]; then
    usage
fi

# Today's date is the base of the version string
DATE=$(date +'%Y.%m.%d')

# Derive the version from the build type
case "$TYPE" in
    dev)
        VERSION_SUFFIX="${DATE}-dev"
        ;;
    release)
        VERSION_SUFFIX="${DATE}"
        ;;
    *)
        echo "Invalid type: $TYPE"
        usage
        ;;
esac

# Export the version override for the selected library and build it from its own directory
case "$LIBRARY" in
    api|client)
        # api/ and client/ are both pyproject-based and share the same override variable
        NV_INGEST_VERSION_OVERRIDE="${VERSION_SUFFIX}"
        export NV_INGEST_VERSION_OVERRIDE
        SETUP_PATH="$SCRIPT_DIR/../../$LIBRARY/pyproject.toml"
        (cd "$(dirname "$SETUP_PATH")" && python -m build)
        ;;
    service)
        NV_INGEST_SERVICE_VERSION_OVERRIDE="${VERSION_SUFFIX}"
        export NV_INGEST_SERVICE_VERSION_OVERRIDE
        SETUP_PATH="$SCRIPT_DIR/../../setup.py"
        (cd "$(dirname "$SETUP_PATH")" && python setup.py sdist bdist_wheel)
        ;;
    *)
        echo "Invalid library: $LIBRARY"
        usage
        ;;
esac
60 | 


--------------------------------------------------------------------------------
/client/MANIFEST.in:
--------------------------------------------------------------------------------
exclude *.egg-info

include README.md
include LICENSE
# recursive-include needs at least one file pattern after the directory;
# without it the directive is invalid and adds no files.
recursive-include src/nv_ingest_client *
include src/version.py
global-exclude __pycache__
global-exclude *.pyc
9 | 


--------------------------------------------------------------------------------
/client/client_examples/docker/Dockerfile.client:
--------------------------------------------------------------------------------
 1 | FROM ubuntu:22.04
 2 | 
 3 | # Use tini init for container
 4 | ENV TINI_VERSION v0.19.0
 5 | ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /bin/tini
 6 | RUN chmod +x /bin/tini
 7 | 
 8 | # Include files to launch jupyter notebook
 9 | RUN mkdir -p /workspace/docker
10 | COPY docker/start-jupyter.sh /workspace/docker/start-jupyter.sh
11 | COPY docker/entrypoint.sh /workspace/docker/entrypoint.sh
12 | RUN chmod +x /workspace/docker/start-jupyter.sh
13 | RUN chmod +x /workspace/docker/entrypoint.sh
14 | 
15 | # Install some dependencies and useful utiliites
16 | RUN apt update && apt install -y python3-pip git tree \
17 |     && rm -rf /var/lib/apt/lists/*
18 | 
19 | # Install the nv-ingest client library
20 | RUN cd /workspace \
21 |     && git clone https://github.com/NVIDIA/nv-ingest.git \
22 |     && cd /workspace/nv-ingest/client \
23 |     && pip install .
24 | COPY examples /workspace/client_examples/examples
25 | 
26 | # Install jupyter lab
27 | RUN pip install jupyterlab
28 | 
29 | WORKDIR /workspace/client_examples
30 | 
31 | ENTRYPOINT [ "/bin/tini", "--", "/workspace/docker/entrypoint.sh" ]
32 | 


--------------------------------------------------------------------------------
/client/client_examples/docker/entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Start jupyter server
4 | /workspace/docker/start-jupyter.sh > /dev/null
5 | echo "There was a jupyter-lab instance started on port 8888, http://127.0.0.1:8888"
6 | 
7 | # Run whatever user wants
8 | exec "$@"
9 | 


--------------------------------------------------------------------------------
/client/client_examples/docker/start-jupyter.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # SECURITY: the empty token disables token authentication while the server listens on all
3 | # interfaces (0.0.0.0) -- only publish port 8888 to hosts you trust.
4 | nohup jupyter-lab --allow-root --ip=0.0.0.0 --port=8888 --no-browser --ServerApp.token='' > /dev/null 2>&1 &


--------------------------------------------------------------------------------
/client/client_tests/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/client/client_tests/cli/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/client/client_tests/cli/test_nv_ingest_cli.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 
5 | # TODO(Devin): Just for coverage at the moment
6 | import nv_ingest_client.nv_ingest_cli as nv_ingest_cli  # noqa: F401
7 | 


--------------------------------------------------------------------------------
/client/client_tests/cli/util/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/client/client_tests/client/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/client/client_tests/client/test_rest_client.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | from unittest.mock import MagicMock
 6 | 
 7 | import pytest
 8 | 
 9 | from nv_ingest_api.util.service_clients.rest.rest_client import RestClient
10 | 
11 | 
12 | class MockRestClient:
13 |     def __init__(self, host, port):
14 |         self.host = host
15 |         self.port = port
16 |         self.counter = 0
17 | 
18 |     def get_client(self):
19 |         return self
20 | 
21 | 
22 | @pytest.fixture
23 | def mock_rest_client_allocator():
24 |     return MagicMock(return_value=MockRestClient("localhost", 7670))
25 | 
26 | 
27 | @pytest.fixture
28 | def rest_client(mock_rest_client_allocator):
29 |     return RestClient(
30 |         host="localhost",
31 |         port=7670,
32 |         max_retries=0,
33 |         max_backoff=32,
34 |         http_allocator=mock_rest_client_allocator,
35 |     )
36 | 
37 | 
38 | # Test generate_url function
39 | def test_generate_url(rest_client):
40 |     assert rest_client._generate_url("localhost", 7670) == "http://localhost:7670"
41 |     assert rest_client._generate_url("http://localhost", 7670) == "http://localhost:7670"
42 |     assert rest_client._generate_url("https://localhost", 7670) == "https://localhost:7670"
43 | 
44 |     # A few more complicated and possible tricks
45 |     assert rest_client._generate_url("localhost-https-else", 7670) == "http://localhost-https-else:7670"
46 | 


--------------------------------------------------------------------------------
/client/client_tests/primitives/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/client/client_tests/primitives/jobs/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/client/client_tests/primitives/tasks/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/client/client_tests/primitives/tasks/test_store_embed.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | import pytest
 6 | from nv_ingest_client.primitives.tasks.store import StoreEmbedTask
 7 | 
 8 | # Initialization and Property Setting
 9 | 
10 | 
11 | def test_store_task_initialization():
12 |     task = StoreEmbedTask(
13 |         params={
14 |             "endpoint": "minio:9000",
15 |             "access_key": "foo",
16 |             "secret_key": "bar",
17 |         }
18 |     )
19 |     assert task._params["endpoint"] == "minio:9000"
20 |     assert task._params["access_key"] == "foo"
21 |     assert task._params["secret_key"] == "bar"
22 | 
23 | 
24 | # String Representation Tests
25 | 
26 | 
27 | def test_store_task_str_representation():
28 |     task = StoreEmbedTask(params={"endpoint": "minio:9000"})
29 |     expected_str = "Store Embed Task:\n" "  endpoint: minio:9000\n"
30 |     assert str(task) == expected_str
31 | 
32 | 
33 | # Dictionary Representation Tests
34 | 
35 | 
36 | @pytest.mark.parametrize(
37 |     "extra_param_1, extra_param_2",
38 |     [
39 |         ("foo", "bar"),
40 |     ],
41 | )
42 | def test_store_task_to_dict(
43 |     extra_param_1,
44 |     extra_param_2,
45 | ):
46 |     task = StoreEmbedTask(
47 |         params={
48 |             "extra_param_1": extra_param_1,
49 |             "extra_param_2": extra_param_2,
50 |         }
51 |     )
52 | 
53 |     expected_dict = {"type": "store_embedding", "task_properties": {"params": {}}}
54 | 
55 |     expected_dict["task_properties"]["params"]["extra_param_1"] = extra_param_1
56 |     expected_dict["task_properties"]["params"]["extra_param_2"] = extra_param_2
57 | 
58 |     assert task.to_dict() == expected_dict, "The to_dict method did not return the expected dictionary representation"
59 | 


--------------------------------------------------------------------------------
/client/client_tests/primitives/tasks/test_task_base.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | import pytest
 6 | from nv_ingest_client.primitives.tasks.task_base import Task
 7 | from nv_ingest_client.primitives.tasks.task_base import TaskType
 8 | from nv_ingest_client.primitives.tasks.task_base import is_valid_task_type
 9 | 
10 | # TaskType Enum Tests
11 | 
12 | 
13 | def test_task_type_enum_valid_values():
14 |     for task_type in TaskType:
15 |         assert isinstance(task_type, TaskType), f"{task_type} should be an instance of TaskType Enum"
16 | 
17 | 
18 | def test_task_type_enum_invalid_value():
19 |     invalid_task_type = "INVALID"
20 |     assert not is_valid_task_type(
21 |         invalid_task_type
22 |     ), f"'{invalid_task_type}' should not be recognized as a valid TaskType"
23 | 
24 | 
25 | # is_valid_task_type Function Tests
26 | 
27 | 
28 | @pytest.mark.parametrize("valid_task_type", [task_type.name for task_type in TaskType])
29 | def test_is_valid_task_type_with_valid_types(valid_task_type):
30 |     assert is_valid_task_type(valid_task_type), f"{valid_task_type} should be recognized as a valid TaskType"
31 | 
32 | 
33 | def test_is_valid_task_type_with_invalid_type():
34 |     invalid_task_type = "NON_EXISTENT_TASK"
35 |     assert not is_valid_task_type(
36 |         invalid_task_type
37 |     ), f"{invalid_task_type} should not be recognized as a valid TaskType"
38 | 
39 | 
40 | # Task Class Tests
41 | 
42 | 
43 | def test_task_str_method():
44 |     task = Task()
45 |     expected_str = f"{task.__class__.__name__}\n"
46 |     assert str(task) == expected_str, "The __str__ method of Task does not return the expected string format"
47 | 
48 | 
49 | def test_task_to_dict_method():
50 |     task = Task()
51 |     expected_dict = {}
52 |     assert task.to_dict() == expected_dict, (
53 |         "The to_dict method of Task should return an empty dictionary for a " "generic task"
54 |     )
55 | 


--------------------------------------------------------------------------------
/client/client_tests/util/file_processing/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/client/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["setuptools", "wheel"]
 3 | build-backend = "setuptools.build_meta"
 4 | 
 5 | [project]
 6 | name = "nv-ingest-client"
 7 | description = "Python client for the nv-ingest service"
 8 | dynamic = ["version"]
 9 | readme = "README.md"
10 | authors = [
11 |     {name = "Jeremy Dyer", email = "jdyer@nvidia.com"}
12 | ]
13 | license = {file = "LICENSE"}
14 | requires-python = ">=3.10"
15 | classifiers = [
16 |     "Programming Language :: Python :: 3.10",
17 |     "Operating System :: OS Independent",
18 | ]
19 | dependencies = [
20 |     "azure-storage-blob==12.24.0",
21 |     "build>=1.2.2",
22 |     "charset-normalizer>=3.4.1",
23 |     "click>=8.1.8",
24 |     "fsspec>=2025.2.0",
25 |     "httpx==0.27.2",
26 |     "langchain-milvus==0.1.7",
27 |     "langchain-nvidia-ai-endpoints>=0.3.7",
28 |     "llama-index-embeddings-nvidia==0.1.5",
29 |     "minio>=7.2.15",
30 |     "nv-ingest-api==25.4.2",
31 |     "openai~=1.68.1",
32 |     "pyarrow>=19.0.0",
33 |     "pydantic>2.0.0",
34 |     "pydantic-settings>2.0.0",
35 |     "pymilvus==2.5.4",
36 |     "pymilvus[bulk_writer,model]",
37 |     "pypdfium2>=4.30.1",
38 |     "python-docx>=1.1.2",
39 |     "python-magic>=0.4.27",
40 |     "python-pptx==0.6.23",
41 |     "redis~=5.2.1",
42 |     "requests>=2.28.2",
43 |     "setuptools>=58.2.0",
44 |     "tqdm>=4.67.1",
45 | ]
46 | 
47 | [project.urls]
48 | homepage = "https://github.com/NVIDIA/nv-ingest"
49 | repository = "https://github.com/NVIDIA/nv-ingest"
50 | documentation = "https://docs.nvidia.com/nv-ingest"
51 | 
52 | [project.scripts]
53 | nv-ingest-cli = "nv_ingest_client.nv_ingest_cli:main"
54 | process-json-files = "nv_ingest_client.util.process_json_files:main"
55 | 
56 | [tool.setuptools]
57 | py-modules = ["nv_ingest_client"]
58 | 
59 | [tool.setuptools.packages.find]
60 | where = ["src"]
61 | 
62 | [tool.setuptools.dynamic]
63 | version = {attr = "version.get_version"}
64 | 


--------------------------------------------------------------------------------
/client/src/nv_ingest_client/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/client/src/nv_ingest_client/cli/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/client/src/nv_ingest_client/cli/util/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/client/src/nv_ingest_client/cli/util/tasks.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/client/src/nv_ingest_client/client/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 
5 | from nv_ingest_client.client.client import NvIngestClient
6 | from nv_ingest_client.client.interface import Ingestor
7 | 
8 | __all__ = ["NvIngestClient", "Ingestor"]
9 | 


--------------------------------------------------------------------------------
/client/src/nv_ingest_client/primitives/__init__.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | from .jobs import BatchJobSpec
 6 | from .jobs import JobSpec
 7 | from .tasks import Task
 8 | 
 9 | __all__ = ["BatchJobSpec", "JobSpec", "Task"]
10 | 


--------------------------------------------------------------------------------
/client/src/nv_ingest_client/primitives/exceptions.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/client/src/nv_ingest_client/primitives/exceptions.py


--------------------------------------------------------------------------------
/client/src/nv_ingest_client/primitives/jobs/__init__.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | from nv_ingest_client.primitives.jobs.job_spec import BatchJobSpec
 6 | from nv_ingest_client.primitives.jobs.job_spec import JobSpec
 7 | from nv_ingest_client.primitives.jobs.job_state import JobState
 8 | from nv_ingest_client.primitives.jobs.job_state import JobStateEnum
 9 | 
10 | __all__ = ["BatchJobSpec", "JobSpec", "JobState", "JobStateEnum"]
11 | 


--------------------------------------------------------------------------------
/client/src/nv_ingest_client/primitives/tasks/__init__.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | from .audio_extraction import AudioExtractionTask
 6 | from .caption import CaptionTask
 7 | from .chart_extraction import ChartExtractionTask
 8 | from .dedup import DedupTask
 9 | from .embed import EmbedTask
10 | from .extract import ExtractTask
11 | from .filter import FilterTask
12 | from .infographic_extraction import InfographicExtractionTask
13 | from .split import SplitTask
14 | from .store import StoreTask
15 | from .store import StoreEmbedTask
16 | from .table_extraction import TableExtractionTask
17 | from .task_base import Task
18 | from .task_base import TaskType
19 | from .task_base import is_valid_task_type
20 | from .task_factory import task_factory
21 | 
22 | __all__ = [
23 |     "AudioExtractionTask",
24 |     "CaptionTask",
25 |     "ChartExtractionTask",
26 |     "ExtractTask",
27 |     "is_valid_task_type",
28 |     "InfographicExtractionTask",
29 |     "SplitTask",
30 |     "StoreEmbedTask",
31 |     "StoreTask",
32 |     "TableExtractionTask",
33 |     "Task",
34 |     "task_factory",
35 |     "TaskType",
36 |     "DedupTask",
37 |     "FilterTask",
38 |     "EmbedTask",
39 | ]
40 | 


--------------------------------------------------------------------------------
/client/src/nv_ingest_client/primitives/tasks/chart_extraction.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | 
 6 | # pylint: disable=too-few-public-methods
 7 | # pylint: disable=too-many-arguments
 8 | 
 9 | import logging
10 | from typing import Dict
11 | 
12 | from pydantic import BaseModel
13 | 
14 | from .task_base import Task
15 | 
16 | logger = logging.getLogger(__name__)
17 | 
18 | 
19 | class ChartExtractionSchema(BaseModel):
20 |     class Config:
21 |         extra = "forbid"
22 | 
23 | 
24 | class ChartExtractionTask(Task):
25 |     """
26 |     Object for chart extraction task
27 |     """
28 | 
29 |     def __init__(self) -> None:
30 |         """
31 |         Setup Dedup Task Config
32 |         """
33 |         super().__init__()
34 | 
35 |     def __str__(self) -> str:
36 |         """
37 |         Returns a string with the object's config and run time state
38 |         """
39 |         info = ""
40 |         info += "chart extraction task\n"
41 |         return info
42 | 
43 |     def to_dict(self) -> Dict:
44 |         """
45 |         Convert to a dict for submission to redis
46 |         """
47 | 
48 |         task_properties = {
49 |             "params": {},
50 |         }
51 | 
52 |         return {"type": "chart_data_extract", "task_properties": task_properties}
53 | 


--------------------------------------------------------------------------------
/client/src/nv_ingest_client/primitives/tasks/infographic_extraction.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | 
 6 | # pylint: disable=too-few-public-methods
 7 | # pylint: disable=too-many-arguments
 8 | 
 9 | import logging
10 | from typing import Dict
11 | 
12 | from pydantic import BaseModel
13 | 
14 | from .task_base import Task
15 | 
16 | logger = logging.getLogger(__name__)
17 | 
18 | 
19 | class InfographicExtractionSchema(BaseModel):
20 |     class Config:
21 |         extra = "forbid"
22 | 
23 | 
24 | class InfographicExtractionTask(Task):
25 |     """
26 |     Object for infographic extraction task
27 |     """
28 | 
29 |     def __init__(self) -> None:
30 |         """
31 |         Setup Dedup Task Config
32 |         """
33 |         super().__init__()
34 | 
35 |     def __str__(self) -> str:
36 |         """
37 |         Returns a string with the object's config and run time state
38 |         """
39 |         info = ""
40 |         info += "infographic extraction task\n"
41 |         return info
42 | 
43 |     def to_dict(self) -> Dict:
44 |         """
45 |         Convert to a dict for submission to redis
46 |         """
47 | 
48 |         task_properties = {
49 |             "params": {},
50 |         }
51 | 
52 |         return {"type": "infographic_data_extract", "task_properties": task_properties}
53 | 


--------------------------------------------------------------------------------
/client/src/nv_ingest_client/primitives/tasks/table_extraction.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | 
 6 | # pylint: disable=too-few-public-methods
 7 | # pylint: disable=too-many-arguments
 8 | 
 9 | import logging
10 | from typing import Dict
11 | 
12 | from pydantic import BaseModel
13 | 
14 | from .task_base import Task
15 | 
16 | logger = logging.getLogger(__name__)
17 | 
18 | 
19 | class TableExtractionSchema(BaseModel):
20 |     class Config:
21 |         extra = "forbid"
22 | 
23 | 
24 | class TableExtractionTask(Task):
25 |     """
26 |     Object for table extraction tasks
27 |     """
28 | 
29 |     def __init__(self) -> None:
30 |         """
31 |         Setup Dedup Task Config
32 |         """
33 |         super().__init__()
34 | 
35 |     def __str__(self) -> str:
36 |         """
37 |         Returns a string with the object's config and run time state
38 |         """
39 |         info = ""
40 |         info += "table extraction task\n"
41 |         return info
42 | 
43 |     def to_dict(self) -> Dict:
44 |         """
45 |         Convert to a dict for submission to redis
46 |         """
47 | 
48 |         task_properties = {
49 |             "params": {},
50 |         }
51 | 
52 |         return {"type": "table_data_extract", "task_properties": task_properties}
53 | 


--------------------------------------------------------------------------------
/client/src/nv_ingest_client/primitives/tasks/transform.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/client/src/nv_ingest_client/primitives/tasks/transform.py


--------------------------------------------------------------------------------
/client/src/nv_ingest_client/primitives/tasks/vdb_upload.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | 
 6 | # pylint: disable=too-few-public-methods
 7 | # pylint: disable=too-many-arguments
 8 | 
 9 | import logging
10 | from typing import Dict
11 | 
12 | from pydantic import BaseModel
13 | 
14 | from .task_base import Task
15 | 
16 | logger = logging.getLogger(__name__)
17 | 
18 | 
19 | class VdbUploadTaskSchema(BaseModel):
20 |     filter_errors: bool = False
21 |     bulk_ingest: bool = False
22 |     bulk_ingest_path: str = ""
23 |     params: dict = None
24 | 
25 |     class Config:
26 |         extra = "forbid"
27 | 
28 | 
29 | class VdbUploadTask(Task):
30 |     """
31 |     Object for document embedding task
32 |     """
33 | 
34 |     def __init__(
35 |         self,
36 |         filter_errors: bool = False,
37 |         bulk_ingest: bool = False,
38 |         bulk_ingest_path: str = "embeddings/",
39 |         params: dict = None,
40 |     ) -> None:
41 |         """
42 |         Setup VDB Upload Task Config
43 |         """
44 |         super().__init__()
45 |         self._filter_errors = filter_errors
46 |         self._bulk_ingest = bulk_ingest
47 |         self._bulk_ingest_path = bulk_ingest_path
48 |         self._params = params or {}
49 | 
50 |     def __str__(self) -> str:
51 |         """
52 |         Returns a string with the object's config and run time state
53 |         """
54 |         info = ""
55 |         info += "VDB Upload Task:\n"
56 |         info += f"  filter_errors: {self._filter_errors}\n"
57 |         return info
58 | 
59 |     def to_dict(self) -> Dict:
60 |         """
61 |         Convert to a dict for submission to redis
62 |         """
63 | 
64 |         task_properties = {
65 |             "filter_errors": self._filter_errors,
66 |             "bulk_ingest": self._bulk_ingest,
67 |             "bulk_ingest_path": self._bulk_ingest_path,
68 |             "params": self._params,
69 |         }
70 | 
71 |         return {"type": "vdb_upload", "task_properties": task_properties}
72 | 


--------------------------------------------------------------------------------
/client/src/nv_ingest_client/util/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/client/src/nv_ingest_client/util/__init__.py


--------------------------------------------------------------------------------
/client/src/nv_ingest_client/util/file_processing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/client/src/nv_ingest_client/util/file_processing/__init__.py


--------------------------------------------------------------------------------
/client/src/version.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | 
 6 | import datetime
 7 | import os
 8 | import re
 9 | 
10 | 
11 | def get_version():
12 |     release_type = os.getenv("NV_INGEST_RELEASE_TYPE", "dev")
13 |     version = os.getenv("NV_INGEST_VERSION")
14 |     rev = os.getenv("NV_INGEST_REV", "0")
15 | 
16 |     if not version:
17 |         version = f"{datetime.datetime.now().strftime('%Y.%m.%d')}"
18 | 
19 |     # We only check this for dev, we assume for release the user knows what they are doing
20 |     if release_type != "release":
21 |         # Ensure the version is PEP 440 compatible
22 |         pep440_regex = r"^\d{4}\.\d{1,2}\.\d{1,2}$"
23 |         if not re.match(pep440_regex, version):
24 |             raise ValueError(f"Version '{version}' is not PEP 440 compatible")
25 | 
26 |     # Construct the final version string
27 |     if release_type == "dev":
28 |         # If rev is not specified and defaults to 0 lets create a more meaningful development
29 |         # identifier that is pep440 compliant
30 |         if int(rev) == 0:
31 |             rev = datetime.datetime.now().strftime("%Y%m%d")
32 |         final_version = f"{version}.dev{rev}"
33 |     elif release_type == "release":
34 |         final_version = f"{version}.post{rev}" if int(rev) > 0 else version
35 |     else:
36 |         raise ValueError(f"Invalid release type: {release_type}")
37 | 
38 |     return final_version
39 | 


--------------------------------------------------------------------------------
/conda/environments/nv_ingest_api_environment.yml:
--------------------------------------------------------------------------------
 1 | name: nv_ingest_api
 2 | channels:
 3 |   - nvidia/label/dev
 4 |   - rapidsai
 5 |   - nvidia
 6 |   - conda-forge
 7 |   - pytorch
 8 | dependencies:
 9 |   - diskcache>=5.6.3
10 |   - pydantic>2.0.0
11 |   - pydantic-settings>2.0.0
12 |   - pytest>=8.0.2
13 |   - pytest-mock>=3.14.0
14 |   - pytest-cov>=6.0.0
15 |   - python>=3.10
16 |   - python-build>=1.2.2
17 |   - setuptools>=58.2.0
18 |   - pip
19 | 


--------------------------------------------------------------------------------
/conda/environments/nv_ingest_client_environment.yml:
--------------------------------------------------------------------------------
 1 | name: nv_ingest_client
 2 | channels:
 3 |   - nvidia/label/dev
 4 |   - rapidsai
 5 |   - nvidia
 6 |   - conda-forge
 7 | dependencies:
 8 |   - click>=8.1.7
 9 |   - diskcache>=5.6.3
10 |   - fsspec>=2024.10.0
11 |   - httpx>=0.28.1
12 |   - pydantic>=2.10.3
13 |   - pypdfium2>=4.30.0
14 |   - pytest>=8.0.2
15 |   - pytest-mock>=3.14.0
16 |   - pytest-cov>=6.0.0
17 |   - python>=3.10
18 |   - python-build>=1.2.2
19 |   - python-docx>=1.1.2
20 |   - python-pptx>=1.0.2
21 |   - requests>=2.28.2
22 |   - setuptools>=58.2.0
23 |   - tqdm>=4.67.1
24 |   - pip
25 | 


--------------------------------------------------------------------------------
/conda/environments/nv_ingest_environment.yml:
--------------------------------------------------------------------------------
 1 | name: nv_ingest_runtime
 2 | channels:
 3 |   - nvidia/label/dev
 4 |   - rapidsai
 5 |   - nvidia
 6 |   - conda-forge
 7 |   - pytorch
 8 | dependencies:
 9 |   - azure-core>=1.32.0
10 |   - click>=8.1.7
11 |   - diskcache>=5.6.3
12 |   - fastapi>=0.115.6
13 |   - fastparquet>=2024.11.0
14 |   - fsspec>=2024.10.0
15 |   - gunicorn
16 |   - h11>=0.16.0 # Must pin at or above 0.16.0 for CVE mitigation
17 |   - httpx>=0.28.1
18 |   - isodate>=0.7.2
19 |   - langdetect>=1.0.9
20 |   - minio>=7.2.12
21 |   - morpheus-core=25.02
22 |   - morpheus-llm=25.02
23 |   - openai>=1.57.1
24 |   - opentelemetry-api>=1.27.0
25 |   - opentelemetry-exporter-otlp>=1.27.0
26 |   - opentelemetry-sdk>=1.27.0
27 |   - pydantic>2.0.0
28 |   - pydantic-settings>2.0.0
29 |   - pypdfium2>=4.30.0
30 |   - pytest>=8.0.2
31 |   - pytest-mock>=3.14.0
32 |   - pytest-cov>=6.0.0
33 |   - python>=3.10
34 |   - python-build>=1.2.2
35 |   - python-docx>=1.1.2
36 |   - python-dotenv>=1.0.1
37 |   - python-pptx>=1.0.2
38 |   - pytorch
39 |   - redis-py>=5.2.1
40 |   - requests>=2.28.2
41 |   - scipy>=1.15.1
42 |   - setuptools>=58.2.0
43 |   - tabulate>=0.9.0
44 |   - torchvision
45 |   - torchaudio
46 |   - transformers>=4.47.0
47 |   - tqdm>=4.67.1
48 |   - uvicorn
49 |   - pip
50 |   - pip:
51 |       - llama-index-embeddings-nvidia
52 |       - opencv-python # For some reason conda can't solve our req set with py-opencv so we need to use pip
53 |       - pymilvus>=2.5.0
54 |       - pymilvus[bulk_writer, model]
55 |       - nvidia-riva-client>=2.18.0
56 |       - unstructured-client
57 | 


--------------------------------------------------------------------------------
/conda/packages/nv_ingest_api/meta.yaml:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | {% set py_version = environ['CONDA_PY'] %}
 6 | {% set GIT_SHA = environ['GIT_SHA'] %}
 7 | 
 8 | # Source root: taken from the GIT_ROOT environment variable, falling back to the default path ../../../api when it is not set
 9 | {% set git_root = environ.get('GIT_ROOT', '../../../api') %}
10 | 
11 | package:
12 |   name: nv_ingest_api
13 |   version: {{ environ.get('NV_INGEST_API_VERSION', 'Unknown') }}
14 | 
15 | source:
16 |   path: {{ git_root }}
17 | 
18 | build:
19 |   number: 0
20 |   string: py{{ py_version }}_{{ GIT_SHA }}
21 |   script:
22 |     - {{ PYTHON }} -m pip install . --no-deps -vv
23 | 
24 | requirements:
25 |   build:
26 |     - pip
27 |     - python=3.10
28 |     - setuptools>=58.2.0
29 |   run:
30 |     - azure-core>=1.32.0
31 |     - fastparquet>=2024.11.0
32 |     - fsspec>=2024.10.0
33 |     - httpx>=0.28.1
34 |     - isodate>=0.7.2
35 |     - langdetect>=1.0.9
36 |     - openai>=1.57.1
37 |     - pydantic>=2.0.0
38 |     - pypdfium2>=4.30.0
39 |     - pytest>=8.0.2
40 |     - pytest-mock>=3.14.0
41 |     - python>=3.10
42 |     - python-docx>=1.1.2
43 |     - python-dotenv>=1.0.1
44 |     - python-magic>=0.4.27
45 |     - python-pptx>=1.0.2
46 |     - pytorch
47 |     - requests>=2.28.2
48 |     - setuptools>=58.2.0
49 |     - tabulate>=0.9.0
50 |     - torchaudio
51 |     - torchvision
52 |     - transformers>=4.47.0
53 |     # - unstructured-client>=0.25.9
54 | 
55 |   test:
56 |     commands:
57 |       - pytest ./tests
58 | 
59 | about:
60 |   home: "https://github.com/NVIDIA/nv-ingest"
61 |   license: "Apache-2.0"
62 |   summary: "Python module with core document ingestion functions."
63 |   description: "Python module with core document ingestion functions."
64 | 
65 | extra:
66 |   recipe-maintainers:
67 |     - jdyer@nvidia.com
68 | 
69 | channels:
70 |   - rapidsai
71 |   - nvidia
72 |   - conda-forge
73 |   - pytorch
74 | 


--------------------------------------------------------------------------------
/conda/packages/nv_ingest_client/meta.yaml:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | {% set py_version = environ['CONDA_PY'] %}
 6 | {% set GIT_SHA = environ['GIT_SHA'] %}
 7 | 
 8 | # Package source path: use the GIT_ROOT environment variable when it is set, otherwise fall back to the relative path ../../../client
 9 | {% set git_root = environ.get('GIT_ROOT', '../../../client') %}
10 | 
11 | package:
12 |   name: nv_ingest_client
13 |   version: {{ environ.get('NV_INGEST_CLIENT_VERSION', 'Unknown') }}
14 | 
15 | source:
16 |   path: {{ git_root }}
17 | 
18 | build:
19 |   number: 0
20 |   string: py{{ py_version }}_{{ GIT_SHA }}
21 |   script:
22 |     - {{ PYTHON }} -m pip install . --no-deps -vv
23 | 
24 | requirements:
25 |   build:
26 |     - pip
27 |     - python=3.10
28 |     - setuptools>=58.2.0
29 |   run:
30 |     - click>=8.1.7
31 |     - fsspec>=2024.10.0
32 |     - httpx>=0.28.1
33 |     - pydantic>=2.0.0
34 |     - pypdfium2>=4.30.0
35 |     - python>=3.10
36 |     - python-docx>=1.1.2
37 |     - python-pptx>=1.0.2
38 |     - requests>=2.28.2
39 |     - setuptools>=58.2.0
40 |     - tqdm>=4.67.1
41 | 
42 |   test:
43 |     commands:
44 |       - pytest ./tests
45 | 
46 | about:
47 |   home: "https://github.com/NVIDIA/nv-ingest"
48 |   license: "Apache-2.0"
49 |   summary: "Python module supporting document ingestion."
50 |   description: "Python module supporting document ingestion."
51 | 
52 | extra:
53 |   recipe-maintainers:
54 |     - drobison@nvidia.com
55 | 
56 | channels:
57 |   - rapidsai
58 |   - nvidia
59 |   - conda-forge
60 | 


--------------------------------------------------------------------------------
/conda/scripts/helper_functions.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 3 | # All rights reserved.
 4 | # SPDX-License-Identifier: Apache-2.0
 5 | 
 6 | # Fail on errors and undefined variables
 7 | set -euo pipefail
 8 | 
 9 | validate_conda_build_environment() {
10 |     # Verify that conda, conda-build, and git are all available on PATH.
11 |     # For the first tool that is missing, print an error message to stderr
12 |     # and exit the shell with status 1. Produces no output on success.
13 | 
14 |     # Ensure conda is installed
15 |     if ! command -v conda &> /dev/null; then
16 |         echo "Error: conda not found in PATH. Please ensure Conda is installed and available." >&2
17 |         exit 1
18 |     fi
19 | 
20 |     # Ensure conda-build is installed
21 |     if ! command -v conda-build &> /dev/null; then
22 |         echo "Error: conda-build not found in PATH. Install it via: conda install conda-build" >&2
23 |         exit 1
24 |     fi
25 | 
26 |     # Ensure git is installed
27 |     if ! command -v git &> /dev/null; then
28 |         echo "Error: git not found in PATH. Please ensure Git is installed and available." >&2
29 |         exit 1
30 |     fi
31 | }
32 | 
33 | determine_git_root() {
34 |     # Print the top-level directory of the current Git work tree to stdout.
35 |     # When not inside a work tree, report the error on stderr (so callers that
36 |     # capture stdout never receive the message as a path) and exit 1.
37 | 
38 |     if git rev-parse --is-inside-work-tree &> /dev/null; then
39 |         git rev-parse --show-toplevel
40 |     else
41 |         echo "Error: Not inside a Git repository. Unable to determine the Git root." >&2
42 |         exit 1
43 |     fi
44 | }
45 | 


--------------------------------------------------------------------------------
/config/otel-collector-config.yaml:
--------------------------------------------------------------------------------
 1 | 
 2 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 3 | # All rights reserved.
 4 | # SPDX-License-Identifier: Apache-2.0
 5 | 
 6 | receivers:
 7 |   otlp:
 8 |     protocols:
 9 |       grpc:
10 |         endpoint: 0.0.0.0:4317
11 |       http:
12 |         endpoint: 0.0.0.0:4318
13 | 
14 | exporters:
15 |   # NOTE: Prior to v0.86.0 use `logging` instead of `debug`.
16 |   zipkin:
17 |     endpoint: "http://zipkin:9411/api/v2/spans"
18 |   logging:
19 |     verbosity: detailed
20 |   prometheus:
21 |     endpoint: "0.0.0.0:8889"
22 | 
23 | processors:
24 |   batch:
25 |   tail_sampling:
26 |     policies: [
27 |       {
28 |         name: filter_http_url,
29 |         type: string_attribute,
30 |         string_attribute: {
31 |           key: http.route,
32 |           values: [ "/health/ready" ],
33 |           enabled_regex_matching: true,
34 |           invert_match: true
35 |         }
36 |       }
37 |     ]
38 | 
39 | extensions:
40 |   health_check:
41 |   zpages:
42 | 
43 | service:
44 |   extensions: [zpages, health_check]
45 |   telemetry:
46 |     logs:
47 |       level: "debug"
48 |   pipelines:
49 |     traces:
50 |       receivers: [otlp]
51 |       processors: [batch, tail_sampling]
52 |       exporters: [zipkin, logging]
53 |     metrics:
54 |       receivers: [otlp]
55 |       processors: [batch]
56 |       exporters: [prometheus, logging]
57 |     logs:
58 |       receivers: [otlp]
59 |       processors: [batch]
60 |       exporters: [logging]
61 | 


--------------------------------------------------------------------------------
/config/prometheus.yaml:
--------------------------------------------------------------------------------
 1 | 
 2 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 3 | # All rights reserved.
 4 | # SPDX-License-Identifier: Apache-2.0
 5 | 
 6 | scrape_configs:
 7 |   - job_name: "otel-collector"
 8 |     scrape_interval: 5s
 9 |     static_configs:
10 |       - targets: ["otel-collector:8889"]
11 |       - targets: ["otel-collector:9988"]
12 | 


--------------------------------------------------------------------------------
/data/chart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/chart.png


--------------------------------------------------------------------------------
/data/embedded_table.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/embedded_table.pdf


--------------------------------------------------------------------------------
/data/functional_validation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/functional_validation.pdf


--------------------------------------------------------------------------------
/data/multimodal_test.bmp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/multimodal_test.bmp


--------------------------------------------------------------------------------
/data/multimodal_test.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/multimodal_test.docx


--------------------------------------------------------------------------------
/data/multimodal_test.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/multimodal_test.jpeg


--------------------------------------------------------------------------------
/data/multimodal_test.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/multimodal_test.pdf


--------------------------------------------------------------------------------
/data/multimodal_test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/multimodal_test.png


--------------------------------------------------------------------------------
/data/multimodal_test.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/multimodal_test.pptx


--------------------------------------------------------------------------------
/data/multimodal_test.tiff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/multimodal_test.tiff


--------------------------------------------------------------------------------
/data/multimodal_test.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/multimodal_test.wav


--------------------------------------------------------------------------------
/data/table.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/table.png


--------------------------------------------------------------------------------
/data/table_test.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/table_test.pdf


--------------------------------------------------------------------------------
/data/test-page-form.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/test-page-form.pdf


--------------------------------------------------------------------------------
/data/test-shapes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/test-shapes.pdf


--------------------------------------------------------------------------------
/data/test.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/test.pdf


--------------------------------------------------------------------------------
/data/woods_frost.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/woods_frost.docx


--------------------------------------------------------------------------------
/data/woods_frost.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/woods_frost.pdf


--------------------------------------------------------------------------------
/docker/scripts/entrypoint_devcontainer.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash --login
 2 | # SPDX-FileCopyrightText: Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | 
18 | #!/bin/bash
19 | 
20 | # Load conda's shell integration, then activate the `nv_ingest_runtime` conda environment
21 | . /opt/conda/etc/profile.d/conda.sh
22 | conda activate nv_ingest_runtime
23 | 
24 | # Source the optional hook script at SRC_FILE, but only if that file exists
25 | SRC_FILE="/opt/docker/bin/entrypoint_source"
26 | [ -f "${SRC_FILE}" ] && source "${SRC_FILE}"
27 | 
28 | # Dispatch on whether the user supplied a command
29 | if [ "$#" -gt 0 ]; then
30 |     # A command was provided: replace this shell with it
31 |     exec "$@"
32 | else
33 |     # No command was provided: run the default startup launch
34 |     if [ "${MESSAGE_CLIENT_TYPE}" != "simple" ]; then
35 |       # Start uvicorn in the background (with --reload) when MESSAGE_CLIENT_TYPE is not 'simple'
36 |       uvicorn nv_ingest.api.main:app --workers 1 --host 0.0.0.0 --port 7670 --reload --app-dir /workspace/src/nv_ingest &
37 |     fi
38 |     # Run the microservice entrypoint in the foreground
39 |     python /workspace/src/microservice_entrypoint.py
40 | fi
41 | 


--------------------------------------------------------------------------------
/docker/scripts/entrypoint_source_ext.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 3 | # All rights reserved.
 4 | # SPDX-License-Identifier: Apache-2.0
 5 | 
 6 | set -e
 7 | 
 8 | # Optional preparation tasks; each one is gated by an environment variable
 9 | 
10 | if [ "$INSTALL_ADOBE_SDK" = "true" ]; then
11 |   echo "Checking if Adobe PDF Services SDK is installed..."
12 | 
13 |   # Install only when the pdfservices-sdk~=4.0.0 requirement is not already satisfied
14 |   if ! python -c "import pkg_resources; pkg_resources.require('pdfservices-sdk~=4.0.0')" 2>/dev/null; then
15 |     echo "Installing Adobe PDF Services SDK..."
16 |     pip install "pdfservices-sdk~=4.0.0"
17 |   fi
18 | fi
19 | 
20 | # Install audio extraction dependencies when INSTALL_AUDIO_EXTRACTION_DEPS is "true"
21 | if [ "$INSTALL_AUDIO_EXTRACTION_DEPS" = "true" ]; then
22 |   echo "Checking if librosa is installed..."
23 | 
24 |   # Install only when the librosa requirement is not already satisfied
25 |   if ! python -c "import pkg_resources; pkg_resources.require('librosa')" 2>/dev/null; then
26 |     echo "Installing librosa using conda..."
27 |     mamba install -y -c conda-forge librosa
28 |   fi
29 | fi
30 | 
31 | # If MEM_TRACE is set to a non-empty value, use mamba to install memray
32 | if [ -n "$MEM_TRACE" ]; then
33 |   echo "MEM_TRACE is set. Installing memray via mamba..."
34 |   mamba install -y conda-forge::memray
35 | fi
36 | 


--------------------------------------------------------------------------------
/docker/scripts/post_build_triggers.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from transformers import AutoTokenizer
 3 | 
 4 | if os.getenv("DOWNLOAD_LLAMA_TOKENIZER") == "True":
 5 |     tokenizer_path = os.path.join(os.environ.get("MODEL_PREDOWNLOAD_PATH"), "llama-3.2-1b/tokenizer/")
 6 |     os.makedirs(tokenizer_path)
 7 | 
 8 |     tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B", token=os.getenv("HF_ACCESS_TOKEN"))
 9 |     tokenizer.save_pretrained(tokenizer_path)
10 | else:
11 |     tokenizer_path = os.path.join(os.environ.get("MODEL_PREDOWNLOAD_PATH"), "e5-large-unsupervised/tokenizer/")
12 |     os.makedirs(tokenizer_path)
13 | 
14 |     tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-large-unsupervised")
15 |     tokenizer.save_pretrained(tokenizer_path)
16 | 


--------------------------------------------------------------------------------
/docs/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM squidfunk/mkdocs-material:latest
2 | 
3 | # Install plugins.
4 | RUN apk add gcc python3-dev musl-dev linux-headers
5 | COPY requirements.txt /tmp/
6 | RUN pip install --disable-pip-version-check --no-cache-dir -r /tmp/requirements.txt
7 | 


--------------------------------------------------------------------------------
/docs/docs/assets/css/fonts.css:
--------------------------------------------------------------------------------
 1 | @font-face {
 2 |   font-family: "NVIDIA Sans";
 3 |   font-style: normal;
 4 |   src: url(https://brand-assets.cne.ngc.nvidia.com/assets/fonts/nvidia-sans/1.0.0/NVIDIASans_Lt.woff2);
 5 |   font-weight: 300; /* Light face; the keyword `light` is not a valid font-weight value */
 6 | }
 7 | 
 8 | @font-face {
 9 |   font-family: "NVIDIA Sans";
10 |   font-style: italic;
11 |   src: url(https://brand-assets.cne.ngc.nvidia.com/assets/fonts/nvidia-sans/1.0.0/NVIDIASans_LtIt.woff2);
12 |   font-weight: 300; /* Light italic face; the keyword `light` is not a valid font-weight value */
13 | }
14 | 
15 | @font-face {
16 |   font-family: "NVIDIA Sans";
17 |   font-style: normal;
18 |   src: url(https://brand-assets.cne.ngc.nvidia.com/assets/fonts/nvidia-sans/1.0.0/NVIDIASans_Rg.woff2);
19 |   font-weight: normal;
20 | }
21 | 
22 | @font-face {
23 |   font-family: "NVIDIA Sans";
24 |   font-style: italic;
25 |   src: url(https://brand-assets.cne.ngc.nvidia.com/assets/fonts/nvidia-sans/1.0.0/NVIDIASans_It.woff2);
26 |   font-weight: normal;
27 | }
28 | 
29 | @font-face {
30 |   font-family: "NVIDIA Sans";
31 |   font-style: normal;
32 |   src: url(https://brand-assets.cne.ngc.nvidia.com/assets/fonts/nvidia-sans/1.0.0/NVIDIASans_Bd.woff2);
33 |   font-weight: bold;
34 | }
35 | 
36 | @font-face {
37 |   font-family: "NVIDIA Sans";
38 |   font-style: italic;
39 |   src: url(https://brand-assets.cne.ngc.nvidia.com/assets/fonts/nvidia-sans/1.0.0/NVIDIASans_BdIt.woff2);
40 |   font-weight: bold;
41 | }
42 | 


--------------------------------------------------------------------------------
/docs/docs/assets/css/jupyter-themes.css:
--------------------------------------------------------------------------------
 1 | /* theme: light */
 2 | body[data-md-color-scheme="light"] .jupyter-notebook {
 3 |   --jp-cell-editor-background: #f7f7f7;
 4 |   --jp-cell-editor-border-color: #cfcfcf;
 5 |   --jp-cell-prompt-fg-color: #303030;
 6 |   --jp-cell-prompt-bg-color: #f0f0f0;
 7 |   --jp-notebook-background: #ffffff;
 8 |   --jp-layout-color1: #ffffff;
 9 |   --jp-content-font-color1: #000000;
10 | }
11 | 
12 | /* theme: dark */
13 | body[data-md-color-scheme="dark"] .jupyter-notebook {
14 |   --jp-cell-editor-background: #2b2b2b;
15 |   --jp-cell-editor-border-color: #464646;
16 |   --jp-cell-prompt-fg-color: #d7d7d7;
17 |   --jp-cell-prompt-bg-color: #333333;
18 |   --jp-notebook-background: #1e1e1e;
19 |   --jp-layout-color1: #1e1e1e;
20 |   --jp-content-font-color1: #d4d4d4;
21 | }
22 | 


--------------------------------------------------------------------------------
/docs/docs/extraction/contributing.md:
--------------------------------------------------------------------------------
1 | # Contributing to NV-Ingest
2 | 
3 | External contributions to NV-Ingest will be welcome soon, and they are greatly appreciated! 
4 | For more information, refer to [Contributing to NV-Ingest](https://github.com/NVIDIA/nv-ingest/blob/main/CONTRIBUTING.md).
5 | 


--------------------------------------------------------------------------------
/docs/docs/extraction/example_processed_docs/text/multimodal_test.pdf.metadata.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/docs/docs/extraction/example_processed_docs/text/multimodal_test.pdf.metadata.json


--------------------------------------------------------------------------------
/docs/docs/extraction/helm.md:
--------------------------------------------------------------------------------
1 | # Deploy With Helm for NeMo Retriever Extraction
2 | 
3 | <!-- Use this documentation to deploy [NeMo Retriever extraction](overview.md) by using Helm. -->
4 | 
5 | To deploy [NeMo Retriever extraction](overview.md) by using Helm, 
6 | refer to [NV-Ingest Helm Charts](https://github.com/NVIDIA/nv-ingest/tree/main/helm).
7 | 


--------------------------------------------------------------------------------
/docs/docs/extraction/images/audio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/docs/docs/extraction/images/audio.png


--------------------------------------------------------------------------------
/docs/docs/extraction/images/generate_personal_key.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/docs/docs/extraction/images/generate_personal_key.png


--------------------------------------------------------------------------------
/docs/docs/extraction/images/image_viewer_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/docs/docs/extraction/images/image_viewer_example.png


--------------------------------------------------------------------------------
/docs/docs/extraction/images/overview-extraction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/docs/docs/extraction/images/overview-extraction.png


--------------------------------------------------------------------------------
/docs/docs/extraction/images/overview-retriever.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/docs/docs/extraction/images/overview-retriever.png


--------------------------------------------------------------------------------
/docs/docs/extraction/images/preview-image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/docs/docs/extraction/images/preview-image.png


--------------------------------------------------------------------------------
/docs/docs/extraction/images/prometheus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/docs/docs/extraction/images/prometheus.png


--------------------------------------------------------------------------------
/docs/docs/extraction/images/test.pdf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/docs/docs/extraction/images/test.pdf.png


--------------------------------------------------------------------------------
/docs/docs/extraction/images/zipkin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/docs/docs/extraction/images/zipkin.png


--------------------------------------------------------------------------------
/docs/docs/extraction/ngc-api-key.md:
--------------------------------------------------------------------------------
 1 | # Generate Your NGC Keys
 2 | 
 3 | NGC contains many public images, models, and datasets that can be pulled immediately without authentication. 
 4 | To push and pull custom images, you must generate a key and authenticate with NGC.
 5 | 
 6 | To create a key, go to [https://org.ngc.nvidia.com/setup/api-keys](https://org.ngc.nvidia.com/setup/api-keys).
 7 | 
 8 | When you create an NGC key, select the following for **Services Included**.
 9 | 
10 | - **NGC Catalog**
11 | - **Public API Endpoints**
12 | 
13 | !!! important
14 | 
15 |     Early Access participants must also select **Private Registry**.
16 | 
17 | ![Generate Personal Key](images/generate_personal_key.png)
18 | 
19 | 
20 | ## Docker Login to NGC
21 | 
22 | To pull the NIM container image from NGC, use your key to log in to the NGC registry by entering the following command and then following the prompts. 
23 | For the username, enter `$oauthtoken` exactly as shown. 
24 | It is a special username that is the same for all users; your key is the password.
25 | 
26 | 
27 | ```shell
28 | $ docker login nvcr.io
29 | Username: $oauthtoken
30 | Password: <Your Key>
31 | ```
32 | 


--------------------------------------------------------------------------------
/docs/docs/extraction/notebooks.md:
--------------------------------------------------------------------------------
 1 | # Notebooks for NeMo Retriever Extraction
 2 | 
 3 | To get started using [NeMo Retriever extraction](overview.md), you can try one of the ready-made notebooks that are available.
 4 | 
 5 | !!! note
 6 | 
 7 |     NeMo Retriever extraction is also known as NVIDIA Ingest and nv-ingest.
 8 | 
 9 | 
10 | To get started with the basics, try one of the following notebooks:
11 | 
12 | - [NV-Ingest: CLI Client Quick Start Guide](https://github.com/NVIDIA/nv-ingest/blob/main/client/client_examples/examples/cli_client_usage.ipynb)
13 | - [NV-Ingest: Python Client Quick Start Guide](https://github.com/NVIDIA/nv-ingest/blob/main/client/client_examples/examples/python_client_usage.ipynb)
14 | 
15 | For more advanced scenarios, try one of the following notebooks:
16 | 
17 | - [Try out the NVIDIA Multimodal PDF Data Extraction Blueprint](https://github.com/NVIDIA/nv-ingest/blob/main/deploy/pdf-blueprint.ipynb)
18 | - [Evaluate bo767 retrieval recall accuracy with NV-Ingest and Milvus](https://github.com/NVIDIA/nv-ingest/blob/main/evaluation/bo767_recall.ipynb)
19 | - [Multimodal RAG with LangChain](https://github.com/NVIDIA/nv-ingest/blob/main/examples/langchain_multimodal_rag.ipynb)
20 | - [Multimodal RAG with LlamaIndex](https://github.com/NVIDIA/nv-ingest/blob/main/examples/llama_index_multimodal_rag.ipynb)
21 | 
22 | 
23 | 
24 | ## Related Topics
25 | 
26 | - [Prerequisites](prerequisites.md)
27 | - [Support Matrix](support-matrix.md)
28 | - [Deploy Without Containers (Library Mode)](quickstart-library-mode.md)
29 | - [Deploy With Docker Compose (Self-Hosted)](quickstart-guide.md)
30 | - [Deploy With Helm](helm.md)
31 | 


--------------------------------------------------------------------------------
/docs/docs/extraction/prerequisites.md:
--------------------------------------------------------------------------------
 1 | # Prerequisites for NeMo Retriever Extraction
 2 | 
 3 | Before you begin using [NeMo Retriever extraction](overview.md), ensure the following software prerequisites are met.
 4 | 
 5 | 
 6 | ## Software
 7 | 
 8 | - Linux operating systems (Ubuntu 22.04 or later recommended)
 9 | - [Docker](https://docs.docker.com/engine/install/)
10 | - [Docker Compose](https://docs.docker.com/compose/install/)
11 | - [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) (NVIDIA Driver >= `535`, CUDA >= `12.2`)
12 | - [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)
13 | - [Conda Python environment and package manager](https://github.com/conda-forge/miniforge)
14 | 
15 | 
16 | !!! note
17 | 
18 |     You install Python later. NV-Ingest only supports [Python version 3.10](https://www.python.org/downloads/release/python-3100/).
19 | 
20 | 
21 | ## Related Topics
22 | 
23 | - [Support Matrix](support-matrix.md)
24 | - [Deploy Without Containers (Library Mode)](quickstart-library-mode.md)
25 | - [Deploy With Docker Compose (Self-Hosted)](quickstart-guide.md)
26 | - [Deploy With Helm](helm.md)
27 | 


--------------------------------------------------------------------------------
/docs/docs/extraction/releasenotes-nv-ingest.md:
--------------------------------------------------------------------------------
 1 | # Release Notes for NeMo Retriever Extraction
 2 | 
 3 | This documentation contains the release notes for [NeMo Retriever extraction](overview.md).
 4 | 
 5 | !!! note
 6 | 
 7 |     NeMo Retriever extraction is also known as NVIDIA Ingest and nv-ingest.
 8 | 
 9 | ## Release 25.03
10 | 
11 | ### Summary
12 | 
13 | The NeMo Retriever extraction 25.03 release includes accuracy improvements, feature expansions, and throughput improvements.
14 | 
15 | ### New Features
16 | 
17 | - Consolidated NeMo Retriever extraction to run on a single GPU (H100, A100, L40S, or A10G). For details, refer to [Support Matrix](support-matrix.md).
18 | - Added Library Mode for a lightweight no-GPU deployment that uses NIM endpoints hosted on build.nvidia.com. For details, refer to [Deploy Without Containers (Library Mode)](quickstart-library-mode.md).
19 | - Added support for infographics extraction.
20 | - Added support for RIVA NIM for Audio extraction (Early Access). For details, refer to [Audio Processing](audio.md).
21 | - Added support for Llama-3.2 VLM for Image Captioning capability.
22 | - Added support for image detection and extraction from DOCX, PPTX, JPG, and PNG files.
23 | - Deprecated DePlot and CACHED NIMs.
24 | <!-- - Integrated with nemoretriever-parse NIM for state-of-the-art text extraction -->
25 | <!-- - Integrated with new NVIDIA NIMs -->
26 | <!--   - Nemoretriever-table-structure-v1 -->
27 | <!--   - Nemoretriever-graphic-elements-v1 -->
28 | <!--   - Nemoretriever-page-elements-v2 -->
29 | 
30 | 
31 | 
32 | 
33 | ## Release 24.12.1
34 | 
35 | ### Bug fixes
36 | 
37 | Cases where .split() tasks fail during ingestion are now fixed.
38 | 
39 | 
40 | ## Release 24.12
41 | 
42 | ### Known Issues
43 | 
44 | We currently do not support OCR-based text extraction. This was discovered in an unsupported use case and is not a product functionality issue.
45 | 


--------------------------------------------------------------------------------
/docs/docs/extraction/telemetry.md:
--------------------------------------------------------------------------------
 1 | # Telemetry with NeMo Retriever Extraction
 2 | 
 3 | You can view telemetry data for [NeMo Retriever extraction](overview.md).
 4 | 
 5 | !!! note
 6 | 
 7 |     NeMo Retriever extraction is also known as NVIDIA Ingest and nv-ingest.
 8 | 
 9 | 
10 | ## OpenTelemetry
11 | 
12 | After OpenTelemetry and Zipkin are running, you can open your browser to explore traces: 
13 | 
14 | - **Docker** — Use http://$YOUR_DOCKER_HOST:9411/zipkin/ 
15 | - **Kubernetes** — Use http://$YOUR_K8S_OTEL_POD:9411/zipkin/
16 | 
17 | ![](images/zipkin.png)
18 | 
19 | ## Prometheus
20 | 
21 | After Prometheus is running, you can open your browser to explore metrics: 
22 | 
23 | - **Docker** — Use http://$YOUR_DOCKER_HOST:9090/
24 | - **Kubernetes** — Use http://$YOUR_K8S_OTEL_POD:9090/
25 | 
26 | ![](images/prometheus.png)
27 | 


--------------------------------------------------------------------------------
/docs/overrides/main.html:
--------------------------------------------------------------------------------
 1 | {% extends "base.html" %}
 2 |  
 3 | 
 4 | 
 5 | {% block extrahead %}
 6 |  
 7 |     <script src="https://assets.adobedtm.com/5d4962a43b79/c1061d2c5e7b/launch-191c2462b890.min.js"></script>
 8 |  
 9 | {% endblock %}
10 | 
11 | 
12 | 
13 | {% block footer %}
14 | 
15 | {{ super() }}
16 | 
17 |     <script type="text/javascript">if (typeof _satellite !== "undefined") {_satellite.pageBottom();}</script>
18 |  
19 | {% endblock %}
20 | 


--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
 1 | mkdocs-material
 2 | mkdocs-macros-plugin
 3 | mkdocs-minify-plugin
 4 | mkdocstrings[python]
 5 | mkdocs-gen-files
 6 | pymdown-extensions
 7 | mkdocs-jupyter
 8 | mkdocs-include-dir-to-nav
 9 | mkdocs-literate-nav
10 | mkdocs-site-urls
11 | mkdocs-redirects
12 | myst-parser
13 | nvidia-sphinx-theme
14 | sphinx
15 | sphinx-markdown-builder
16 | sphinx-rtd-theme
17 | swagger-plugin-for-sphinx
18 | 


--------------------------------------------------------------------------------
/docs/scripts/generate_openapi_docs.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import yaml
 3 | import click
 4 | import os
 5 | 
 6 | from nv_ingest.api.main import app
 7 | 
 8 | 
 9 | @click.command()
10 | @click.option("--output", default="openapi.yaml", help="Path to OpenAPI output file (default: openapi.json)")
11 | def write_openapi_schema(output):
12 |     if os.path.isdir(output):
13 |         print(f"Warning: '{output}' is a directory. Defaulting to '{output}/openapi.yaml'.")
14 |         output = os.path.join(output, "openapi.yaml")
15 | 
16 |     # Determine format based on file extension
17 |     if output.endswith(".yaml") or output.endswith(".yml"):
18 |         with open(output, "w") as f:
19 |             yaml.dump(app.openapi(), f, default_flow_style=False)
20 |         print(f"OpenAPI YAML written to: {output}")
21 |     else:
22 |         with open(output, "w") as f:
23 |             json.dump(app.openapi(), f, indent=4)
24 |         print(f"OpenAPI JSON written to: {output}")
25 | 
26 | 
27 | if __name__ == "__main__":
28 |     write_openapi_schema()
29 | 


--------------------------------------------------------------------------------
/docs/sphinx_docs/source/conf.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | import os
 6 | import sys
 7 | 
 8 | sys.path.insert(0, os.path.abspath("../../../api/src"))  # nv-ingest-api src
 9 | sys.path.insert(1, os.path.abspath("../../../client/src"))  # nv-ingest-client src
10 | sys.path.insert(2, os.path.abspath("../../../src"))  # nv-ingest src
11 | 
12 | project = "nv-ingest"
13 | copyright = "2025, Nvidia"
14 | author = "Nvidia"
15 | release = "24.12"
16 | 
17 | # -- General configuration ---------------------------------------------------
18 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
19 | 
20 | extensions = [
21 |     "myst_parser",
22 |     "sphinx.ext.autodoc",
23 |     "sphinx.ext.autosummary",
24 |     "sphinx.ext.napoleon",
25 |     "sphinx.ext.viewcode",
26 |     "swagger_plugin_for_sphinx",
27 | ]
28 | 
29 | templates_path = ["_templates"]
30 | exclude_patterns = []
31 | 
32 | 
33 | # -- Options for HTML output -------------------------------------------------
34 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
35 | 
36 | html_theme = "nvidia_sphinx_theme"
37 | 
# Theme options passed to the theme selected by `html_theme`.
html_theme_options = {
    "header_links": [
        ("Home", "index"),
        # Link to this project's repository (previously pointed at the theme's own repository).
        ("GitHub", "https://github.com/NVIDIA/nv-ingest", True, "fab fa-github"),
    ],
    "footer_links": [
        ("Privacy Policy", "https://www.nvidia.com/en-us/about-nvidia/privacy-policy/"),
        ("Terms of Use", "https://www.nvidia.com/en-us/about-nvidia/legal-info/"),
    ],
    "show_prev_next": True,  # Show next/previous buttons at bottom
}
49 | 
50 | html_static_path = ["_static"]
51 | 


--------------------------------------------------------------------------------
/docs/sphinx_docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | ===============
 2 | API reference
 3 | ===============
 4 | 
 5 | Provides API references for the ``nv-ingest-api``, ``nv-ingest-client``, and ``nv-ingest`` modules.
 6 | 
 7 | .. toctree::
 8 |     :maxdepth: 2
 9 |     :caption: NV-Ingest Packages
10 | 
11 |     nv-ingest-api/modules.rst
12 |     nv-ingest-client/modules.rst
13 |     nv-ingest/modules.rst
14 |     
15 | 


--------------------------------------------------------------------------------
/docs/sphinx_docs/source/openapi.rst:
--------------------------------------------------------------------------------
1 | ==================================
2 | NV-Ingest OpenAPI reference
3 | ==================================
4 | 
5 | .. swagger-plugin:: openapi.yaml
6 |     :id: nv-ingest-openapi
7 |     :page-title: NV-Ingest OpenAPI Reference
8 | 


--------------------------------------------------------------------------------
/examples/launch_libmode_service.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | import logging
 6 | import os
 7 | import sys
 8 | 
 9 | from nv_ingest.framework.orchestration.morpheus.util.pipeline.pipeline_runners import (
10 |     PipelineCreationSchema,
11 |     start_pipeline_subprocess,
12 | )
13 | from nv_ingest_api.util.logging.configuration import configure_logging as configure_local_logging
14 | 
15 | # Configure the logger
16 | logger = logging.getLogger(__name__)
17 | 
18 | local_log_level = os.getenv("INGEST_LOG_LEVEL", "INFO")
19 | if local_log_level in ("DEFAULT",):
20 |     local_log_level = "INFO"
21 | 
22 | configure_local_logging(logger, local_log_level)
23 | 
24 | 
def main():
    """Start the ingestion pipeline in a subprocess and block until it exits.

    A ``PipelineCreationSchema`` is built from an (currently empty) set of
    overrides, handed to ``start_pipeline_subprocess`` together with this
    process's stdout/stderr, and the returned process object is waited on.
    Any exception is logged and the program exits with status 1.
    """
    try:
        # Hook for overriding configuration parameters; empty by default.
        overrides = {}

        # Keep only entries that were actually set, so that the schema's own
        # defaults apply to everything left as None.
        effective_overrides = {}
        for name, setting in overrides.items():
            if setting is not None:
                effective_overrides[name] = setting

        pipeline_config = PipelineCreationSchema(**effective_overrides)

        # Launch the pipeline, passing along this process's output streams.
        child = start_pipeline_subprocess(pipeline_config, stderr=sys.stderr, stdout=sys.stdout)

        # Block here until the pipeline process finishes.
        # NOTE(review): the original author notes that an atexit handler
        # (not visible in this file) terminates the subprocess group on exit.
        child.wait()

    except Exception as e:
        logger.error(f"Error running pipeline subprocess or ingestion: {e}")

        # Non-zero exit status signals the failure to the caller.
        sys.exit(1)
48 | 
49 | 
50 | if __name__ == "__main__":
51 |     main()
52 | 


--------------------------------------------------------------------------------
/helm/.helmignore:
--------------------------------------------------------------------------------
 1 | # Patterns to ignore when building packages.
 2 | # This supports shell glob matching, relative path matching, and
 3 | # negation (prefixed with !). Only one pattern per line.
 4 | .DS_Store
 5 | # Common VCS dirs
 6 | .git/
 7 | .gitignore
 8 | .bzr/
 9 | .bzrignore
10 | .hg/
11 | .hgignore
12 | .svn/
13 | # Common backup files
14 | *.swp
15 | *.bak
16 | *.tmp
17 | *.orig
18 | *~
19 | # Various IDEs
20 | .project
21 | .idea/
22 | *.tmproj
23 | .vscode/
24 | 
25 | # Ignore shell scripts
26 | update_helm_readme.sh
27 | 
28 | # Ignore temporary files
29 | *.tmp
30 | *.prefix
31 | *.suffix
32 | 
33 | # Ignore editor files
34 | *.swp
35 | *.swo
36 | *~
37 | 
38 | # Ignore OS files
39 | .DS_Store
40 | Thumbs.db
41 | 
42 | # Ignore time-slicing directory
43 | time-slicing/
44 | 
45 | # Ignore helm-docs template file
46 | README.md.gotmpl
47 | 


--------------------------------------------------------------------------------
/helm/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # CHANGELOG
 3 | 
 4 | ## v0.2.24 - 15 Aug 2024
 5 | 
 6 | Update the handling of the default for the persistent volume name
 7 | 
 8 | ## v0.2.23 - 15 Aug 2024
 9 | 
10 | Update for otel env vars
11 | 


--------------------------------------------------------------------------------
/helm/Chart.lock:
--------------------------------------------------------------------------------
 1 | dependencies:
 2 | - name: nvidia-nim-nemoretriever-page-elements-v2
 3 |   repository: https://helm.ngc.nvidia.com/nim/nvidia
 4 |   version: 1.2.0
 5 | - name: nvidia-nim-nemoretriever-graphic-elements-v1
 6 |   repository: https://helm.ngc.nvidia.com/nim/nvidia
 7 |   version: 1.2.0
 8 | - name: nvidia-nim-nemoretriever-table-structure-v1
 9 |   repository: https://helm.ngc.nvidia.com/nim/nvidia
10 |   version: 1.2.0
11 | - name: nim-vlm
12 |   repository: https://helm.ngc.nvidia.com/nvidia/nemo-microservices
13 |   version: 1.2.0-ea-v2
14 | - name: nvidia-nim-paddleocr
15 |   repository: https://helm.ngc.nvidia.com/nim/baidu
16 |   version: 1.2.0
17 | - name: nvidia-nim-nv-embedqa-e5-v5
18 |   repository: https://helm.ngc.nvidia.com/nim/nvidia
19 |   version: 1.5.0
20 | - name: nvidia-nim-llama-32-nv-embedqa-1b-v2
21 |   repository: https://helm.ngc.nvidia.com/nim/nvidia
22 |   version: 1.5.0
23 | - name: riva-nim
24 |   repository: https://helm.ngc.nvidia.com/nim/nvidia
25 |   version: 1.0.0
26 | - name: milvus
27 |   repository: https://zilliztech.github.io/milvus-helm
28 |   version: 4.1.11
29 | - name: redis
30 |   repository: oci://registry-1.docker.io/bitnamicharts
31 |   version: 19.1.3
32 | - name: zipkin
33 |   repository: https://zipkin.io/zipkin-helm
34 |   version: 0.1.2
35 | - name: opentelemetry-collector
36 |   repository: https://open-telemetry.github.io/opentelemetry-helm-charts
37 |   version: 0.78.1
38 | digest: sha256:7675e65058740aa9ab90e4f3b458f226bd2dd9a992a3ea7353dd2de6f732a26f
39 | generated: "2025-05-01T10:32:44.178383534-04:00"
40 | 


--------------------------------------------------------------------------------
/helm/LICENSE:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/helm/mig/nv-ingest-mig-config.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: ConfigMap
 3 | metadata:
 4 |   name: nv-ingest-mig-config
 5 | data:
 6 |   config.yaml: |
 7 |     version: v1
 8 |     mig-configs:
 9 |       all-disabled:
10 |         - devices: all
11 |           mig-enabled: false
12 | 
13 |       single-gpu-nv-ingest:
14 |         - devices: [0]
15 |           mig-enabled: true
16 |           mig-devices:
17 |             "1g.10gb": 7
18 |         - devices: [1]
19 |           mig-enabled: true
20 |           mig-devices:
21 |             "7g.80gb": 1
22 |         - devices: [2]
23 |           mig-enabled: true
24 |           mig-devices:
25 |             "7g.80gb": 1
26 |         - devices: [3]
27 |           mig-enabled: true
28 |           mig-devices:
29 |             "7g.80gb": 1
30 |         - devices: [4]
31 |           mig-enabled: true
32 |           mig-devices:
33 |             "7g.80gb": 1
34 |         - devices: [5]
35 |           mig-enabled: true
36 |           mig-devices:
37 |             "7g.80gb": 1
38 |         - devices: [6]
39 |           mig-enabled: true
40 |           mig-devices:
41 |             "7g.80gb": 1
42 |         - devices: [7]
43 |           mig-enabled: true
44 |           mig-devices:
45 |             "7g.80gb": 1
46 | 


--------------------------------------------------------------------------------
/helm/templates/NOTES.txt:
--------------------------------------------------------------------------------
 1 | Installed {{ .Chart.Name }}-{{ .Chart.Version }}, named {{ .Release.Name }}.
 2 | Visit the application via:
 3 | {{- if .Values.ingress.enabled }}
 4 | {{- range $host := .Values.ingress.hosts }}
 5 |   {{- range .paths }}
 6 |   http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }}
 7 |   {{- end }}
 8 | {{- end }}
 9 | {{- else  if and .Values.virtualService .Values.virtualService.enabled }}
10 |   https://{{ .Values.virtualService.dnsName }}
11 | {{- end }}
12 | 
13 | To learn more about the release, try:
14 | 
15 |   $ helm status {{ .Release.Name }}
16 |   $ helm get all {{ .Release.Name }}
17 |   $ helm test {{ .Release.Name }}
18 | 


--------------------------------------------------------------------------------
/helm/templates/configmap.yaml:
--------------------------------------------------------------------------------
1 | {{- if not .Values.extraEnvVarsCM }}
2 | ---
3 | apiVersion: v1
4 | kind: ConfigMap
5 | metadata:
6 |   name: {{ include "nv-ingest.fullname" . }}
7 | data:
8 | {{- end }}
9 | 


--------------------------------------------------------------------------------
/helm/templates/hpa.yaml:
--------------------------------------------------------------------------------
 1 | {{- if .Values.autoscaling.enabled }}
 2 | ---
 3 | apiVersion: autoscaling/v2
 4 | kind: HorizontalPodAutoscaler
 5 | metadata:
 6 |   name: {{ include "nv-ingest.fullname" . }}
 7 |   labels:
 8 |     {{- include "nv-ingest.labels" . | nindent 4 }}
 9 | spec:
10 |   scaleTargetRef:
11 |     apiVersion: apps/v1
12 |     kind: Deployment
13 |     name: {{ include "nv-ingest.fullname" . }}
14 |   minReplicas: {{ .Values.autoscaling.minReplicas }}
15 |   maxReplicas: {{ .Values.autoscaling.maxReplicas }}
16 |   metrics:
17 |    {{- range .Values.autoscaling.metrics }}
18 |       - {{- . | toYaml | nindent 10 }}
19 |     {{- end }}
20 | {{- end }}
21 | 


--------------------------------------------------------------------------------
/helm/templates/secrets.yaml:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | {{ if .Values.ngcImagePullSecret.create -}}
 6 | ---
 7 | apiVersion: v1
 8 | kind: Secret
 9 | metadata:
10 |   name: ngc-secret  # name expected by NIMs
11 | type: kubernetes.io/dockerconfigjson
12 | data:
13 |   .dockerconfigjson: {{ template "nv-ingest.ngcImagePullSecret" . }}
14 | {{- end }}
15 | 
16 | 
17 | {{ if and .Values.ngcApiSecret.create -}}
18 | ---
19 | apiVersion: v1
20 | kind: Secret
21 | metadata:
22 |   name: ngc-api  # Name expected by NIMs
23 | type: Opaque
24 | stringData:
25 |   NGC_CLI_API_KEY: {{ template "nv-ingest.ngcApiSecret" . }}
26 |   NGC_API_KEY: {{ template "nv-ingest.ngcApiSecret" . }}
27 | {{- end }}
28 | 


--------------------------------------------------------------------------------
/helm/templates/service.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | apiVersion: v1
 3 | kind: Service
 4 | metadata:
 5 |   name: {{ .Values.service.name | default (include "nv-ingest.fullname" .) }}
 6 |   labels:
 7 |     {{- include "nv-ingest.labels" . | nindent 4 }}
 8 |     {{- if .Values.service.labels }}
 9 |     {{- toYaml .Values.service.labels | nindent 4 }}
10 |     {{- end }}
11 |   annotations:
12 |     {{- if .Values.service.annotations }}
13 |     {{- toYaml .Values.service.annotations | nindent 4 }}
14 |     {{- end }}
15 | spec:
16 |   type: {{ .Values.service.type }}
17 |   ports:
18 |     {{- if .Values.service.port }}
19 |     - port: {{ .Values.service.port }}
20 |       targetPort: http
21 |       protocol: TCP
22 |       name: nv-ingest-http
23 |     {{- end }}
24 |     {{- if .Values.service.nodePort }}
25 |       {{- with .Values.service.nodePort }}
26 |       nodePort:  {{ . }}
27 |       {{- end }}
28 |       {{- end }}
29 |   selector:
30 |     {{- include "nv-ingest.selectorLabels" . | nindent 4 }}
31 | 


--------------------------------------------------------------------------------
/helm/templates/serviceaccount.yaml:
--------------------------------------------------------------------------------
 1 | {{- if .Values.serviceAccount.create -}}
 2 | ---
 3 | apiVersion: v1
 4 | kind: ServiceAccount
 5 | metadata:
 6 |   name: {{ include "nv-ingest.serviceAccountName" . }}
 7 |   labels:
 8 |     {{- include "nv-ingest.labels" . | nindent 4 }}
 9 |   {{- with .Values.serviceAccount.annotations }}
10 |   annotations:
11 |     {{- toYaml . | nindent 4 }}
12 |   {{- end }}
13 | automountServiceAccountToken: {{ .Values.serviceAccount.automount }}
14 | {{- end }}
15 | 


--------------------------------------------------------------------------------
/helm/time-slicing/time-slicing-config.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: ConfigMap
 3 | metadata:
 4 |   name: time-slicing-config
 5 | data:
 6 |   any: |-
 7 |     version: v1
 8 |     flags:
 9 |       migStrategy: none
10 |     sharing:
11 |       timeSlicing:
12 |         renameByDefault: false
13 |         failRequestsGreaterThanOne: false
14 |         resources:
15 |           - name: nvidia.com/gpu
16 |             replicas: 16
17 | 


--------------------------------------------------------------------------------
/helm/update_helm_readme.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Regenerates the Helm chart's README.md by running helm-docs.
4 | # This wrapper is intentionally minimal for now; it exists so that more involved tasks can be added later.
5 | 
6 | helm-docs
7 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.black]
2 | line-length = 120
3 | 


--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | testpaths =
3 |     api/tests
4 |     tests
5 | markers =
6 |     integration: mark a test as an integration test
7 | addopts = -m "not integration"
8 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | import datetime
 6 | import os
 7 | import re
 8 | 
 9 | from setuptools import find_packages
10 | from setuptools import setup
11 | 
12 | 
def get_version():
    """Build the package version string from environment variables.

    Environment variables read:
      * ``NV_INGEST_RELEASE_TYPE`` -- ``"dev"`` (default) or ``"release"``.
      * ``NV_INGEST_VERSION`` -- base version in ``YYYY.M.D`` form; defaults to
        today's date when unset or empty.
      * ``NV_INGEST_REV`` -- integer revision number, default ``"0"``.

    Returns
    -------
    str
        ``"<version>.dev<rev>"`` for dev builds; for release builds
        ``"<version>.post<rev>"`` when ``rev > 0``, otherwise ``"<version>"``.

    Raises
    ------
    ValueError
        If the base version does not match ``YYYY.M.D``, the release type is
        unknown, the revision is not an integer, or a dev build is given a
        negative revision.
    """
    release_type = os.getenv("NV_INGEST_RELEASE_TYPE", "dev")
    version = os.getenv("NV_INGEST_VERSION")
    rev = os.getenv("NV_INGEST_REV", "0")

    if not version:
        version = datetime.datetime.now().strftime("%Y.%m.%d")

    # Ensure the base version has the expected YYYY.M.D shape. fullmatch is
    # used so that a trailing newline cannot slip past the pattern.
    if not re.fullmatch(r"\d{4}\.\d{1,2}\.\d{1,2}", version):
        raise ValueError(f"Version '{version}' is not PEP 440 compatible")

    if release_type not in ("dev", "release"):
        raise ValueError(f"Invalid release type: {release_type}")

    # The revision is embedded in the version string, so it must be an
    # integer; otherwise the result (e.g. "2025.1.2.devabc") is not PEP 440.
    try:
        rev_number = int(rev)
    except ValueError:
        raise ValueError(f"Revision '{rev}' is not an integer") from None

    # Construct the final version string
    if release_type == "dev":
        if rev_number < 0:
            raise ValueError(f"Revision '{rev}' must not be negative")
        return f"{version}.dev{rev_number}"

    # Release build: only a positive revision produces a post-release suffix.
    return f"{version}.post{rev_number}" if rev_number > 0 else version
35 | 
36 | 
def read_requirements(file_name):
    """Return the lines of a requirements file as a list of strings.

    The file is split on line boundaries and returned as-is: blank lines and
    comment lines are not filtered out, and no requirement parsing is done.
    """
    with open(file_name) as requirements_file:
        contents = requirements_file.read()
    return contents.splitlines()
41 | 
42 | 
43 | # Specify your requirements files
44 | requirements_files = []
45 | 
46 | # Read and combine requirements from all specified files
47 | combined_requirements = []
48 | for file in requirements_files:
49 |     combined_requirements.extend(read_requirements(file))
50 | 
51 | combined_requirements = list(set(combined_requirements))
52 | 
53 | setup(
54 |     author="Devin Robison",
55 |     author_email="drobison@nvidia.com",
56 |     classifiers=[],
57 |     description="Python module supporting document ingestion",
58 |     install_requires=combined_requirements,
59 |     license="Apache-2.0",
60 |     name="nv_ingest",
61 |     package_dir={"": "src"},
62 |     packages=find_packages(where="src"),
63 |     python_requires=">=3.10",
64 |     version=get_version(),
65 | )
66 | 


--------------------------------------------------------------------------------
/skaffold/README.md:
--------------------------------------------------------------------------------
1 | # Skaffold - NV-Ingest Development Team Only
2 | 
3 | 
4 | Skaffold is intended to support the NV-Ingest development team with Kubernetes development and testing. It is not meant for production deployments or for local testing.
5 | 
6 | We offer Kubernetes support through Helm and you can find those instructions at [Helm Documentation](../helm/README.md).
7 | 


--------------------------------------------------------------------------------
/skaffold/sensitive/.gitignore:
--------------------------------------------------------------------------------
1 | *.yaml
2 | 


--------------------------------------------------------------------------------
/src/ingest_pipeline_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "image_caption_extraction_module": {},
 3 |   "image_storage_module": {},
 4 |   "metadata_injection_module": {},
 5 |   "pdf_extractor_module": {},
 6 |   "redis_task_sink": {},
 7 |   "redis_task_source": {},
 8 |   "text_splitting_module": {},
 9 |   "otel_meter_module": {},
10 |   "embed_extractions_module": {},
11 |   "vdb_task_sink_module": {}
12 | }
13 | 


--------------------------------------------------------------------------------
/src/nv_ingest/__init__.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | import warnings
 6 | 
 7 | 
 8 | # Suppressing CUDA-related warnings when running NV-Ingest on a CPU-only system.
 9 | #
10 | # The warnings originate from Numba, which attempts to initialize CUDA even if no GPU is available.
11 | # These warnings include errors about missing CUDA drivers or failing to dlopen `libcuda.so.1`.
12 | #
13 | # By temporarily ignoring `UserWarning` during the import, we prevent unnecessary clutter in logs
14 | # while ensuring that cuDF still functions in CPU mode.
15 | #
16 | # Note: This does not affect cuDF behavior - it will still fall back to CPU execution if no GPU is detected.
17 | with warnings.catch_warnings():
18 |     warnings.simplefilter("ignore", category=UserWarning)
19 |     import cudf
20 | 


--------------------------------------------------------------------------------
/src/nv_ingest/api/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/src/nv_ingest/api/main.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | import logging
 6 | import os
 7 | 
 8 | from fastapi import FastAPI
 9 | from opentelemetry import trace
10 | from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
11 | from opentelemetry.sdk.resources import Resource
12 | from opentelemetry.sdk.trace import TracerProvider
13 | from opentelemetry.sdk.trace.export import BatchSpanProcessor
14 | 
15 | from .v1.health import router as HealthApiRouter
16 | from .v1.ingest import router as IngestApiRouter
17 | 
18 | logger = logging.getLogger(__name__)
19 | 
20 | # nv-ingest FastAPI app declaration
21 | app = FastAPI(
22 |     title="NV-Ingest Microservice",
23 |     description="Service for ingesting heterogenous datatypes",
24 |     version="25.4.2",
25 |     contact={
26 |         "name": "NVIDIA Corporation",
27 |         "url": "https://nvidia.com",
28 |     },
29 |     docs_url="/docs",
30 | )
31 | 
32 | app.include_router(IngestApiRouter, prefix="/v1")
33 | app.include_router(HealthApiRouter, prefix="/v1/health")
34 | 
35 | # Set up the tracer provider and add a processor for exporting traces
36 | resource = Resource(attributes={"service.name": "nv-ingest"})
37 | trace.set_tracer_provider(TracerProvider(resource=resource))
38 | tracer = trace.get_tracer(__name__)
39 | 
40 | otel_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "otel-collector:4317")
41 | exporter = OTLPSpanExporter(endpoint=otel_endpoint, insecure=True)
42 | span_processor = BatchSpanProcessor(exporter)
43 | trace.get_tracer_provider().add_span_processor(span_processor)
44 | 


--------------------------------------------------------------------------------
/src/nv_ingest/api/v1/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/orchestration/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/orchestration/morpheus/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/orchestration/morpheus/modules/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/orchestration/morpheus/modules/injectors/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 
5 | from .metadata_injector import MetadataInjectorLoaderFactory
6 | from .task_injection import TaskInjectorLoaderFactory
7 | 
8 | __all__ = ["MetadataInjectorLoaderFactory", "TaskInjectorLoaderFactory"]
9 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/orchestration/morpheus/modules/injectors/task_injection.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | 
 6 | import logging
 7 | 
 8 | import mrc
 9 | from morpheus.utils.module_utils import ModuleLoaderFactory
10 | from morpheus.utils.module_utils import register_module
11 | 
12 | from nv_ingest.framework.schemas.framework_task_injection_schema import TaskInjectionSchema
13 | from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_context_manager
14 | from nv_ingest.framework.orchestration.morpheus.util.modules.config_validator import (
15 |     fetch_and_validate_module_config,
16 | )
17 | from nv_ingest_api.internal.primitives.tracing.tagging import traceable
18 | from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage
19 | 
20 | logger = logging.getLogger(__name__)
21 | 
22 | MODULE_NAME = "task_injection"
23 | MODULE_NAMESPACE = "nv_ingest"
24 | 
25 | TaskInjectorLoaderFactory = ModuleLoaderFactory(MODULE_NAME, MODULE_NAMESPACE, TaskInjectionSchema)
26 | 
27 | 
def on_data(message: IngestControlMessage):
    """Look up the "task_meta" metadata on the message and pass the message through.

    The looked-up value is not used; the same message object is returned.
    """
    _ = message.get_metadata("task_meta")
    return message
32 | 
33 | 
@register_module(MODULE_NAME, MODULE_NAMESPACE)
def _task_injection(builder: mrc.Builder):
    """Register the task-injection module on the given builder.

    The module configuration is fetched and validated against
    ``TaskInjectionSchema``. A single node is created that forwards each
    control message through ``on_data``; that node serves as both the
    module's "input" and "output".

    Parameters
    ----------
    builder : mrc.Builder
        Builder used to create the node and register the module ports.
    """
    validated_config = fetch_and_validate_module_config(builder, TaskInjectionSchema)

    @nv_ingest_node_failure_context_manager(
        annotation_id=MODULE_NAME,
        raise_on_failure=validated_config.raise_on_failure,
    )
    @traceable(MODULE_NAME)
    def _on_data(ctrl_msg: IngestControlMessage):
        # Delegate to the module-level implementation. The statements that
        # used to follow this return were unreachable and have been removed.
        return on_data(ctrl_msg)

    # Use the decorated handler so the failure-context and tracing wrappers
    # declared above are actually applied; previously the bare `on_data` was
    # passed and `_on_data` was never used.
    # NOTE(review): the node name "vdb_resource_tagging" does not match
    # MODULE_NAME ("task_injection") -- looks copied from another module; confirm.
    node = builder.make_node("vdb_resource_tagging", _on_data)

    builder.register_module_input("input", node)
    builder.register_module_output("output", node)
53 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/orchestration/morpheus/modules/sinks/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 
5 | from .message_broker_task_sink import MessageBrokerTaskSinkLoaderFactory
6 | 
7 | __all__ = ["MessageBrokerTaskSinkLoaderFactory"]
8 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/orchestration/morpheus/modules/sources/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 
5 | from .message_broker_task_source import MessageBrokerTaskSourceLoaderFactory
6 | 
7 | __all__ = ["MessageBrokerTaskSourceLoaderFactory"]
8 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/orchestration/morpheus/modules/storages/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/src/nv_ingest/framework/orchestration/morpheus/modules/storages/__init__.py


--------------------------------------------------------------------------------
/src/nv_ingest/framework/orchestration/morpheus/modules/telemetry/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/src/nv_ingest/framework/orchestration/morpheus/modules/telemetry/__init__.py


--------------------------------------------------------------------------------
/src/nv_ingest/framework/orchestration/morpheus/modules/transforms/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 
5 | from .text_splitter import TextSplitterLoaderFactory
6 | 
7 | __all__ = ["TextSplitterLoaderFactory"]
8 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/orchestration/morpheus/stages/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/orchestration/morpheus/stages/extractors/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/orchestration/morpheus/stages/meta/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/orchestration/morpheus/stages/meta/linear_module_source_stage_cpu.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | from morpheus.config import ExecutionMode
 6 | from morpheus.stages.general.linear_modules_source import LinearModuleSourceStage
 7 | from morpheus.stages.general.linear_modules_stage import LinearModulesStage
 8 | 
 9 | 
class LinearModuleSourceStageCPU(LinearModuleSourceStage):
    """`LinearModuleSourceStage` subclass that reports CPU as its only supported execution mode."""

    def supported_execution_modes(self) -> tuple[ExecutionMode]:
        """Return the one-element tuple of execution modes this stage supports (CPU)."""
        cpu_only = (ExecutionMode.CPU,)
        return cpu_only
14 | 
15 | 
class LinearModuleStageCPU(LinearModulesStage):
    """`LinearModulesStage` subclass that reports CPU as its only supported execution mode."""

    def supported_execution_modes(self) -> tuple[ExecutionMode]:
        """Return the one-element tuple of execution modes this stage supports (CPU)."""
        cpu_only = (ExecutionMode.CPU,)
        return cpu_only
20 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/orchestration/morpheus/stages/mutate/__init__.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | 
 6 | from .image_dedup import generate_dedup_stage
 7 | from .image_filter import generate_image_filter_stage
 8 | 
 9 | __all__ = ["generate_dedup_stage", "generate_image_filter_stage"]
10 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/orchestration/morpheus/stages/store/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/orchestration/morpheus/stages/transforms/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 
5 | 
6 | from .image_caption_extraction import generate_caption_extraction_stage
7 | 
8 | __all__ = ["generate_caption_extraction_stage"]
9 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/orchestration/morpheus/util/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/orchestration/morpheus/util/modules/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/orchestration/morpheus/util/modules/config_validator.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | import logging
 6 | 
 7 | import mrc
 8 | from pydantic import ValidationError
 9 | 
10 | logger = logging.getLogger(__name__)
11 | 
12 | 
def fetch_and_validate_module_config(builder: mrc.Builder, schema_class):
    """
    Fetch the current module's configuration from the builder and validate it with a Pydantic schema.

    Parameters
    ----------
    builder : mrc.Builder
        The builder object used to access the current module's configuration.
    schema_class : Pydantic BaseModel
        The schema class to be used for validating the module configuration.

    Returns
    -------
    object
        The `schema_class` instance constructed from the module configuration.

    Raises
    ------
    ValueError
        If the module configuration fails validation according to the schema class. The original
        `ValidationError` is attached as the exception's cause.
    """
    module_config = builder.get_current_module_config()
    try:
        validated_config = schema_class(**module_config)
    except ValidationError as e:
        details = []
        for error in e.errors():
            # Model-level validators report an empty `loc`; use a placeholder rather than letting
            # `loc[0]` raise IndexError while the error message is being built.
            loc = error.get("loc") or ("__root__",)
            details.append(f"{loc[0]}: {error['msg']}")
        error_messages = "; ".join(details)
        log_error_message = f"Invalid configuration: {error_messages}"
        logger.error(log_error_message)
        # Chain explicitly so the full Pydantic report stays reachable via `__cause__`.
        raise ValueError(log_error_message) from e

    return validated_config
39 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/orchestration/morpheus/util/pipeline/__init__.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | from .pipeline_builders import setup_ingestion_pipeline
 6 | from .stage_builders import (
 7 |     add_sink_stage,
 8 |     add_source_stage,
 9 |     add_submitted_job_counter_stage,
10 |     add_metadata_injector_stage,
11 |     add_pdf_extractor_stage,
12 |     add_image_extractor_stage,
13 |     add_docx_extractor_stage,
14 |     add_pptx_extractor_stage,
15 |     add_image_dedup_stage,
16 |     add_image_filter_stage,
17 |     add_table_extractor_stage,
18 |     add_chart_extractor_stage,
19 |     add_image_caption_stage,
20 |     add_text_splitter_stage,
21 |     add_embed_extractions_stage,
22 |     add_embedding_storage_stage,
23 |     add_image_storage_stage,
24 |     add_vdb_task_sink_stage,
25 | )
26 | 
27 | __all__ = [
28 |     "setup_ingestion_pipeline",
29 |     "add_sink_stage",
30 |     "add_source_stage",
31 |     "add_submitted_job_counter_stage",
32 |     "add_metadata_injector_stage",
33 |     "add_pdf_extractor_stage",
34 |     "add_image_extractor_stage",
35 |     "add_docx_extractor_stage",
36 |     "add_pptx_extractor_stage",
37 |     "add_image_dedup_stage",
38 |     "add_image_filter_stage",
39 |     "add_table_extractor_stage",
40 |     "add_chart_extractor_stage",
41 |     "add_image_caption_stage",
42 |     "add_text_splitter_stage",
43 |     "add_embed_extractions_stage",
44 |     "add_embedding_storage_stage",
45 |     "add_image_storage_stage",
46 |     "add_vdb_task_sink_stage",
47 | ]
48 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/orchestration/morpheus/util/pipeline/logging.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | 
import logging
import os

from morpheus.utils.logger import configure_logging

from nv_ingest_api.util.logging.configuration import configure_logging as configure_local_logging
from nv_ingest.framework.orchestration.morpheus.util.pipeline.stage_builders import *
10 | 
11 | # Convert log level from string to logging level
12 | _log_level_mapping = {
13 |     "DEBUG": logging.DEBUG,
14 |     "INFO": logging.INFO,
15 |     "WARNING": logging.WARNING,
16 |     "ERROR": logging.ERROR,
17 |     "CRITICAL": logging.CRITICAL,
18 | }
19 | 
20 | 
21 | def get_log_level(str_level):
22 |     """
23 |     Converts the log level from a string to a logging level.
24 |     """
25 |     return _log_level_mapping.get(str_level.upper(), logging.INFO)
26 | 
27 | 
def setup_logging(log_level):
    """
    Configure the root logger, Morpheus logging, and local logging at one level.

    A non-empty INGEST_LOG_LEVEL environment variable takes precedence over ``log_level``.
    Level names are matched case-insensitively; "DEFAULT", unrecognized names, and a missing
    level (``None`` or an empty string) all resolve to ``logging.INFO``.

    Parameters
    ----------
    log_level : str, int, or None
        Level name such as "DEBUG", or a numeric `logging` level, used when INGEST_LOG_LEVEL is
        unset or empty.
    """
    # An unset or empty environment variable defers to the argument.
    requested_level = os.getenv("INGEST_LOG_LEVEL") or log_level

    if isinstance(requested_level, int) and not isinstance(requested_level, bool):
        # Already a numeric logging level; use it as-is.
        log_level_value = requested_level
    elif requested_level:
        # "DEFAULT" needs no special case: it is absent from the table, so it falls back to INFO.
        log_level_value = _log_level_mapping.get(str(requested_level).upper(), logging.INFO)
    else:
        # Previously a None level reached `.upper()` and raised AttributeError.
        log_level_value = logging.INFO

    logging.basicConfig(level=log_level_value, format="%(asctime)s - %(levelname)s - %(message)s")
    configure_logging(log_level=log_level_value)
    # NOTE(review): `logger` is not defined in this module's visible code; presumably it arrives
    # through the star import of stage_builders — confirm.
    configure_local_logging(logger, log_level_value)
43 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/schemas/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/src/nv_ingest/framework/schemas/__init__.py


--------------------------------------------------------------------------------
/src/nv_ingest/framework/schemas/framework_job_counter_schema.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | 
 6 | from pydantic import ConfigDict, BaseModel
 7 | 
 8 | 
class JobCounterSchema(BaseModel):
    # Pydantic configuration model for a job counter; the component that consumes it is not shown here.
    name: str = "job_counter"  # presumably the counter's name — confirm against the consumer
    raise_on_failure: bool = False  # off by default; presumably re-raises processing errors when True — confirm
    model_config = ConfigDict(extra="forbid")  # keys other than the fields above fail validation
13 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/schemas/framework_message_broker_sink_schema.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | 
 6 | from pydantic import Field, BaseModel
 7 | 
 8 | from typing_extensions import Annotated
 9 | 
10 | from nv_ingest_api.internal.schemas.message_brokers.message_broker_client_schema import MessageBrokerClientSchema
11 | 
12 | 
class MessageBrokerTaskSinkSchema(BaseModel):
    # Pydantic configuration model for a message-broker task sink.
    # Broker connection settings; defaults to a `MessageBrokerClientSchema` built with its own defaults.
    broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema()

    raise_on_failure: bool = False  # off by default; presumably re-raises processing errors when True — confirm

    # Must be >= 1 (enforced by `Field(ge=1)`); presumably a worker/engine count — confirm.
    progress_engines: Annotated[int, Field(ge=1)] = 6
19 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/schemas/framework_message_broker_source_schema.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | 
 6 | from pydantic import Field, BaseModel
 7 | 
 8 | from typing_extensions import Annotated
 9 | 
10 | from nv_ingest_api.internal.schemas.message_brokers.message_broker_client_schema import MessageBrokerClientSchema
11 | 
12 | 
class MessageBrokerTaskSourceSchema(BaseModel):
    # Pydantic configuration model for a message-broker task source.
    # Broker connection settings; defaults to a `MessageBrokerClientSchema` built with its own defaults.
    broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema()

    task_queue: str = "morpheus_task_queue"  # presumably the queue that tasks are read from — confirm
    raise_on_failure: bool = False  # off by default; presumably re-raises processing errors when True — confirm

    # Must be >= 1 (enforced by `Field(ge=1)`); presumably a worker/engine count — confirm.
    progress_engines: Annotated[int, Field(ge=1)] = 6
20 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/schemas/framework_message_wrapper_schema.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 | 
3 | 
class MessageWrapper(BaseModel):
    # Pydantic model wrapping a single required string field.
    payload: str  # required; presumably a serialized job specification — confirm against callers
6 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/schemas/framework_metadata_injector_schema.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | 
 6 | import logging
 7 | 
 8 | from pydantic import ConfigDict, BaseModel
 9 | 
10 | logger = logging.getLogger(__name__)
11 | 
12 | 
class MetadataInjectorSchema(BaseModel):
    # Pydantic configuration model for a metadata injector; the consuming component is not shown here.
    raise_on_failure: bool = False  # off by default; presumably re-raises processing errors when True — confirm
    model_config = ConfigDict(extra="forbid")  # keys other than the fields above fail validation
16 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/schemas/framework_otel_meter_schema.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | 
 6 | from pydantic import ConfigDict, BaseModel
 7 | 
 8 | from nv_ingest_api.internal.schemas.message_brokers.message_broker_client_schema import MessageBrokerClientSchema
 9 | 
10 | 
class OpenTelemetryMeterSchema(BaseModel):
    # Pydantic configuration model for an OpenTelemetry meter.
    # Broker connection settings; defaults to a `MessageBrokerClientSchema` built with its own defaults.
    broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema()

    otel_endpoint: str = "localhost:4317"  # host:port string; presumably the OTel collector address — confirm
    raise_on_failure: bool = False  # off by default; presumably re-raises processing errors when True — confirm
    model_config = ConfigDict(extra="forbid")  # keys other than the fields above fail validation
17 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/schemas/framework_otel_tracer_schema.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | 
 6 | from pydantic import ConfigDict, BaseModel
 7 | 
 8 | 
class OpenTelemetryTracerSchema(BaseModel):
    # Pydantic configuration model for an OpenTelemetry tracer.
    otel_endpoint: str = "localhost:4317"  # host:port string; presumably the OTel collector address — confirm
    raise_on_failure: bool = False  # off by default; presumably re-raises processing errors when True — confirm
    model_config = ConfigDict(extra="forbid")  # keys other than the fields above fail validation
13 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/schemas/framework_processing_job_schema.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | from pydantic import BaseModel, ConfigDict
 6 | from enum import Enum
 7 | 
 8 | 
 9 | class ConversionStatus(str, Enum):
10 |     IN_PROGRESS = "in_progress"
11 |     SUCCESS = "success"
12 |     FAILED = "failed"
13 | 
14 |     model_config = ConfigDict(extra="forbid")
15 | 
16 | 
class ProcessingJob(BaseModel):
    # Pydantic record describing one processing job and its conversion status.
    submitted_job_id: str  # required
    filename: str  # required
    raw_result: str = ""  # empty string until set
    content: str = ""  # empty string until set
    status: ConversionStatus  # required; one of the ConversionStatus members
    error: str | None = None  # None by default; presumably holds an error description on failure — confirm

    model_config = ConfigDict(extra="forbid")  # keys other than the fields above fail validation
26 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/schemas/framework_task_injection_schema.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | 
 6 | import logging
 7 | 
 8 | from pydantic import ConfigDict, BaseModel
 9 | 
10 | logger = logging.getLogger(__name__)
11 | 
12 | 
class TaskInjectionSchema(BaseModel):
    # Pydantic configuration model for task injection; the consuming component is not shown here.
    raise_on_failure: bool = False  # off by default; presumably re-raises processing errors when True — confirm
    model_config = ConfigDict(extra="forbid")  # keys other than the fields above fail validation
16 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/util/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/util/flow_control/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 
5 | 
6 | from .filter_by_task import filter_by_task
7 | 
8 | __all__ = ["filter_by_task"]
9 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/util/service/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/util/service/impl/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/util/service/impl/ingest/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/util/service/meta/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/util/service/meta/ingest/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | from abc import ABC
 6 | from abc import abstractmethod
 7 | from typing import List, Optional
 8 | 
 9 | from nv_ingest.framework.schemas.framework_message_wrapper_schema import MessageWrapper
10 | from nv_ingest.framework.schemas.framework_processing_job_schema import ProcessingJob
11 | from nv_ingest_api.util.service_clients.client_base import FetchMode
12 | 
13 | 
class IngestServiceMeta(ABC):
    """Abstract interface for an ingest service.

    Declares coroutines for job submission and retrieval, the processing cache, job state, and
    fetch mode. Every method is abstract, so a concrete service must implement all of them.
    """

    @abstractmethod
    async def submit_job(self, job_spec: MessageWrapper, trace_id: str) -> str:
        """Submit one or more jobs, wrapped in ``job_spec``, to the ingestion pipeline.

        Returns a string; presumably the submitted job's identifier — confirm against implementations.
        """

    @abstractmethod
    async def fetch_job(self, job_id: str):
        """Fetch a job from the ingestion service by ``job_id``; the return type is left to implementations."""

    @abstractmethod
    async def set_processing_cache(self, job_id: str, jobs_data: List[ProcessingJob]) -> None:
        """Store the list of ``ProcessingJob`` records in the processing cache under ``job_id``."""

    @abstractmethod
    async def get_processing_cache(self, job_id: str) -> List[ProcessingJob]:
        """Return the list of ``ProcessingJob`` records from the processing cache for ``job_id``."""

    @abstractmethod
    async def set_job_state(self, job_id: str, state: str, ttl: int = 86400):
        """Set the state string for ``job_id``.

        ``ttl`` defaults to 86400; presumably a lifetime in seconds (one day) — confirm.
        """

    @abstractmethod
    async def get_job_state(self, job_id: str) -> Optional[str]:
        """Return the state string for ``job_id``, or ``None``."""

    @abstractmethod
    async def get_fetch_mode(self) -> FetchMode:
        """Return the ``FetchMode`` used by this service."""
42 | 


--------------------------------------------------------------------------------
/src/nv_ingest/framework/util/telemetry/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/src/util/image_model_validation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/src/util/image_model_validation/__init__.py


--------------------------------------------------------------------------------
/src/util/image_model_validation/deplot.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | import logging
 6 | 
 7 | import click
 8 | from util import display_image
 9 | from util import initialize_triton_client
10 | from util import load_and_preprocess_image
11 | from util import perform_inference
12 | from util import prepare_input_tensor
13 | from util import prepare_output_tensor
14 | from util import print_output_results
15 | from util import validate_output
16 | 
17 | logger = logging.getLogger(__name__)
18 | 
19 | 
@click.command()
@click.argument("image_path", type=click.Path(exists=True))
@click.option("--display", is_flag=True, help="Display the image before sending it for inference.")
def main(image_path, display):
    # Sends one image to the "deplot" model and prints the validated output.
    # Documented with comments on purpose: a docstring here would become the click --help text.

    # Fixed settings for this validation run.
    model_name = "deplot"
    url = "localhost:8004"
    target_img_size = (1024, 1024)
    batch_size = 1

    # Connect first, then load and resize the image.
    triton_client = initialize_triton_client(url)
    resized_image, input_data = load_and_preprocess_image(image_path, target_img_size)

    if display:
        display_image(resized_image)

    # Everything after the leading (batch) axis.
    input_dims = input_data.shape[1:]
    logger.info(f"Detected input dimensions: {input_dims}")

    # Build the request tensors and run inference in a single call.
    results = perform_inference(
        triton_client,
        model_name,
        prepare_input_tensor(input_data),
        prepare_output_tensor(),
    )
    output_data = results.as_numpy("output")

    validate_output(output_data, batch_size)
    print_output_results(output_data)
48 | 
49 | 
50 | if __name__ == "__main__":
51 |     main()
52 | 


--------------------------------------------------------------------------------
/src/util/image_model_validation/paddle.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | import click
 6 | import numpy as np
 7 | from util import display_image
 8 | from util import initialize_triton_client
 9 | from util import load_and_preprocess_image
10 | from util import perform_inference
11 | from util import prepare_input_tensor
12 | from util import prepare_output_tensor
13 | from util import print_output_results
14 | from util import validate_output
15 | 
16 | 
@click.command()
@click.argument("image_path", type=click.Path(exists=True))
@click.option("--display", is_flag=True, help="Display the image before sending it for inference.")
def main(image_path, display):
    # Sends one image to the "paddle" model and prints the validated output.
    # Documented with comments on purpose: a docstring here would become the click --help text.

    # Fixed settings for this validation run.
    model_name = "paddle"
    url = "localhost:8010"
    target_img_size = (1024, 1024)
    batch_size = 1

    # Load and resize the image, then prepend a batch axis to the resized copy.
    resized_image, input_data = load_and_preprocess_image(image_path, target_img_size)
    resized_images = np.expand_dims(resized_image, axis=0)

    if display:
        display_image(resized_image)

    # Everything after the leading (batch) axis.
    input_dims = input_data.shape[1:]
    print(f"Detected input dimensions: {input_dims}")

    triton_client = initialize_triton_client(url)

    # Build the request tensors and run inference in a single call.
    results = perform_inference(
        triton_client,
        model_name,
        prepare_input_tensor(resized_images),
        prepare_output_tensor(),
    )
    output_data = results.as_numpy("output")

    validate_output(output_data, batch_size)
    print_output_results(output_data)
57 | 
58 | 
59 | if __name__ == "__main__":
60 |     main()
61 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/tests/functional/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/tests/functional/test_ingest_pipeline.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | # redis config
 6 | _DEFAULT_REDIS_HOST = "redis"
 7 | _DEFAULT_REDIS_PORT = 6379
 8 | 
 9 | # job config
10 | _DEFAULT_TASK_QUEUE = "morpheus_task_queue"
11 | _DEFAULT_JOB_TIMEOUT = 90
12 | 
13 | # extract_config
14 | _DEFAULT_EXTRACT_PAGE_DEPTH = "document"
15 | _DEFAULT_EXTRACT_TABLES_METHOD = "yolox"
16 | 
17 | # split config
18 | _DEFAULT_SPLIT_BY = "word"
19 | _DEFAULT_SPLIT_LENGTH = 300
20 | _DEFAULT_SPLIT_OVERLAP = 10
21 | _DEFAULT_SPLIT_MAX_CHARACTER_LENGTH = 5000
22 | _DEFAULT_SPLIT_SENTENCE_WINDOW_SIZE = 0
23 | 
24 | # file config
25 | _VALIDATION_PDF = "data/functional_validation.pdf"
26 | _VALIDATION_JSON = "data/functional_validation.json"
27 | 
28 | 
def remove_keys(data, keys_to_remove):
    """
    Return a copy of ``data`` with every dict entry whose key is in ``keys_to_remove`` dropped.

    Dicts and lists are rebuilt recursively at any nesting depth; any other value is returned
    unchanged.
    """
    if isinstance(data, list):
        return [remove_keys(element, keys_to_remove) for element in data]
    if not isinstance(data, dict):
        return data

    pruned = {}
    for key, value in data.items():
        if key in keys_to_remove:
            continue
        pruned[key] = remove_keys(value, keys_to_remove)
    return pruned
36 | 


--------------------------------------------------------------------------------
/tests/import_checks.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | 
 6 | def check_morpheus_import():
 7 |     try:
 8 |         import morpheus
 9 | 
10 |         _ = morpheus._version
11 | 
12 |         return True
13 |     except Exception as e:
14 |         print(f"\nError: {e}\n", flush=True)
15 |         return False
16 | 
17 | 
def check_cuda_driver():
    """
    Report whether `cupy` and `cudf` import and respond to a minimal call each.

    Queries the driver version through cupy and builds a three-row cudf DataFrame. Returns True
    when both succeed. Any exception is printed to stdout and turns the result into False.
    """
    try:
        import cupy

        import cudf

        # Results are discarded; only the absence of an exception matters.
        cupy.cuda.runtime.driverGetVersion()
        cudf.DataFrame({"a": [1, 2, 3]})
    except Exception as e:
        print(f"\nError: {e}\n", flush=True)
        return False
    return True
30 | 
31 | 
def check_adobe_import():
    """
    Report whether the Adobe SDK package can be located for import.

    The previous body was `try: pass`, which always returned True and left the
    `except ImportError` branch unreachable, so ADOBE_IMPORT_OK could never be False.

    Returns
    -------
    bool
        True when the `adobe.pdfservices` package is found, False otherwise.
    """
    # Function-scope import keeps this fix self-contained.
    import importlib.util

    try:
        # NOTE(review): assumes the SDK's import path is `adobe.pdfservices` — confirm against
        # the code that actually imports the Adobe client.
        return importlib.util.find_spec("adobe.pdfservices") is not None
    except (ImportError, ValueError):
        # find_spec imports the parent package; a missing `adobe` raises ModuleNotFoundError.
        return False
39 | 
40 | 
41 | ADOBE_IMPORT_OK = check_adobe_import()
42 | CUDA_DRIVER_OK = check_cuda_driver()
43 | MORPHEUS_IMPORT_OK = check_morpheus_import()
44 | 


--------------------------------------------------------------------------------
/tests/integration/test_examples.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | import sys
 3 | 
 4 | import pytest
 5 | 
 6 | 
 7 | @pytest.mark.integration
 8 | def test_launch_libmode_and_run_ingestor():
 9 |     process = subprocess.run(
10 |         [sys.executable, "./examples/launch_libmode_and_run_ingestor.py"], capture_output=True, text=True
11 |     )
12 | 
13 |     try:
14 |         assert process.returncode == 0
15 |         # pdfium text
16 |         assert "A sample document with headings and placeholder text" in process.stdout
17 |     except:
18 |         print(process.stdout)
19 |         print(process.stderr)
20 |         raise
21 | 


--------------------------------------------------------------------------------
/tests/integration/test_extract_audio.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | import sys
 3 | import time
 4 | 
 5 | import pytest
 6 | from nv_ingest_api.util.message_brokers.simple_message_broker import SimpleClient
 7 | from nv_ingest_client.client import Ingestor
 8 | from nv_ingest_client.client import NvIngestClient
 9 | 
10 | 
@pytest.mark.integration
def test_audio_extract_only(
    pipeline_process,
):
    """Ingest a WAV file with extraction only and compare the transcript.

    A client is created against ``localhost:7671`` using ``SimpleClient`` as the
    message-client allocator, ``./data/multimodal_test.wav`` is submitted with
    an ``extract()`` task, and the single result's audio transcript must equal
    a fixed reference string.

    Args:
        pipeline_process: Fixture requested but not used in the body;
            presumably it keeps the pipeline running for the test -- defined
            outside this file, confirm there.
    """
    reference_transcript = " ".join(
        [
            "Section one, this is the first section of the document.",
            "It has some more placeholder text to show how the document looks like.",
            "The text is not meant to be meaningful or informative,",
            "but rather to demonstrate the layout and formatting of the document.",
        ]
    )

    broker_client = NvIngestClient(
        message_client_allocator=SimpleClient,
        message_client_port=7671,
        message_client_hostname="localhost",
    )

    job = Ingestor(client=broker_client)
    job = job.files("./data/multimodal_test.wav")
    job = job.extract()

    outputs = job.ingest()
    assert len(outputs) == 1

    first_element = outputs[0][0]
    actual_transcript = first_element["metadata"]["audio_metadata"]["audio_transcript"]
    assert actual_transcript == reference_transcript
34 | 


--------------------------------------------------------------------------------
/tests/service_tests/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/tests/service_tests/modules/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/tests/service_tests/modules/injectors/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/tests/service_tests/modules/injectors/test_task_injector.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | from unittest.mock import MagicMock
 6 | 
 7 | import pytest
 8 | 
 9 | try:
10 |     from nv_ingest.framework.orchestration.morpheus.modules.injectors.task_injection import on_data
11 | 
12 |     morpheus_import = True
13 | except:
14 |     morpheus_import = False
15 | 
16 | 
@pytest.fixture
def mock_message():
    """Provide a fresh ``MagicMock`` used in place of an IngestControlMessage."""
    message = MagicMock()
    return message
21 | 
22 | 
@pytest.mark.skipif(not morpheus_import, reason="Morpheus modules are not available")
def test_on_data_returns_message(mock_message):
    """``on_data`` must hand back the very object it was given (identity check)."""
    returned = on_data(mock_message)
    same_object = returned is mock_message
    assert same_object, "on_data should return the input IngestControlMessage object."
28 | 
29 | 
@pytest.mark.skipif(not morpheus_import, reason="Morpheus modules are not available")
def test_on_data_calls_get_metadata_with_correct_arguments(mock_message):
    """``on_data`` must call ``get_metadata("task_meta")`` exactly once on the message."""
    on_data(mock_message)

    get_metadata_spy = mock_message.get_metadata
    get_metadata_spy.assert_called_once_with("task_meta")
35 | 


--------------------------------------------------------------------------------
/tests/service_tests/modules/sinks/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/tests/service_tests/modules/sources/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/tests/service_tests/modules/storages/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/tests/service_tests/modules/storages/test_image_storage.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | import pandas as pd
 6 | import pytest
 7 | from minio import Minio
 8 | 
 9 | from nv_ingest_api.internal.enums.common import ContentTypeEnum
10 | from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage
11 | 
12 | 
class MockMinioClient:
    """Inert stand-in for a Minio client.

    Every method accepts arbitrary positional and keyword arguments and ignores
    them. ``make_bucket`` and ``put_object`` return ``None``; ``bucket_exists``
    always returns ``True``.
    """

    def __init__(self, *args, **kwargs):
        # Arguments are accepted and discarded; no state is stored.
        return None

    def make_bucket(self, *args, **kwargs):
        """Accept any arguments and do nothing."""
        return None

    def put_object(self, *args, **kwargs):
        """Accept any arguments and do nothing."""
        return None

    def bucket_exists(self, *args, **kwargs):
        """Report every bucket as existing."""
        return True
25 | 
26 | 
@pytest.fixture
def mock_minio(mocker):
    """Patch ``Minio.__new__`` so constructing ``Minio`` yields a ``MockMinioClient``.

    Args:
        mocker: Fixture exposing ``patch.object`` (presumably pytest-mock's
            ``mocker`` -- not defined in this file).

    Yields:
        Whatever ``mocker.patch.object`` returns for the replaced attribute.
    """

    def _build_fake_client(cls, *args, **kwargs):
        # ``cls`` is supplied because this function replaces ``__new__``; it is ignored.
        return MockMinioClient(*args, **kwargs)

    yield mocker.patch.object(Minio, "__new__", new=_build_fake_client)
38 | 


--------------------------------------------------------------------------------
/tests/service_tests/modules/telemetry/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/tests/service_tests/schemas/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/tests/service_tests/schemas/test_image_dedup_schema.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | 
 6 | import pytest
 7 | from pydantic import ValidationError
 8 | 
 9 | from nv_ingest_api.internal.schemas.mutate.mutate_image_dedup_schema import ImageDedupSchema
10 | 
11 | 
def valid_module_config():
    """Build a fresh config dict with ``raise_on_failure`` set to ``True``."""
    return dict(raise_on_failure=True)
17 | 
18 | 
def test_task_type_str_bool():
    """``ImageDedupSchema`` must accept a real ``bool`` for ``raise_on_failure``."""
    config = valid_module_config()
    as_bool = bool(config["raise_on_failure"])
    config.update(raise_on_failure=as_bool)

    # Construction succeeding without an exception is the whole assertion.
    ImageDedupSchema(**config)
23 | 
24 | 
@pytest.mark.parametrize("dtype", [int, float, str])
def test_task_type_str_bool_sensitivity(dtype):
    """``ImageDedupSchema`` must reject ``raise_on_failure`` cast to int, float or str.

    Args:
        dtype: Callable applied to the valid boolean value to produce the
            non-bool input under test.
    """
    config = valid_module_config()
    coerced = dtype(config["raise_on_failure"])
    config.update(raise_on_failure=coerced)

    with pytest.raises(ValidationError):
        ImageDedupSchema(**config)
32 | 


--------------------------------------------------------------------------------
/tests/service_tests/schemas/test_image_filter_schema.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | 
 6 | import pytest
 7 | from pydantic import ValidationError
 8 | 
 9 | from nv_ingest_api.internal.schemas.transform.transform_image_filter_schema import ImageFilterSchema
10 | 
11 | 
def valid_module_config():
    """Build a fresh config dict with ``raise_on_failure`` and ``cpu_only`` both ``True``."""
    config = {}
    config["raise_on_failure"] = True
    config["cpu_only"] = True
    return config
18 | 
19 | 
def test_task_type_str_bool():
    """``ImageFilterSchema`` must accept real ``bool`` values for both flags."""
    config = valid_module_config()
    for flag in ("raise_on_failure", "cpu_only"):
        config[flag] = bool(config[flag])

    # Construction succeeding without an exception is the whole assertion.
    ImageFilterSchema(**config)
25 | 
26 | 
@pytest.mark.parametrize("dtype", [int, float, str])
def test_task_type_str_bool_sensitivity(dtype):
    """``ImageFilterSchema`` must reject both flags when cast to int, float or str.

    Args:
        dtype: Callable applied to each valid boolean value to produce the
            non-bool inputs under test.
    """
    config = valid_module_config()
    for flag in ("raise_on_failure", "cpu_only"):
        config[flag] = dtype(config[flag])

    with pytest.raises(ValidationError):
        ImageFilterSchema(**config)
35 | 


--------------------------------------------------------------------------------
/tests/service_tests/schemas/test_injection_schema.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | import pytest
 6 | from pydantic import ValidationError
 7 | 
 8 | from nv_ingest.framework.schemas.framework_task_injection_schema import TaskInjectionSchema
 9 | 
10 | 
def test_task_injection_schema_default():
    """
    A ``TaskInjectionSchema`` built with no arguments has ``raise_on_failure`` False.
    """
    default_schema = TaskInjectionSchema()
    flag = default_schema.raise_on_failure
    assert flag is False, "Default value for raise_on_failure should be False."
17 | 
18 | 
def test_task_injection_schema_explicit_value():
    """
    Passing ``raise_on_failure=True`` to ``TaskInjectionSchema`` is preserved on the instance.
    """
    explicit_schema = TaskInjectionSchema(raise_on_failure=True)
    flag = explicit_schema.raise_on_failure
    assert flag is True, "raise_on_failure should respect the explicitly provided value."
25 | 
26 | 
def test_task_injection_schema_forbids_extra():
    """
    An unknown keyword passed to ``TaskInjectionSchema`` must raise ``ValidationError``
    whose text contains "Extra inputs are not permitted".
    """
    payload = {"raise_on_failure": False, "unexpected_field": "value"}

    with pytest.raises(ValidationError) as excinfo:
        TaskInjectionSchema(**payload)

    error_text = str(excinfo.value)
    assert "Extra inputs are not permitted" in error_text, "Schema should not allow extra fields."
34 | 


--------------------------------------------------------------------------------
/tests/service_tests/schemas/test_job_counter_schema.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | from nv_ingest.framework.schemas.framework_job_counter_schema import JobCounterSchema
 6 | 
 7 | 
 8 | def test_job_counter_schema_defaults():
 9 |     schema = JobCounterSchema()
10 |     assert schema.name == "job_counter", "Default value for name should be 'job_counter'."
11 |     assert schema.raise_on_failure is False, "Default value for raise_on_failure should be False."
12 | 
13 | 
def test_job_counter_schema_custom_values():
    """Explicit ``name`` and ``raise_on_failure`` values are kept by ``JobCounterSchema``."""
    overrides = {"name": "foo", "raise_on_failure": True}
    counter = JobCounterSchema(**overrides)

    assert counter.name == "foo", "Custom value for name should be respected."
    assert counter.raise_on_failure is True, "Custom value for raise_on_failure should be respected."
19 | 


--------------------------------------------------------------------------------
/tests/service_tests/schemas/test_metadata_injector_schema.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | import pytest
 6 | from pydantic import ValidationError
 7 | 
 8 | from nv_ingest.framework.schemas.framework_metadata_injector_schema import MetadataInjectorSchema
 9 | 
10 | 
def test_metadata_injector_schema_default():
    """
    A ``MetadataInjectorSchema`` built with no arguments has ``raise_on_failure`` False.
    """
    default_schema = MetadataInjectorSchema()
    flag = default_schema.raise_on_failure
    assert flag is False, "Default value for raise_on_failure should be False."
17 | 
18 | 
def test_metadata_injector_schema_explicit_value():
    """
    Passing ``raise_on_failure=True`` to ``MetadataInjectorSchema`` is preserved on the instance.
    """
    explicit_schema = MetadataInjectorSchema(raise_on_failure=True)
    flag = explicit_schema.raise_on_failure
    assert flag is True, "raise_on_failure should respect the explicitly provided value."
25 | 
26 | 
def test_metadata_injector_schema_forbids_extra():
    """
    An unknown keyword passed to ``MetadataInjectorSchema`` must raise ``ValidationError``
    whose text contains "Extra inputs are not permitted".
    """
    payload = {"raise_on_failure": False, "unexpected_field": "value"}

    with pytest.raises(ValidationError) as excinfo:
        MetadataInjectorSchema(**payload)

    error_text = str(excinfo.value)
    assert "Extra inputs are not permitted" in error_text, "Schema should not allow extra fields."
34 | 
35 | 
@pytest.mark.parametrize("input_value", [True, False])
def test_metadata_injector_schema_raise_on_failure_parametrized(input_value):
    """
    ``MetadataInjectorSchema`` stores whichever boolean is passed as ``raise_on_failure``.

    Args:
        input_value: The boolean supplied to the schema and expected back.
    """
    built = MetadataInjectorSchema(raise_on_failure=input_value)
    stored = built.raise_on_failure
    assert stored is input_value, f"raise_on_failure should be {input_value}."
43 | 


--------------------------------------------------------------------------------
/tests/service_tests/schemas/test_otel_meter_schema.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | from nv_ingest.framework.schemas.framework_otel_meter_schema import OpenTelemetryMeterSchema
 6 | from nv_ingest_api.internal.schemas.message_brokers.message_broker_client_schema import MessageBrokerClientSchema
 7 | 
 8 | 
 9 | def test_otel_meter_schema_defaults():
10 |     schema = OpenTelemetryMeterSchema()
11 |     assert isinstance(
12 |         schema.broker_client, MessageBrokerClientSchema
13 |     ), "broker_client should be an instance of MessageBrokerClientSchema."
14 |     assert schema.raise_on_failure is False, "Default value for raise_on_failure should be False."
15 | 
16 | 
def test_otel_meter_schema_custom_values():
    """Custom broker settings and ``raise_on_failure`` are kept by ``OpenTelemetryMeterSchema``.

    A ``MessageBrokerClientSchema`` with a custom host, port and
    ``broker_params`` is passed as ``broker_client``; each value must be
    readable back from the resulting schema.
    """
    custom_broker_client = MessageBrokerClientSchema(host="custom_host", port=12345, broker_params={"use_ssl": True})
    schema = OpenTelemetryMeterSchema(broker_client=custom_broker_client, raise_on_failure=True)

    # Failure messages name the field actually under test (broker_client).
    assert schema.broker_client.host == "custom_host", "Custom host value for broker_client should be respected."
    assert schema.broker_client.port == 12345, "Custom port value for broker_client should be respected."
    assert (
        schema.broker_client.broker_params["use_ssl"] is True
    ), "Custom use_ssl value for broker_client should be True."
    assert schema.raise_on_failure is True, "Custom value for raise_on_failure should be respected."
27 | 


--------------------------------------------------------------------------------
/tests/service_tests/schemas/test_otel_tracer_schema.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
 2 | # All rights reserved.
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | 
 5 | from nv_ingest.framework.schemas.framework_otel_tracer_schema import OpenTelemetryTracerSchema
 6 | 
 7 | 
 8 | def test_otel_tracer_schema_defaults():
 9 |     schema = OpenTelemetryTracerSchema()
10 |     assert schema.raise_on_failure is False, "Default value for raise_on_failure should be False."
11 | 
12 | 
def test_otel_tracer_schema_custom_values():
    """Passing ``raise_on_failure=True`` to ``OpenTelemetryTracerSchema`` is preserved."""
    tracer = OpenTelemetryTracerSchema(raise_on_failure=True)
    flag = tracer.raise_on_failure
    assert flag is True, "Custom value for raise_on_failure should be respected."
16 | 


--------------------------------------------------------------------------------
/tests/service_tests/stages/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/tests/service_tests/util/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/tests/service_tests/util/flow_control/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/tests/service_tests/util/modules/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/tests/service_tests/util/telemetry/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------