├── .devcontainer ├── Dockerfile ├── README.md └── devcontainer.json ├── .github ├── CODEOWNERS ├── ISSUE_TEMPLATE │ ├── bug_report_form.yml │ ├── config.yml │ ├── documentation_request_correction.yml │ ├── documentation_request_new.yml │ └── feature_request_form.yml ├── PULL_REQUEST_TEMPLATE.md ├── copy-pr-bot.yaml └── workflows │ ├── build-docs.yml │ ├── conda-publish.yml │ ├── docker-build.yml │ ├── docker-nightly-publish.yml │ ├── docker-release-publish.yml │ ├── pre-commit.yml │ ├── pypi-nightly-publish.yml │ └── test-library-mode.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── CITATION.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── README.md ├── SECURITY.md ├── api ├── LICENSE ├── MANIFEST.in ├── README.md ├── api_tests │ ├── __init__.py │ ├── import_checks.py │ ├── interface │ │ ├── __init__.py │ │ ├── test_extract.py │ │ ├── test_interface.py │ │ ├── test_mutate.py │ │ ├── test_transform.py │ │ └── test_utility.py │ ├── internal │ │ ├── __init__.py │ │ ├── extract │ │ │ ├── __init__.py │ │ │ ├── audio │ │ │ │ ├── __init__.py │ │ │ │ └── test_audio_extraction.py │ │ │ ├── docx │ │ │ │ ├── __init__.py │ │ │ │ └── test_docx_extractor.py │ │ │ └── image │ │ │ │ ├── __init__.py │ │ │ │ ├── test_chart_extractor.py │ │ │ │ ├── test_image_extractor.py │ │ │ │ ├── test_infographic_extractor.py │ │ │ │ └── test_table_extractor.py │ │ ├── mutate │ │ │ ├── __init__.py │ │ │ ├── test_deduplicate_images.py │ │ │ └── test_filter_images.py │ │ └── test_enums.py │ ├── primitives │ │ ├── __init__.py │ │ ├── nim │ │ │ ├── __init__.py │ │ │ └── model_interface │ │ │ │ ├── __init__.py │ │ │ │ ├── test_decorators.py │ │ │ │ ├── test_helpers.py │ │ │ │ ├── test_nemoretriever_parse.py │ │ │ │ ├── test_paddle.py │ │ │ │ ├── test_parakeet.py │ │ │ │ ├── test_text_embedding.py │ │ │ │ ├── test_vlm.py │ │ │ │ ├── test_yolox_interface_base.py │ │ │ │ ├── test_yolox_interface_graphic_elements.py │ │ │ │ ├── 
test_yolox_interface_page_elements.py │ │ │ │ ├── test_yolox_interface_table_structure.py │ │ │ │ └── test_yolox_utilities.py │ │ ├── test_ingest_control_message.py │ │ ├── test_ingest_control_message_task.py │ │ └── tracing │ │ │ ├── __init__.py │ │ │ ├── test_latency.py │ │ │ └── test_tagging.py │ ├── smoke_test.sh │ ├── util │ │ ├── __init__.py │ │ ├── converters │ │ │ ├── __init__.py │ │ │ ├── multimodal_test_raw_results.json │ │ │ ├── test_bytetools.py │ │ │ ├── test_containers.py │ │ │ ├── test_datetools.py │ │ │ ├── test_formats.py │ │ │ └── test_type_mappings.py │ │ ├── detectors │ │ │ ├── __init__.py │ │ │ └── test_language.py │ │ ├── exception_handlers │ │ │ ├── __init__.py │ │ │ ├── test_converters.py │ │ │ ├── test_decorators.py │ │ │ ├── test_detectors.py │ │ │ ├── test_pdf.py │ │ │ └── test_schemas.py │ │ ├── image_processing │ │ │ ├── __init__.py │ │ │ ├── test_clustering.py │ │ │ └── test_transforms.py │ │ ├── logging │ │ │ ├── __init__.py │ │ │ └── test_configuration.py │ │ ├── message_brokers │ │ │ ├── __init__.py │ │ │ ├── redis │ │ │ │ ├── __init__.py │ │ │ │ └── test_redis_client.py │ │ │ └── simple_message_broker │ │ │ │ ├── __init__.py │ │ │ │ ├── test_ordered_message_queue.py │ │ │ │ ├── test_simple_client.py │ │ │ │ └── test_simple_message_broker.py │ │ ├── metadata │ │ │ ├── __init__.py │ │ │ └── test_metadata_aggregators.py │ │ └── schema │ │ │ ├── __init__.py │ │ │ └── test_schema_validator.py │ └── utilities_for_test.py ├── pyproject.toml └── src │ ├── nv_ingest_api │ ├── __init__.py │ ├── interface │ │ ├── __init__.py │ │ ├── extract.py │ │ ├── mutate.py │ │ ├── store.py │ │ ├── transform.py │ │ └── utility.py │ ├── internal │ │ ├── __init__.py │ │ ├── enums │ │ │ ├── __init__.py │ │ │ └── common.py │ │ ├── extract │ │ │ ├── __init__.py │ │ │ ├── audio │ │ │ │ ├── __init__.py │ │ │ │ └── audio_extraction.py │ │ │ ├── docx │ │ │ │ ├── __init__.py │ │ │ │ ├── docx_extractor.py │ │ │ │ └── engines │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── 
docxreader_helpers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── docx_helper.py │ │ │ │ │ └── docxreader.py │ │ │ ├── image │ │ │ │ ├── __init__.py │ │ │ │ ├── chart_extractor.py │ │ │ │ ├── image_extractor.py │ │ │ │ ├── image_helpers │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── common.py │ │ │ │ ├── infographic_extractor.py │ │ │ │ └── table_extractor.py │ │ │ ├── pdf │ │ │ │ ├── __init__.py │ │ │ │ ├── engines │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── adobe.py │ │ │ │ │ ├── llama.py │ │ │ │ │ ├── nemoretriever.py │ │ │ │ │ ├── pdf_helpers │ │ │ │ │ │ └── __init__.py │ │ │ │ │ ├── pdfium.py │ │ │ │ │ ├── tika.py │ │ │ │ │ └── unstructured_io.py │ │ │ │ └── pdf_extractor.py │ │ │ └── pptx │ │ │ │ ├── __init__.py │ │ │ │ ├── engines │ │ │ │ ├── __init__.py │ │ │ │ └── pptx_helper.py │ │ │ │ └── pptx_extractor.py │ │ ├── mutate │ │ │ ├── __init__.py │ │ │ ├── deduplicate.py │ │ │ └── filter.py │ │ ├── primitives │ │ │ ├── __init__.py │ │ │ ├── control_message_task.py │ │ │ ├── ingest_control_message.py │ │ │ ├── nim │ │ │ │ ├── __init__.py │ │ │ │ ├── default_values.py │ │ │ │ ├── model_interface │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── cached.py │ │ │ │ │ ├── decorators.py │ │ │ │ │ ├── deplot.py │ │ │ │ │ ├── helpers.py │ │ │ │ │ ├── nemoretriever_parse.py │ │ │ │ │ ├── paddle.py │ │ │ │ │ ├── parakeet.py │ │ │ │ │ ├── text_embedding.py │ │ │ │ │ ├── vlm.py │ │ │ │ │ └── yolox.py │ │ │ │ ├── nim_client.py │ │ │ │ └── nim_model_interface.py │ │ │ └── tracing │ │ │ │ ├── __init__.py │ │ │ │ ├── latency.py │ │ │ │ ├── logging.py │ │ │ │ └── tagging.py │ │ ├── schemas │ │ │ ├── __init__.py │ │ │ ├── extract │ │ │ │ ├── __init__.py │ │ │ │ ├── extract_audio_schema.py │ │ │ │ ├── extract_chart_schema.py │ │ │ │ ├── extract_docx_schema.py │ │ │ │ ├── extract_image_schema.py │ │ │ │ ├── extract_infographic_schema.py │ │ │ │ ├── extract_pdf_schema.py │ │ │ │ ├── extract_pptx_schema.py │ │ │ │ └── extract_table_schema.py │ │ │ ├── message_brokers │ │ │ │ ├── __init__.py │ │ │ │ ├── 
message_broker_client_schema.py │ │ │ │ ├── request_schema.py │ │ │ │ └── response_schema.py │ │ │ ├── meta │ │ │ │ ├── __init__.py │ │ │ │ ├── base_model_noext.py │ │ │ │ ├── ingest_job_schema.py │ │ │ │ └── metadata_schema.py │ │ │ ├── mutate │ │ │ │ ├── __init__.py │ │ │ │ └── mutate_image_dedup_schema.py │ │ │ ├── store │ │ │ │ ├── __init__.py │ │ │ │ ├── store_embedding_schema.py │ │ │ │ └── store_image_schema.py │ │ │ └── transform │ │ │ │ ├── __init__.py │ │ │ │ ├── transform_image_caption_schema.py │ │ │ │ ├── transform_image_filter_schema.py │ │ │ │ ├── transform_text_embedding_schema.py │ │ │ │ └── transform_text_splitter_schema.py │ │ ├── store │ │ │ ├── __init__.py │ │ │ ├── embed_text_upload.py │ │ │ └── image_upload.py │ │ └── transform │ │ │ ├── __init__.py │ │ │ ├── caption_image.py │ │ │ ├── embed_text.py │ │ │ └── split_text.py │ └── util │ │ ├── __init__.py │ │ ├── control_message │ │ ├── __init__.py │ │ └── validators.py │ │ ├── converters │ │ ├── __init__.py │ │ ├── bytetools.py │ │ ├── containers.py │ │ ├── datetools.py │ │ ├── dftools.py │ │ ├── formats.py │ │ └── type_mappings.py │ │ ├── detectors │ │ ├── __init__.py │ │ └── language.py │ │ ├── exception_handlers │ │ ├── __init__.py │ │ ├── converters.py │ │ ├── decorators.py │ │ ├── detectors.py │ │ ├── pdf.py │ │ └── schemas.py │ │ ├── image_processing │ │ ├── __init__.py │ │ ├── clustering.py │ │ ├── processing.py │ │ ├── table_and_chart.py │ │ └── transforms.py │ │ ├── logging │ │ ├── __init__.py │ │ └── configuration.py │ │ ├── message_brokers │ │ ├── __init__.py │ │ └── simple_message_broker │ │ │ ├── __init__.py │ │ │ ├── broker.py │ │ │ ├── ordered_message_queue.py │ │ │ └── simple_client.py │ │ ├── metadata │ │ ├── __init__.py │ │ └── aggregators.py │ │ ├── multi_processing │ │ ├── __init__.py │ │ └── mp_pool_singleton.py │ │ ├── nim │ │ └── __init__.py │ │ ├── pdf │ │ ├── __init__.py │ │ └── pdfium.py │ │ ├── schema │ │ ├── __init__.py │ │ └── schema_validator.py │ │ ├── 
service_clients │ │ ├── __init__.py │ │ ├── client_base.py │ │ ├── kafka │ │ │ └── __init__.py │ │ ├── redis │ │ │ ├── __init__.py │ │ │ └── redis_client.py │ │ └── rest │ │ │ ├── __init__.py │ │ │ └── rest_client.py │ │ └── string_processing │ │ └── __init__.py │ └── version.py ├── ci ├── data │ ├── pdf_20_chart_bbox.csv │ ├── pdf_20_chart_text_output.csv │ ├── pdf_20_table_bbox.csv │ └── pdf_20_table_paddleOCR.csv └── scripts │ ├── bo20_validate.py │ └── build_pip_packages.sh ├── client ├── LICENSE ├── MANIFEST.in ├── README.md ├── client_examples │ ├── README.md │ ├── docker │ │ ├── Dockerfile.client │ │ ├── entrypoint.sh │ │ └── start-jupyter.sh │ └── examples │ │ ├── cli_client_usage.ipynb │ │ └── python_client_usage.ipynb ├── client_tests │ ├── __init__.py │ ├── cli │ │ ├── __init__.py │ │ ├── test_nv_ingest_cli.py │ │ └── util │ │ │ ├── __init__.py │ │ │ ├── test_click.py │ │ │ ├── test_processing.py │ │ │ └── test_system.py │ ├── client │ │ ├── __init__.py │ │ ├── test_client.py │ │ ├── test_interface.py │ │ └── test_rest_client.py │ ├── primitives │ │ ├── __init__.py │ │ ├── jobs │ │ │ ├── __init__.py │ │ │ ├── test_job_spec.py │ │ │ └── test_job_state.py │ │ └── tasks │ │ │ ├── __init__.py │ │ │ ├── test_audio_extraction.py │ │ │ ├── test_caption.py │ │ │ ├── test_dedup.py │ │ │ ├── test_embed.py │ │ │ ├── test_extract.py │ │ │ ├── test_filter.py │ │ │ ├── test_split.py │ │ │ ├── test_store.py │ │ │ ├── test_store_embed.py │ │ │ ├── test_task_base.py │ │ │ └── test_task_factory.py │ └── util │ │ ├── file_processing │ │ ├── __init__.py │ │ └── test_extract.py │ │ ├── test_dataset.py │ │ ├── test_milvus_util.py │ │ └── test_util.py ├── pyproject.toml └── src │ ├── nv_ingest_client │ ├── __init__.py │ ├── cli │ │ ├── __init__.py │ │ └── util │ │ │ ├── __init__.py │ │ │ ├── click.py │ │ │ ├── processing.py │ │ │ ├── system.py │ │ │ └── tasks.py │ ├── client │ │ ├── __init__.py │ │ ├── client.py │ │ └── interface.py │ ├── nv_ingest_cli.py │ ├── primitives │ │ 
├── __init__.py │ │ ├── exceptions.py │ │ ├── jobs │ │ │ ├── __init__.py │ │ │ ├── job_spec.py │ │ │ └── job_state.py │ │ └── tasks │ │ │ ├── __init__.py │ │ │ ├── audio_extraction.py │ │ │ ├── caption.py │ │ │ ├── chart_extraction.py │ │ │ ├── dedup.py │ │ │ ├── embed.py │ │ │ ├── extract.py │ │ │ ├── filter.py │ │ │ ├── infographic_extraction.py │ │ │ ├── split.py │ │ │ ├── store.py │ │ │ ├── table_extraction.py │ │ │ ├── task_base.py │ │ │ ├── task_factory.py │ │ │ ├── transform.py │ │ │ └── vdb_upload.py │ └── util │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── file_processing │ │ ├── __init__.py │ │ └── extract.py │ │ ├── milvus.py │ │ ├── process_json_files.py │ │ ├── processing.py │ │ ├── util.py │ │ └── zipkin.py │ └── version.py ├── conda ├── build_conda_packages.sh ├── environments │ ├── nv_ingest_api_environment.yml │ ├── nv_ingest_client_environment.yml │ └── nv_ingest_environment.yml ├── packages │ ├── nv_ingest │ │ └── meta.yaml │ ├── nv_ingest_api │ │ └── meta.yaml │ └── nv_ingest_client │ │ └── meta.yaml └── scripts │ └── helper_functions.sh ├── config ├── otel-collector-config.yaml └── prometheus.yaml ├── data ├── chart.png ├── charts_with_page_num_fixed.csv ├── embedded_table.pdf ├── functional_validation.json ├── functional_validation.pdf ├── multimodal_test.bmp ├── multimodal_test.docx ├── multimodal_test.jpeg ├── multimodal_test.json ├── multimodal_test.pdf ├── multimodal_test.png ├── multimodal_test.pptx ├── multimodal_test.svg ├── multimodal_test.tiff ├── multimodal_test.wav ├── table.png ├── table_queries_cleaned_235.csv ├── table_test.pdf ├── test-page-form.pdf ├── test-shapes.pdf ├── test.pdf ├── text_query_answer_gt_page.csv ├── woods_frost.docx └── woods_frost.pdf ├── deploy └── pdf-blueprint.ipynb ├── docker-compose.yaml ├── docker └── scripts │ ├── entrypoint.sh │ ├── entrypoint_devcontainer.sh │ ├── entrypoint_source_ext.sh │ └── post_build_triggers.py ├── docs ├── Dockerfile ├── Makefile ├── docs │ ├── assets │ │ └── css │ │ │ ├── 
color-schemes.css │ │ │ ├── custom-material.css │ │ │ ├── fonts.css │ │ │ └── jupyter-themes.css │ ├── extraction │ │ ├── audio.md │ │ ├── content-metadata.md │ │ ├── contributing.md │ │ ├── data-store.md │ │ ├── environment-config.md │ │ ├── example_processed_docs │ │ │ └── text │ │ │ │ └── multimodal_test.pdf.metadata.json │ │ ├── faq.md │ │ ├── helm.md │ │ ├── images │ │ │ ├── audio.png │ │ │ ├── generate_personal_key.png │ │ │ ├── image_viewer_example.png │ │ │ ├── overview-extraction.png │ │ │ ├── overview-retriever.png │ │ │ ├── preview-image.png │ │ │ ├── prometheus.png │ │ │ ├── test.pdf.png │ │ │ └── zipkin.png │ │ ├── nemoretriever-parse.md │ │ ├── ngc-api-key.md │ │ ├── notebooks.md │ │ ├── nv-ingest-python-api.md │ │ ├── nv-ingest_cli.md │ │ ├── overview.md │ │ ├── prerequisites.md │ │ ├── quickstart-guide.md │ │ ├── quickstart-library-mode.md │ │ ├── releasenotes-nv-ingest.md │ │ ├── support-matrix.md │ │ └── telemetry.md │ └── overview.md ├── mkdocs.yml ├── overrides │ ├── .icons │ │ └── nvidia │ │ │ └── nvidia-logo.svg │ └── main.html ├── requirements.txt ├── scripts │ └── generate_openapi_docs.py └── sphinx_docs │ └── source │ ├── conf.py │ ├── index.rst │ └── openapi.rst ├── evaluation ├── bo767_recall.ipynb └── digital_corpora_download.ipynb ├── examples ├── langchain_multimodal_rag.ipynb ├── launch_libmode_and_run_ingestor.py ├── launch_libmode_service.py ├── llama_index_multimodal_rag.ipynb ├── metadata_and_filtered_search.ipynb └── reindex_example.ipynb ├── helm ├── .helmignore ├── CHANGELOG.md ├── Chart.lock ├── Chart.yaml ├── LICENSE ├── README.md ├── README.md.gotmpl ├── mig │ └── nv-ingest-mig-config.yaml ├── templates │ ├── NOTES.txt │ ├── _helpers.tpl │ ├── configmap.yaml │ ├── deployment.yaml │ ├── hpa.yaml │ ├── ingress.yaml │ ├── secrets.yaml │ ├── service.yaml │ └── serviceaccount.yaml ├── time-slicing │ └── time-slicing-config.yaml ├── update_helm_readme.sh └── values.yaml ├── print_env.sh ├── pyproject.toml ├── pytest.ini ├── 
setup.cfg ├── setup.py ├── skaffold ├── README.md ├── nv-ingest.skaffold.yaml └── sensitive │ └── .gitignore ├── src ├── ingest_pipeline_config.json ├── microservice_entrypoint.py ├── nv_ingest │ ├── __init__.py │ ├── api │ │ ├── __init__.py │ │ ├── main.py │ │ └── v1 │ │ │ ├── __init__.py │ │ │ ├── health.py │ │ │ └── ingest.py │ └── framework │ │ ├── __init__.py │ │ ├── orchestration │ │ ├── __init__.py │ │ └── morpheus │ │ │ ├── __init__.py │ │ │ ├── modules │ │ │ ├── __init__.py │ │ │ ├── injectors │ │ │ │ ├── __init__.py │ │ │ │ ├── metadata_injector.py │ │ │ │ └── task_injection.py │ │ │ ├── sinks │ │ │ │ ├── __init__.py │ │ │ │ ├── message_broker_task_sink.py │ │ │ │ └── vdb_task_sink.py │ │ │ ├── sources │ │ │ │ ├── __init__.py │ │ │ │ └── message_broker_task_source.py │ │ │ ├── storages │ │ │ │ ├── __init__.py │ │ │ │ └── image_storage.py │ │ │ ├── telemetry │ │ │ │ ├── __init__.py │ │ │ │ ├── job_counter.py │ │ │ │ ├── otel_meter.py │ │ │ │ └── otel_tracer.py │ │ │ └── transforms │ │ │ │ ├── __init__.py │ │ │ │ └── text_splitter.py │ │ │ ├── stages │ │ │ ├── __init__.py │ │ │ ├── extractors │ │ │ │ ├── __init__.py │ │ │ │ ├── audio_extraction_stage.py │ │ │ │ ├── chart_extraction_stage.py │ │ │ │ ├── docx_extractor_stage.py │ │ │ │ ├── image_extractor_stage.py │ │ │ │ ├── infographic_extraction_stage.py │ │ │ │ ├── pdf_extractor_stage.py │ │ │ │ ├── pptx_extractor_stage.py │ │ │ │ └── table_extraction_stage.py │ │ │ ├── meta │ │ │ │ ├── __init__.py │ │ │ │ ├── linear_module_source_stage_cpu.py │ │ │ │ └── multiprocessing_stage.py │ │ │ ├── mutate │ │ │ │ ├── __init__.py │ │ │ │ ├── image_dedup.py │ │ │ │ └── image_filter.py │ │ │ ├── store │ │ │ │ ├── __init__.py │ │ │ │ ├── embedding_storage_stage.py │ │ │ │ └── image_storage_stage.py │ │ │ └── transforms │ │ │ │ ├── __init__.py │ │ │ │ ├── embed_text_stage.py │ │ │ │ └── image_caption_extraction.py │ │ │ └── util │ │ │ ├── __init__.py │ │ │ ├── modules │ │ │ ├── __init__.py │ │ │ └── config_validator.py 
│ │ │ └── pipeline │ │ │ ├── __init__.py │ │ │ ├── logging.py │ │ │ ├── pipeline_builders.py │ │ │ ├── pipeline_runners.py │ │ │ └── stage_builders.py │ │ ├── schemas │ │ ├── __init__.py │ │ ├── framework_ingest_config_schema.py │ │ ├── framework_job_counter_schema.py │ │ ├── framework_message_broker_sink_schema.py │ │ ├── framework_message_broker_source_schema.py │ │ ├── framework_message_wrapper_schema.py │ │ ├── framework_metadata_injector_schema.py │ │ ├── framework_otel_meter_schema.py │ │ ├── framework_otel_tracer_schema.py │ │ ├── framework_processing_job_schema.py │ │ ├── framework_task_injection_schema.py │ │ └── framework_vdb_task_sink_schema.py │ │ └── util │ │ ├── __init__.py │ │ ├── flow_control │ │ ├── __init__.py │ │ └── filter_by_task.py │ │ ├── service │ │ ├── __init__.py │ │ ├── impl │ │ │ ├── __init__.py │ │ │ └── ingest │ │ │ │ ├── __init__.py │ │ │ │ └── redis_ingest_service.py │ │ └── meta │ │ │ ├── __init__.py │ │ │ └── ingest │ │ │ ├── __init__.py │ │ │ └── ingest_service_meta.py │ │ └── telemetry │ │ ├── __init__.py │ │ └── global_stats.py └── util │ ├── centroid_testing.py │ ├── gen_dataset.py │ ├── image_model_validation │ ├── __init__.py │ ├── cached.py │ ├── deplot.py │ ├── paddle.py │ └── util.py │ ├── image_viewer.py │ ├── mp_pool_test.py │ ├── ray_pool_test.py │ └── trt_converters.py └── tests ├── __init__.py ├── functional ├── __init__.py └── test_ingest_pipeline.py ├── import_checks.py ├── integration ├── conftest.py ├── test_examples.py ├── test_extract_audio.py ├── test_extract_docx.py ├── test_extract_images.py ├── test_extract_pdf.py ├── test_extract_pptx.py └── utilities_for_test.py ├── service_tests ├── __init__.py ├── modules │ ├── __init__.py │ ├── injectors │ │ ├── __init__.py │ │ ├── test_metadata_injection.py │ │ └── test_task_injector.py │ ├── sinks │ │ └── __init__.py │ ├── sources │ │ ├── __init__.py │ │ └── test_message_broker_task_source.py │ ├── storages │ │ ├── __init__.py │ │ └── test_image_storage.py │ └── 
telemetry │ │ ├── __init__.py │ │ └── test_otel_tracer.py ├── schemas │ ├── __init__.py │ ├── test_audio_extractor_schema.py │ ├── test_chart_extractor_schema.py │ ├── test_image_caption_extraction_schema.py │ ├── test_image_dedup_schema.py │ ├── test_image_extrator_schema.py │ ├── test_image_filter_schema.py │ ├── test_ingest_metadata.py │ ├── test_injection_schema.py │ ├── test_job_counter_schema.py │ ├── test_metadata_injector_schema.py │ ├── test_metadata_schema.py │ ├── test_otel_meter_schema.py │ ├── test_otel_tracer_schema.py │ ├── test_redis_client_schema.py │ ├── test_redis_task_sink_schema.py │ ├── test_redis_task_source_schema.py │ ├── test_table_extractor_schema.py │ └── test_text_splitter_schema.py ├── stages │ └── __init__.py └── util │ ├── __init__.py │ ├── flow_control │ ├── __init__.py │ └── test_filter_by_task.py │ ├── modules │ ├── __init__.py │ └── test_config_validator.py │ └── telemetry │ ├── __init__.py │ └── test_global_stats.py └── utilities_for_test.py /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | # syntax=docker/dockerfile:1.3 5 | 6 | ARG BASE_IMG=nvcr.io/nvidia/cuda 7 | ARG BASE_IMG_TAG=12.4.1-base-ubuntu22.04 8 | 9 | # Use NVIDIA cuda 10 | FROM $BASE_IMG:$BASE_IMG_TAG AS base 11 | 12 | ARG RELEASE_TYPE="dev" 13 | ARG VERSION="" 14 | ARG VERSION_REV="0" 15 | 16 | # Install necessary dependencies using apt-get 17 | RUN apt-get update && apt-get install -y \ 18 | wget \ 19 | bzip2 \ 20 | ca-certificates \ 21 | curl \ 22 | libgl1-mesa-glx \ 23 | vim \ 24 | git \ 25 | && apt-get clean 26 | 27 | RUN wget -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" -O /tmp/miniforge.sh \ 28 | && bash /tmp/miniforge.sh -b -p /opt/conda \ 29 | && rm /tmp/miniforge.sh 30 | 31 | # Add conda to the PATH 32 | ENV PATH=/opt/conda/bin:$PATH 33 | 34 | # Install Mamba, a faster alternative to conda, within the base environment 35 | RUN conda install -y mamba -n base -c conda-forge 36 | 37 | COPY conda/environments/nv_ingest_environment.yml /workspace/nv_ingest_environment.yml 38 | 39 | # Create nv_ingest base environment 40 | RUN mamba env create -f /workspace/nv_ingest_environment.yml \ 41 | && conda clean --all --yes 42 | 43 | # Set default shell to bash 44 | SHELL ["/bin/bash", "-c"] 45 | 46 | # Activate the environment (make it default for subsequent commands) 47 | RUN echo "source activate nv_ingest_runtime" >> ~/.bashrc 48 | -------------------------------------------------------------------------------- /.devcontainer/README.md: -------------------------------------------------------------------------------- 1 | 17 | 18 | # NV-Ingest Devcontainer 19 | 20 | The nv-ingest devcontainer is provided as a quick-to-set-up development and exploration environment for use with [Visual Studio Code](https://code.visualstudio.com) (Code). 
The devcontainer is a lightweight container that mounts in a Conda environment with cached packages, alleviating long Conda download times on subsequent launches. It provides a simple framework for adding developer-centric [scripts](#development-scripts), and incorporates some helpful Code plugins. 21 | 22 | > [!Note] 23 | > NV-Ingest is also known as NVIDIA Ingest and NeMo Retriever extraction. 24 | 25 | More information about devcontainers can be found at [`containers.dev`](https://containers.dev/). 26 | 27 | ## Getting Started 28 | 29 | To get started, simply open the nv-ingest repository root folder within Code. A window should appear at the bottom-right corner of the editor asking if you would like to reopen the workspace inside of the dev container. After you confirm in the dialog, the container will first build, then launch, then remote-attach. 30 | 31 | If the window does not appear, or you would like to rebuild the container, press Ctrl+Shift+P and search for `Dev Containers: Rebuild and Reopen in Container`. Press Enter, and the container will first build, then launch, then remote-attach. 
32 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Documentation and examples 2 | docs/ @NVIDIA/nv-ingest-docs 3 | README.md @NVIDIA/nv-ingest-docs 4 | examples/ @NVIDIA/nv-ingest-docs 5 | 6 | # Devops 7 | .devcontainer/ @NVIDIA/nv-ingest-ops 8 | .github/ @NVIDIA/nv-ingest-ops 9 | .ci/ @NVIDIA/nv-ingest-ops 10 | 11 | # Global owners (required for all PRs) 12 | * @NVIDIA/nv-ingest-maintainers 13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | # GitHub info on config.yml 2 | # https://docs.github.com/en/communities/using-templates-to-encourage-useful-issues-and-pull-requests/configuring-issue-templates-for-your-repository#configuring-the-template-chooser 3 | # Set to 'false' if you only want the templates to be used. 4 | blank_issues_enabled: true 5 | 6 | # When using discussions instead of Question issue templates, 7 | # link that below to have it show up in the 'Submit Issue' page 8 | contact_links: 9 | - name: Ask a Question 10 | url: https://github.com/nvidia/nv-ingest/discussions 11 | about: Please ask any questions here. 12 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | 4 | 5 | 6 | ## Checklist 7 | - [ ] I am familiar with the [Contributing Guidelines](https://github.com/NVIDIA/nv-ingest/blob/main/CONTRIBUTING.md). 8 | - [ ] New or existing tests cover these changes. 9 | - [ ] The documentation is up to date with these changes. 10 | - [ ] If docker-compose.yaml environment variables were adjusted, the same changes are mirrored in the Helm values.yaml file. 
11 | -------------------------------------------------------------------------------- /.github/copy-pr-bot.yaml: -------------------------------------------------------------------------------- 1 | # Configuration file for `copy-pr-bot` GitHub App 2 | # https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/ 3 | 4 | enabled: true 5 | -------------------------------------------------------------------------------- /.github/workflows/conda-publish.yml: -------------------------------------------------------------------------------- 1 | name: Nv-Ingest Conda Package Publish 2 | 3 | # Trigger on the nightly schedule, on pushes to main, and on manual dispatch 4 | on: 5 | schedule: 6 | # Runs every day at 11:30PM (UTC) 7 | - cron: "30 23 * * *" 8 | push: 9 | branches: 10 | - main 11 | workflow_dispatch: 12 | inputs: 13 | CONDA_CHANNEL: 14 | description: 'The RapidsAI Conda channel the package should be pushed to' 15 | required: true 16 | type: choice 17 | options: 18 | - dev 19 | - main 20 | VERSION: 21 | description: 'Version string for the release (e.g., 1.0.0)' 22 | required: true 23 | 24 | jobs: 25 | build: 26 | runs-on: linux-large-disk 27 | container: 28 | image: rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.10 29 | steps: 30 | - name: Checkout code 31 | uses: actions/checkout@v4 32 | with: 33 | ref: main 34 | 35 | - name: Determine CONDA_CHANNEL 36 | run: | 37 | echo "Github event_name: ${{ github.event_name }}" 38 | if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then 39 | echo "Setting Conda channel to ${{ github.event.inputs.CONDA_CHANNEL }}" 40 | echo "CONDA_CHANNEL=${{ github.event.inputs.CONDA_CHANNEL }}" >> $GITHUB_ENV 41 | else 42 | echo "CONDA_CHANNEL=dev" >> $GITHUB_ENV 43 | fi 44 | 45 | # Build the Conda packages 46 | - name: Build Conda Packages 47 | run: | 48 | echo "Github event_name: ${{ github.event_name }}" 49 | if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then 50 | echo "Building conda package for ${{ github.event.inputs.VERSION }}" 51 | 
RELEASE_VERSION="${{ github.event.inputs.VERSION }}" ./conda/build_conda_packages.sh 52 | else 53 | ./conda/build_conda_packages.sh 54 | fi 55 | 56 | # Publish nv-ingest conda packages 57 | - name: Publish conda package 58 | run: anaconda -t "${{ secrets.NVIDIA_CONDA_TOKEN }}" upload --force --label $CONDA_CHANNEL ./conda/output_conda_channel/linux-64/*.conda 59 | -------------------------------------------------------------------------------- /.github/workflows/docker-build.yml: -------------------------------------------------------------------------------- 1 | name: Build NV-Ingest Runtime Image 2 | 3 | # Trigger for pull requests and pushing to main 4 | on: 5 | pull_request: 6 | types: 7 | - opened 8 | - synchronize 9 | - reopened 10 | push: 11 | branches: 12 | - main 13 | 14 | # Allows you to run this workflow manually from the Actions tab 15 | workflow_dispatch: 16 | 17 | jobs: 18 | build: 19 | runs-on: linux-large-disk 20 | 21 | steps: 22 | - name: Checkout code 23 | uses: actions/checkout@v4 24 | with: 25 | ref: main 26 | 27 | - name: Get current date (yyyy.mm.dd) 28 | run: echo "CURRENT_DATE=$(date +'%Y.%m.%d')" >> $GITHUB_ENV 29 | 30 | # Set up Docker Buildx, useful for building multi-platform images 31 | - name: Set up Docker Buildx 32 | uses: docker/setup-buildx-action@v3 33 | 34 | # Build the Docker image using the Dockerfile 35 | - name: Build Docker image 36 | run: | 37 | docker build --target runtime --build-arg GIT_COMMIT=${GITHUB_SHA} -t nv-ingest:latest . 
38 | 39 | - name: Run Pytest inside Docker container 40 | run: | 41 | docker run nv-ingest:latest pytest -rs -m "not integration" --cov nv_ingest --cov nv_ingest_client --cov nv_ingest_api --cov-report term --cov-report xml:coverage.xml tests/service_tests client/client_tests api/api_tests 42 | 43 | - name: Upload test report 44 | uses: actions/upload-artifact@v4 45 | with: 46 | name: pytest-report 47 | path: coverage.xml 48 | -------------------------------------------------------------------------------- /.github/workflows/docker-nightly-publish.yml: -------------------------------------------------------------------------------- 1 | name: Nv-Ingest Nightly Container Publish 2 | 3 | # Trigger on the nightly schedule, on pushes to main, and on manual dispatch 4 | on: 5 | schedule: 6 | # Runs every day at 11:30PM (UTC) 7 | - cron: "30 23 * * *" 8 | push: 9 | branches: 10 | - main 11 | # Allows you to run this workflow manually from the Actions tab 12 | workflow_dispatch: 13 | 14 | jobs: 15 | build: 16 | runs-on: linux-large-disk 17 | 18 | steps: 19 | - name: Checkout code 20 | uses: actions/checkout@v4 21 | with: 22 | ref: main 23 | 24 | - name: Get current date (yyyy.mm.dd) 25 | run: echo "CURRENT_DATE=$(date +'%Y.%m.%d')" >> $GITHUB_ENV 26 | 27 | # Set up Docker Buildx, useful for building multi-platform images 28 | - name: Set up Docker Buildx 29 | uses: docker/setup-buildx-action@v3 30 | 31 | # Build the Docker image using the Dockerfile 32 | - name: Build Docker image 33 | run: | 34 | docker build --target runtime --build-arg GIT_COMMIT=${GITHUB_SHA} -t ${{ secrets.DOCKER_REGISTRY }}/nv-ingest:${{ env.CURRENT_DATE }} . 
35 | 36 | # Login to NGC 37 | - name: Log in to NGC Registry 38 | run: echo "${{ secrets.DOCKER_PASSWORD }}" | docker login nvcr.io --username "\$oauthtoken" --password-stdin 39 | 40 | # Push the container to NGC 41 | - name: Upload nv-ingest container 42 | run: docker push ${{ secrets.DOCKER_REGISTRY }}/nv-ingest:${{ env.CURRENT_DATE }} 43 | -------------------------------------------------------------------------------- /.github/workflows/docker-release-publish.yml: -------------------------------------------------------------------------------- 1 | name: Build and Push Docker Image 2 | 3 | on: 4 | create: 5 | branches: 6 | - release/* 7 | 8 | jobs: 9 | build: 10 | runs-on: linux-large-disk 11 | 12 | steps: 13 | - name: Checkout code 14 | uses: actions/checkout@v4 15 | 16 | # Extract branch name after "release/" 17 | - name: Extract branch name 18 | id: extract_branch 19 | run: | 20 | BRANCH_NAME=${GITHUB_REF#refs/heads/release/} 21 | echo "SHORT_BRANCH_NAME=${BRANCH_NAME}" >> $GITHUB_ENV 22 | 23 | # Set up Docker Buildx, useful for building multi-platform images 24 | - name: Set up Docker Buildx 25 | uses: docker/setup-buildx-action@v3 26 | 27 | # Build the Docker image using the Dockerfile 28 | - name: Build Docker image 29 | run: | 30 | docker build --target runtime --build-arg GIT_COMMIT=${GITHUB_SHA} -t ${{ secrets.DOCKER_REGISTRY }}/nv-ingest:${{ env.SHORT_BRANCH_NAME }} . 
31 | 32 | # Login to NGC 33 | - name: Log in to NGC Registry 34 | run: echo "${{ secrets.DOCKER_PASSWORD }}" | docker login nvcr.io --username "\$oauthtoken" --password-stdin 35 | 36 | # Push the container to NGC 37 | - name: Upload nv-ingest container 38 | run: docker push ${{ secrets.DOCKER_REGISTRY }}/nv-ingest:${{ env.SHORT_BRANCH_NAME }} 39 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | name: nv-ingest pre-commit 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: [main] 7 | 8 | jobs: 9 | lint: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Check out repository code 13 | uses: actions/checkout@v4 14 | with: 15 | ref: main 16 | - uses: actions/setup-python@v3 17 | - uses: pre-commit/action@v3.0.1 18 | -------------------------------------------------------------------------------- /.github/workflows/pypi-nightly-publish.yml: -------------------------------------------------------------------------------- 1 | name: Nv-Ingest Nightly PyPi Wheel Publish 2 | 3 | # Trigger on the nightly schedule and on manual dispatch 4 | on: 5 | schedule: 6 | # Runs every day at 11:30PM (UTC) 7 | - cron: "30 23 * * *" 8 | workflow_dispatch: 9 | 10 | jobs: 11 | build: 12 | runs-on: linux-large-disk 13 | container: 14 | image: rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.10 15 | steps: 16 | - name: Checkout code 17 | uses: actions/checkout@v4 18 | with: 19 | ref: main 20 | 21 | - name: Install build dependencies 22 | run: | 23 | pip install build twine 24 | 25 | - name: Build nv-ingest-api wheel 26 | run: | 27 | cd api && python -m build 28 | 29 | - name: Build nv-ingest-client wheel 30 | run: | 31 | cd client && python -m build 32 | 33 | - name: Publish wheels to Artifactory 34 | env: 35 | ARTIFACTORY_URL: ${{ secrets.ARTIFACTORY_URL }} 36 | ARTIFACTORY_USERNAME: ${{ secrets.ARTIFACTORY_USERNAME }} 37 | ARTIFACTORY_PASSWORD: ${{ 
secrets.ARTIFACTORY_PASSWORD }} 38 | run: | 39 | twine upload --repository-url $ARTIFACTORY_URL -u $ARTIFACTORY_USERNAME -p $ARTIFACTORY_PASSWORD api/dist/* \ 40 | && twine upload --repository-url $ARTIFACTORY_URL -u $ARTIFACTORY_USERNAME -p $ARTIFACTORY_PASSWORD client/dist/* 41 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v5.0.0 4 | hooks: 5 | - id: trailing-whitespace 6 | exclude: '(^(docs|data)/|\.md$)' 7 | - id: end-of-file-fixer 8 | - id: check-added-large-files 9 | args: [-- maxkb=1500] 10 | - id: check-ast 11 | - id: debug-statements 12 | 13 | - repo: https://github.com/psf/black 14 | rev: 24.10.0 15 | hooks: 16 | - id: black 17 | args: ["--line-length=120"] 18 | 19 | - repo: https://github.com/PyCQA/flake8 20 | rev: 7.1.1 21 | hooks: 22 | - id: flake8 23 | args: ["--max-line-length=120", "--extend-ignore=E203,E266,F403,F405"] 24 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # NVIDIA Ingest 24.08.0 2 | 3 | ## New Features 4 | 5 | - ... 6 | 7 | ## Improvements 8 | 9 | - ... 10 | 11 | ## Bug Fixes 12 | 13 | - ... 
14 | -------------------------------------------------------------------------------- /CITATION.md: -------------------------------------------------------------------------------- 1 | # Citation Guide 2 | 3 | ## To Cite NVIDIA Ingest 4 | If you use NVIDIA Ingest in a publication, please use citations in the following format (BibTeX entry for LaTeX): 5 | ```tex 6 | @Manual{, 7 | title = {NVIDIA Ingest: An accelerated pipeline for document ingestion}, 8 | author = {NVIDIA Ingest Development Team}, 9 | year = {2024}, 10 | url = {https://github.com/NVIDIA/nv-ingest}, 11 | } 12 | ``` 13 | 14 | 15 | ## Sample Citations: 16 | 17 | Using [RAPIDS](https://rapids.ai/) citations for reference. 18 | 19 | ### Bringing UMAP Closer to the Speed of Light
with GPU Acceleration 20 | ```tex 21 | @misc{ 22 | nolet2020bringing, 23 | title={Bringing UMAP Closer to the Speed of Light with GPU Acceleration}, 24 | author={Corey J. Nolet, Victor Lafargue, Edward Raff, Thejaswi Nanditale, Tim Oates, John Zedlewski, and Joshua Patterson}, 25 | year={2020}, 26 | eprint={2008.00325}, 27 | archivePrefix={arXiv}, 28 | primaryClass={cs.LG} 29 | } 30 | ``` 31 | 32 | ### Machine Learning in Python:
Main developments and technology trends in data science, machine learning, and artificial intelligence 33 | ```tex 34 | @article{ 35 | raschka2020machine, 36 | title={Machine Learning in Python: Main developments and technology trends in data science, machine learning, and artificial intelligence}, 37 | author={Raschka, Sebastian and Patterson, Joshua and Nolet, Corey}, 38 | journal={Information}, 39 | volume={11}, 40 | number={4}, 41 | pages={193}, 42 | year={2020}, 43 | publisher={Multidisciplinary Digital Publishing Institute} 44 | } 45 | ``` 46 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | ## Security 2 | 3 | NVIDIA is dedicated to the security and trust of our software products and services, including all source code repositories managed through our organization. 4 | 5 | If you need to report a security issue, please use the appropriate contact points outlined below. **Please do not report security vulnerabilities through GitHub.** 6 | 7 | ## Reporting Potential Security Vulnerability in an NVIDIA Product 8 | 9 | To report a potential security vulnerability in any NVIDIA product: 10 | - Web: [Security Vulnerability Submission Form](https://www.nvidia.com/object/submit-security-vulnerability.html) 11 | - E-Mail: psirt@nvidia.com 12 | - We encourage you to use the following PGP key for secure email communication: [NVIDIA public PGP Key for communication](https://www.nvidia.com/en-us/security/pgp-key) 13 | - Please include the following information: 14 | - Product/Driver name and version/branch that contains the vulnerability 15 | - Type of vulnerability (code execution, denial of service, buffer overflow, etc.) 
16 | - Instructions to reproduce the vulnerability 17 | - Proof-of-concept or exploit code 18 | - Potential impact of the vulnerability, including how an attacker could exploit the vulnerability 19 | 20 | While NVIDIA currently does not have a bug bounty program, we do offer acknowledgement when an externally reported security issue is addressed under our coordinated vulnerability disclosure policy. Please visit our [Product Security Incident Response Team (PSIRT)](https://www.nvidia.com/en-us/security/psirt-policies/) policies page for more information. 21 | 22 | ## NVIDIA Product Security 23 | 24 | For all security-related concerns, please visit NVIDIA's Product Security portal at https://www.nvidia.com/en-us/security 25 | -------------------------------------------------------------------------------- /api/MANIFEST.in: -------------------------------------------------------------------------------- 1 | exclude *.egg-info 2 | 3 | include README.md 4 | include LICENSE 5 | recursive-include src * 6 | global-exclude __pycache__ 7 | global-exclude *.pyc 8 | -------------------------------------------------------------------------------- /api/README.md: -------------------------------------------------------------------------------- 1 | # nv-ingest-api 2 | 3 | Provides a common set of 4 | 5 | - Pythonic Objects 6 | - Common Functions 7 | - Utilities 8 | - Core Logic 9 | 10 | Implemented in pure Python that can be imported and used directly or used as part of future frameworks and runtimes. 11 | -------------------------------------------------------------------------------- /api/api_tests/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | from .utilities_for_test import * 6 | -------------------------------------------------------------------------------- /api/api_tests/import_checks.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | 6 | def check_morpheus_import(): 7 | try: 8 | import morpheus 9 | 10 | _ = morpheus._version 11 | 12 | return True 13 | except Exception as e: 14 | print(f"\nError: {e}\n", flush=True) 15 | return False 16 | 17 | 18 | def check_cuda_driver(): 19 | try: 20 | import cupy 21 | 22 | import cudf 23 | 24 | _ = cupy.cuda.runtime.driverGetVersion() 25 | _ = cudf.DataFrame({"a": [1, 2, 3]}) 26 | return True 27 | except Exception as e: 28 | print(f"\nError: {e}\n", flush=True) 29 | return False 30 | 31 | 32 | def check_adobe_import(): 33 | try: 34 | pass 35 | 36 | return True 37 | except ImportError: 38 | return False 39 | 40 | 41 | ADOBE_IMPORT_OK = check_adobe_import() 42 | CUDA_DRIVER_OK = check_cuda_driver() 43 | MORPHEUS_IMPORT_OK = check_morpheus_import() 44 | -------------------------------------------------------------------------------- /api/api_tests/interface/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/api_tests/internal/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/api/api_tests/internal/__init__.py -------------------------------------------------------------------------------- /api/api_tests/internal/extract/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/api/api_tests/internal/extract/__init__.py -------------------------------------------------------------------------------- /api/api_tests/internal/extract/audio/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/api_tests/internal/extract/docx/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/api_tests/internal/extract/image/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/api_tests/internal/extract/image/test_image_extractor.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/api_tests/internal/extract/image/test_infographic_extractor.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/api_tests/internal/extract/image/test_table_extractor.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/api_tests/internal/mutate/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/api/api_tests/internal/mutate/__init__.py -------------------------------------------------------------------------------- /api/api_tests/primitives/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/api_tests/primitives/nim/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/api/api_tests/primitives/nim/__init__.py -------------------------------------------------------------------------------- /api/api_tests/primitives/nim/model_interface/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/api_tests/primitives/tracing/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/api_tests/util/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/api_tests/util/converters/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/api_tests/util/converters/test_type_mappings.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | import pytest 6 | 7 | from nv_ingest_api.internal.enums.common import ContentTypeEnum, DocumentTypeEnum 8 | from nv_ingest_api.util.converters.type_mappings import doc_type_to_content_type 9 | 10 | 11 | @pytest.mark.parametrize( 12 | "doc_type, expected_content_type", 13 | [ 14 | (DocumentTypeEnum.BMP, ContentTypeEnum.IMAGE), 15 | (DocumentTypeEnum.DOCX, ContentTypeEnum.STRUCTURED), 16 | (DocumentTypeEnum.HTML, ContentTypeEnum.TEXT), 17 | (DocumentTypeEnum.JPEG, ContentTypeEnum.IMAGE), 18 | (DocumentTypeEnum.PDF, ContentTypeEnum.STRUCTURED), 19 | (DocumentTypeEnum.PNG, ContentTypeEnum.IMAGE), 20 | (DocumentTypeEnum.PPTX, ContentTypeEnum.STRUCTURED), 21 | (DocumentTypeEnum.SVG, ContentTypeEnum.IMAGE), 22 | (DocumentTypeEnum.TXT, ContentTypeEnum.TEXT), 23 | ], 24 | ) 25 | def test_doc_type_to_content_type_valid(doc_type, expected_content_type): 26 | """ 27 | Test doc_type_to_content_type function with valid document types. 28 | """ 29 | assert ( 30 | doc_type_to_content_type(doc_type) == expected_content_type 31 | ), f"doc_type {doc_type} should map to content type {expected_content_type}" 32 | 33 | 34 | def test_doc_type_to_content_type_invalid(): 35 | """ 36 | Test doc_type_to_content_type function with an invalid document type. 
37 | """ 38 | invalid_doc_type = "invalid_doc_type" # Assume this is not a valid DocumentTypeEnum value 39 | with pytest.raises(KeyError): 40 | doc_type_to_content_type(invalid_doc_type) 41 | -------------------------------------------------------------------------------- /api/api_tests/util/detectors/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/api_tests/util/detectors/test_language.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | import pytest 6 | from langdetect import DetectorFactory 7 | 8 | from nv_ingest_api.util.detectors.language import LanguageEnum 9 | from nv_ingest_api.util.detectors.language import detect_language 10 | 11 | # Ensure langdetect produces consistent results 12 | DetectorFactory.seed = 0 13 | 14 | 15 | @pytest.mark.parametrize( 16 | "text, expected_language", 17 | [ 18 | ("This is an English text.", LanguageEnum.EN), 19 | ("Este es un texto en español.", LanguageEnum.ES), 20 | # Add more examples as needed 21 | ], 22 | ) 23 | def test_detect_language_known_languages(text, expected_language): 24 | """ 25 | Test detect_language function with text in known languages. 26 | """ 27 | assert detect_language(text) == expected_language 28 | 29 | 30 | def test_detect_language_unknown_language(): 31 | """ 32 | Test detect_language function with text in an unknown language or not covered by LanguageEnum. 
33 | """ 34 | unknown_text = "1234" # Digits only, no recognizable language; expected to resolve to LanguageEnum.UNKNOWN 35 | assert detect_language(unknown_text) == LanguageEnum.UNKNOWN 36 | 37 | 38 | @pytest.mark.parametrize( 39 | "invalid_input", 40 | [ 41 | 123, # Non-string input 42 | None, # NoneType 43 | ], 44 | ) 45 | def test_detect_language_invalid_input(invalid_input): 46 | """ 47 | Test detect_language function with invalid inputs. 48 | """ 49 | # Non-string inputs are expected to raise TypeError rather than be mapped to LanguageEnum.UNKNOWN 50 | with pytest.raises(TypeError): 51 | detect_language(invalid_input) 52 | -------------------------------------------------------------------------------- /api/api_tests/util/exception_handlers/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/api_tests/util/exception_handlers/test_converters.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | from datetime import datetime 6 | from datetime import timedelta 7 | from datetime import timezone 8 | 9 | from nv_ingest_api.util.converters.datetools import datetools_exception_handler 10 | from nv_ingest_api.util.converters.datetools import remove_tz 11 | 12 | 13 | # Example functions to test the decorator 14 | @datetools_exception_handler 15 | def test_func_raises_exception(): 16 | raise ValueError("Test exception") 17 | 18 | 19 | @datetools_exception_handler 20 | def test_func_success(): 21 | return "Success" 22 | 23 | 24 | def test_datetools_exception_handler_with_exception(): 25 | """ 26 | Test the decorator with a function that raises an exception, 27 | checking that the returned date is within a few minutes of the current time. 28 | """ 29 | start_time = remove_tz(datetime.now(timezone.utc)) 30 | 31 | result = test_func_raises_exception() 32 | 33 | # Convert result back to datetime object for comparison 34 | result_datetime = datetime.fromisoformat(result) 35 | 36 | end_time = remove_tz(datetime.now(timezone.utc)) 37 | 38 | # Check the result is within a reasonable time delta (e.g., a few minutes) 39 | time_delta = timedelta(minutes=5) 40 | 41 | assert ( 42 | start_time - time_delta 43 | ) <= result_datetime, "The returned datetime should be within a few minutes of the current time" 44 | assert result_datetime <= ( 45 | end_time + time_delta 46 | ), "The returned datetime should be within a few minutes of the current time" 47 | 48 | 49 | def test_datetools_exception_handler_without_exception(): 50 | """ 51 | Test the decorator with a function that does not raise an exception. 
52 | """ 53 | result = test_func_success() 54 | assert result == "Success", "Decorator should not interfere with the function's normal execution" 55 | -------------------------------------------------------------------------------- /api/api_tests/util/exception_handlers/test_detectors.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | from langdetect.lang_detect_exception import LangDetectException 6 | 7 | from nv_ingest_api.util.exception_handlers.detectors import langdetect_exception_handler 8 | 9 | 10 | # Sample function to be decorated 11 | def sample_func(text): 12 | return "detected_language" 13 | 14 | 15 | # Sample function that raises LangDetectException 16 | def sample_func_raises_exception(text): 17 | raise LangDetectException("No features in text.") 18 | 19 | 20 | # Apply the decorator to test functions 21 | @langdetect_exception_handler 22 | def decorated_sample_func(text): 23 | return sample_func(text) 24 | 25 | 26 | @langdetect_exception_handler 27 | def decorated_func_raises_exception(text): 28 | return sample_func_raises_exception(text) 29 | 30 | 31 | def test_langdetect_exception_handler_success(): 32 | """ 33 | Test that the decorator correctly passes through the return value of the function when no exception is raised. 34 | """ 35 | result = decorated_sample_func("Test text") 36 | assert result == "detected_language", "The function should return the detected language." 37 | -------------------------------------------------------------------------------- /api/api_tests/util/exception_handlers/test_pdf.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | import re 6 | from unittest.mock import patch 7 | 8 | import pytest 9 | 10 | from nv_ingest_api.internal.enums.common import StatusEnum, TaskTypeEnum 11 | from nv_ingest_api.util.exception_handlers.pdf import pdfium_exception_handler, create_exception_tag 12 | 13 | MODULE_UNDER_TEST = "nv_ingest_api.util.exception_handlers.pdf" 14 | 15 | 16 | @pdfium_exception_handler(descriptor="pdfium Error") 17 | def sample_func(): 18 | raise Exception("Sample error") 19 | 20 | 21 | @pytest.fixture 22 | def mock_logger(): 23 | with patch(f"{MODULE_UNDER_TEST}.logger") as mock: 24 | yield mock 25 | 26 | 27 | def test_pdfium_exception_handler(mock_logger): 28 | result = sample_func() 29 | assert result == [], "The function should return an empty list on exception." 30 | mock_logger.warning.assert_called_once_with("pdfium Error:sample_func error:Sample error") 31 | 32 | 33 | def test_create_exception_tag_with_source_id(): 34 | source_id = "test_id" 35 | error_message = "test_error" 36 | result = create_exception_tag(error_message, source_id=source_id) 37 | 38 | expected_metadata = { 39 | "task": TaskTypeEnum.EXTRACT, 40 | "status": StatusEnum.ERROR, 41 | "source_id": source_id, 42 | "error_msg": error_message, 43 | } 44 | 45 | # Assuming validate_schema function works as intended or is mocked accordingly 46 | assert result[0][0] is None 47 | assert result[0][1]["error_metadata"] == expected_metadata 48 | 49 | 50 | def test_create_exception_tag_without_source_id(): 51 | error_message = "test_error" 52 | 53 | with pytest.raises( 54 | ValueError, 55 | match=re.escape( 56 | "1 validation error for MetadataSchema\n" 57 | "error_metadata.source_id\n Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]" # noqa: W505, E501 58 | ), 59 | ): 60 | create_exception_tag(error_message) 61 | -------------------------------------------------------------------------------- 
/api/api_tests/util/exception_handlers/test_schemas.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | from unittest.mock import patch 6 | 7 | import pytest 8 | from pydantic import BaseModel 9 | 10 | from nv_ingest_api.util.exception_handlers.schemas import schema_exception_handler 11 | 12 | MODULE_UNDER_TEST = "nv_ingest_api.util.exception_handlers.schemas" 13 | 14 | 15 | class SimpleModel(BaseModel): 16 | name: str 17 | 18 | 19 | @schema_exception_handler 20 | def function_success(): 21 | return "Success" 22 | 23 | 24 | @schema_exception_handler 25 | def function_fail(): 26 | # Intentionally missing the 'name' field to trigger a ValidationError 27 | SimpleModel() 28 | 29 | 30 | def test_schema_exception_handler_success(): 31 | """ 32 | Test that the decorator does not interfere with the normal execution of a function. 33 | """ 34 | result = function_success() 35 | assert result == "Success", "The function should successfully return 'Success'." 36 | 37 | 38 | @patch(f"{MODULE_UNDER_TEST}.logger") 39 | def test_schema_exception_handler_with_validation_error(mock_logger): 40 | """ 41 | Test that the decorator correctly handles a ValidationError and logs the expected message. 42 | """ 43 | with pytest.raises(ValueError) as exc_info: 44 | function_fail() 45 | 46 | # Verify the correct error message was logged 47 | expected_error_message = "Invalid configuration: name: Field required" 48 | mock_logger.error.assert_called_once_with(expected_error_message) 49 | 50 | # Verify the ValueError contains the correct message 51 | assert str(exc_info.value) == expected_error_message, "A ValueError with the correct message should be raised." 
52 | -------------------------------------------------------------------------------- /api/api_tests/util/image_processing/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/api_tests/util/image_processing/test_clustering.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | from nv_ingest_api.util.image_processing.clustering import ( 6 | boxes_are_close_or_overlap, 7 | group_bounding_boxes, 8 | combine_groups_into_bboxes, 9 | remove_superset_bboxes, 10 | ) 11 | 12 | 13 | def test_boxes_are_close_or_overlap(): 14 | from_box = [0, 0, 10, 10] 15 | to_box = [15, 15, 25, 25] 16 | assert not boxes_are_close_or_overlap(from_box, to_box, threshold=1) 17 | assert boxes_are_close_or_overlap(from_box, to_box, threshold=5) 18 | 19 | 20 | def test_group_bounding_boxes(): 21 | boxes = [[0, 0, 10, 10], [10, 10, 20, 20], [100, 100, 110, 110]] 22 | # The first and second boxes should group together; the third is far away and forms its own group 23 | groups = group_bounding_boxes(boxes, threshold=2) 24 | assert len(groups) == 2 25 | assert sorted(groups[0]) == [0, 1] 26 | assert sorted(groups[1]) == [2] 27 | 28 | 29 | def test_combine_groups_into_bboxes(): 30 | boxes = [[0, 0, 1, 1], [2, 2, 3, 3], [1, 1, 2, 2]] 31 | groups = [[0], [1, 2]] 32 | combined = combine_groups_into_bboxes(boxes, groups) 33 | assert len(combined) == 2 34 | assert combined[1] == [1, 1, 3, 3] 35 | 36 | 37 | def test_remove_superset_bboxes(): 38 | bboxes = [[0, 0, 10, 10], [2, 2, 4, 4], [3, 3, 5, 5]] 39 | # The first box strictly encloses both smaller boxes, so it is the superset expected to be removed 40 | out = 
remove_superset_bboxes(bboxes) 41 | assert len(out) == 2 42 | assert [0, 0, 10, 10] not in out 43 | -------------------------------------------------------------------------------- /api/api_tests/util/logging/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/api_tests/util/message_brokers/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/api_tests/util/message_brokers/redis/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/api_tests/util/message_brokers/simple_message_broker/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/api_tests/util/metadata/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/api_tests/util/schema/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "nv-ingest-api" 7 | description = "Python module with core document ingestion functions." 8 | dynamic = ["version"] # Declare attrs that will be generated at build time 9 | readme = "README.md" 10 | authors = [ 11 | {name = "Jeremy Dyer", email = "jdyer@nvidia.com"} 12 | ] 13 | license = {file = "LICENSE"} 14 | classifiers = [ 15 | "Programming Language :: Python :: 3", 16 | "License :: OSI Approved :: Apache Software License", 17 | "Operating System :: OS Independent", 18 | ] 19 | dependencies = [ 20 | "pandas>=2.0", 21 | "pydantic>2.0.0", 22 | "pydantic-settings>2.0.0", 23 | ] 24 | 25 | [project.urls] 26 | homepage = "https://github.com/NVIDIA/nv-ingest" 27 | repository = "https://github.com/NVIDIA/nv-ingest" 28 | documentation = "https://docs.nvidia.com/nv-ingest" 29 | 30 | [tool.setuptools.packages.find] 31 | where = ["src"] 32 | 33 | [tool.setuptools.dynamic] 34 | version = {attr = "version.get_version"} 35 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/api/src/nv_ingest_api/internal/__init__.py -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/enums/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/extract/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/extract/audio/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/extract/docx/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # Copyright (c) 2024, NVIDIA CORPORATION. 
6 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/extract/docx/engines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/api/src/nv_ingest_api/internal/extract/docx/engines/__init__.py -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/extract/image/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/extract/image/image_helpers/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/extract/pdf/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/extract/pdf/engines/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | from .adobe import adobe_extractor 6 | from .llama import llama_parse_extractor 7 | from .nemoretriever import nemoretriever_parse_extractor 8 | from .pdfium import pdfium_extractor 9 | from .tika import tika_extractor 10 | from .unstructured_io import unstructured_io_extractor 11 | 12 | __all__ = [ 13 | "adobe_extractor", 14 | "llama_parse_extractor", 15 | "nemoretriever_parse_extractor", 16 | "pdfium_extractor", 17 | "tika_extractor", 18 | "unstructured_io_extractor", 19 | ] 20 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/extract/pptx/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # Copyright (c) 2024, NVIDIA CORPORATION. 
6 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/extract/pptx/engines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/api/src/nv_ingest_api/internal/extract/pptx/engines/__init__.py -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/mutate/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/primitives/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/api/src/nv_ingest_api/internal/primitives/__init__.py -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/primitives/control_message_task.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | from uuid import UUID 6 | 7 | from pydantic import BaseModel, Field, ConfigDict 8 | from typing import Any, Dict, Union 9 | 10 | 11 | class ControlMessageTask(BaseModel): 12 | model_config = ConfigDict(extra="forbid") 13 | 14 | type: str 15 | id: Union[str, UUID] 16 | properties: Dict[str, Any] = Field(default_factory=dict) 17 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/primitives/nim/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | from .nim_client import NimClient 6 | from .nim_model_interface import ModelInterface 7 | 8 | __all__ = ["NimClient", "ModelInterface"] 9 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/primitives/nim/default_values.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # Copyright (c) 2024, NVIDIA CORPORATION. 6 | 7 | 8 | YOLOX_MAX_BATCH_SIZE = 8 9 | YOLOX_MAX_WIDTH = 1536 10 | YOLOX_MAX_HEIGHT = 1536 11 | YOLOX_NUM_CLASSES = 3 12 | YOLOX_CONF_THRESHOLD = 0.01 13 | YOLOX_IOU_THRESHOLD = 0.5 14 | YOLOX_MIN_SCORE = 0.1 15 | YOLOX_FINAL_SCORE = 0.48 16 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/primitives/nim/model_interface/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
def multiprocessing_cache(max_calls):
    """
    A decorator that memoizes results in a global cache shared between processes.

    Results are stored in the module-level ``global_cache`` (a ``Manager`` dict)
    under a key built from the wrapped function's module, qualified name and
    call arguments. Every call increments a per-function shared counter; once
    the counter exceeds `max_calls`, the *whole* ``global_cache`` is cleared
    (including entries written by other decorated functions) and the counter
    restarts at zero.

    Args:
        max_calls (int): The number of calls after which the cache is cleared.

    Returns:
        function: The decorated function with global cache and invalidation logic.

    Notes:
        Positional and keyword argument values must be hashable, because they
        become part of the cache key.
    """

    def decorator(func):
        call_count = manager.Value("i", 0)  # Shared integer for call counting

        # The cache is shared by every decorated function, so the bare
        # ``__name__`` is not a safe namespace: two functions with the same
        # name in different modules/classes would read each other's results.
        func_id = (func.__module__, func.__qualname__)

        @wraps(func)
        def wrapper(*args, **kwargs):
            key = (func_id, args, frozenset(kwargs.items()))

            with lock:
                call_count.value += 1

                if call_count.value > max_calls:
                    global_cache.clear()
                    call_count.value = 0

                if key in global_cache:
                    return global_cache[key]

            # Compute outside the lock so a slow call does not block other
            # callers; concurrent misses may compute the same value twice.
            result = func(*args, **kwargs)

            with lock:
                global_cache[key] = result

            return result

        return wrapper

    return decorator
3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/schemas/message_brokers/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | 6 | from typing import Optional, Literal 7 | 8 | from pydantic import Field, BaseModel 9 | from typing_extensions import Annotated 10 | 11 | 12 | class MessageBrokerClientSchema(BaseModel): 13 | host: str = "redis" 14 | port: Annotated[int, Field(gt=0, lt=65536)] = 6379 15 | 16 | # Update this for new broker types 17 | client_type: Literal["redis", "simple"] = "redis" # Restrict to 'redis' or 'simple' 18 | 19 | broker_params: Optional[dict] = Field(default_factory=dict) 20 | 21 | connection_timeout: Optional[Annotated[int, Field(ge=0)]] = 300 22 | max_backoff: Optional[Annotated[int, Field(ge=0)]] = 300 23 | max_retries: Optional[Annotated[int, Field(ge=0)]] = 0 24 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/schemas/message_brokers/request_schema.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | 6 | import logging 7 | from typing import Optional 8 | 9 | from pydantic import ConfigDict, BaseModel 10 | from pydantic import Field 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | # Define schemas for request validation 16 | class PushRequestSchema(BaseModel): 17 | command: str 18 | queue_name: str = Field(..., min_length=1) 19 | message: str = Field(..., min_length=1) 20 | timeout: Optional[float] = 100 # Optional timeout for blocking push 21 | model_config = ConfigDict(extra="forbid") 22 | 23 | 24 | class PopRequestSchema(BaseModel): 25 | command: str 26 | queue_name: str = Field(..., min_length=1) 27 | timeout: Optional[float] = 100 # Optional timeout for blocking pop 28 | model_config = ConfigDict(extra="forbid") 29 | 30 | 31 | class SizeRequestSchema(BaseModel): 32 | command: str 33 | queue_name: str = Field(..., min_length=1) 34 | model_config = ConfigDict(extra="forbid") 35 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/schemas/message_brokers/response_schema.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # NOTE: This code is duplicated from the ingest service: 6 | # src/nv_ingest_client/schemas/response_schema.py 7 | # Eventually we should move all client wrappers for the message broker into a shared library that both the ingest 8 | # service and the client can use. 
9 | 10 | from typing import Optional, Union 11 | from pydantic import BaseModel 12 | 13 | 14 | class ResponseSchema(BaseModel): 15 | response_code: int 16 | response_reason: Optional[str] = "OK" 17 | response: Union[str, dict, None] = None 18 | trace_id: Optional[str] = None # Unique trace ID 19 | transaction_id: Optional[str] = None # Unique transaction ID 20 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/schemas/meta/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/schemas/meta/base_model_noext.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | 6 | from pydantic import ConfigDict, BaseModel 7 | 8 | 9 | # Define a base class with extra fields forbidden 10 | class BaseModelNoExt(BaseModel): 11 | model_config = ConfigDict(extra="forbid") 12 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/schemas/mutate/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | 6 | import logging 7 | 8 | from pydantic import ConfigDict, BaseModel 9 | from pydantic import StrictBool 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class ImageDedupSchema(BaseModel): 15 | raise_on_failure: StrictBool = False 16 | model_config = ConfigDict(extra="forbid") 17 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/schemas/store/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/schemas/store/store_embedding_schema.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # Copyright (c) 2022-2024, NVIDIA CORPORATION. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 
9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | import logging 20 | 21 | from pydantic import ConfigDict, BaseModel 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | 26 | class EmbeddingStorageSchema(BaseModel): 27 | raise_on_failure: bool = False 28 | model_config = ConfigDict(extra="forbid") 29 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/schemas/store/store_image_schema.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # Copyright (c) 2022-2024, NVIDIA CORPORATION. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | import logging 20 | 21 | from pydantic import ConfigDict, BaseModel 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | 26 | class ImageStorageModuleSchema(BaseModel): 27 | structured: bool = True 28 | images: bool = True 29 | raise_on_failure: bool = False 30 | model_config = ConfigDict(extra="forbid") 31 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/schemas/transform/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | 6 | from pydantic import ConfigDict, BaseModel 7 | 8 | 9 | class ImageCaptionExtractionSchema(BaseModel): 10 | api_key: str = "api_key" 11 | endpoint_url: str = "https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-11b-vision-instruct/chat/completions" 12 | prompt: str = "Caption the content of this image:" 13 | model_name: str = "meta/llama-3.2-11b-vision-instruct" 14 | raise_on_failure: bool = False 15 | model_config = ConfigDict(extra="forbid") 16 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | 6 | import logging 7 | 8 | from pydantic import ConfigDict, BaseModel 9 | from pydantic import StrictBool 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class ImageFilterSchema(BaseModel): 15 | raise_on_failure: StrictBool = False 16 | cpu_only: StrictBool = False 17 | model_config = ConfigDict(extra="forbid") 18 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | 6 | import logging 7 | 8 | from pydantic import ConfigDict, BaseModel 9 | 10 | from nv_ingest_api.util.logging.configuration import LogLevel 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class TextEmbeddingSchema(BaseModel): 16 | api_key: str = "api_key" 17 | batch_size: int = 4 18 | embedding_model: str = "nvidia/nv-embedqa-e5-v5" 19 | embedding_nim_endpoint: str = "http://embedding:8000/v1" 20 | encoding_format: str = "float" 21 | httpx_log_level: LogLevel = LogLevel.WARNING 22 | input_type: str = "passage" 23 | raise_on_failure: bool = False 24 | truncate: str = "END" 25 | model_config = ConfigDict(extra="forbid") 26 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
class TextSplitterSchema(BaseModel):
    # Settings for the text splitter.
    #
    # tokenizer        : optional string, defaults to None.
    # chunk_size       : strictly positive integer, defaults to 1024.
    # chunk_overlap    : non-negative integer that must be smaller than
    #                    chunk_size, defaults to 150.
    # raise_on_failure : boolean flag, defaults to False.
    #
    # NOTE(review): pydantic does not run validators on default values unless
    # ``validate_default`` is enabled, so ``TextSplitterSchema(chunk_size=100)``
    # keeps the default overlap of 150 without complaint -- confirm whether
    # that is intended.
    tokenizer: Optional[str] = None
    chunk_size: Annotated[int, Field(gt=0)] = 1024
    chunk_overlap: Annotated[int, Field(ge=0)] = 150
    raise_on_failure: bool = False

    @field_validator("chunk_overlap")
    @classmethod
    def check_chunk_overlap(cls, v, info):
        """
        Reject a `chunk_overlap` that is not strictly smaller than `chunk_size`.

        ``info.data`` only contains fields that were declared earlier and
        passed validation, so the comparison is skipped when `chunk_size`
        itself was invalid (pydantic reports that error separately).
        """
        chunk_size = info.data.get("chunk_size")
        if chunk_size is not None and v >= chunk_size:
            raise ValueError("chunk_overlap must be less than chunk_size")
        return v
3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/api/src/nv_ingest_api/util/__init__.py -------------------------------------------------------------------------------- /api/src/nv_ingest_api/util/control_message/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/api/src/nv_ingest_api/util/control_message/__init__.py -------------------------------------------------------------------------------- /api/src/nv_ingest_api/util/control_message/validators.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage 6 | 7 | 8 | def cm_ensure_payload_not_null(control_message: IngestControlMessage): 9 | """ 10 | Ensures that the payload of a IngestControlMessage is not None. 11 | 12 | Parameters 13 | ---------- 14 | control_message : IngestControlMessage 15 | The IngestControlMessage to check. 16 | 17 | Raises 18 | ------ 19 | ValueError 20 | If the payload is None. 21 | """ 22 | 23 | if control_message.payload() is None: 24 | raise ValueError("Payload cannot be None") 25 | 26 | 27 | def cm_set_failure(control_message: IngestControlMessage, reason: str) -> IngestControlMessage: 28 | """ 29 | Sets the failure metadata on a IngestControlMessage. 30 | 31 | Parameters 32 | ---------- 33 | control_message : IngestControlMessage 34 | The IngestControlMessage to set the failure metadata on. 
def bytesfromhex(hex_input: str) -> bytes:
    """
    Convert a hexadecimal string to bytes.

    Parameters
    ----------
    hex_input : str
        Hex-encoded string (two hex digits per byte).

    Returns
    -------
    bytes
        The decoded raw bytes.

    Raises
    ------
    ValueError
        If `hex_input` is not valid hexadecimal.
    """

    return bytes.fromhex(hex_input)


def hexfrombytes(bytes_input: bytes) -> str:
    """
    Convert bytes to a hexadecimal string.

    Parameters
    ----------
    bytes_input : bytes
        Raw bytes of object.

    Returns
    -------
    str
        Lowercase hex string, two digits per input byte (e.g. for storing
        bytes in cuDF).
    """

    return bytes_input.hex()


def bytesfrombase64(base64_input: str) -> bytes:
    """
    Convert a Base64-encoded string to bytes.

    Parameters
    ----------
    base64_input : str
        Base64-encoded data. Anything accepted by `base64.b64decode`
        (ASCII string or bytes-like object) works.

    Returns
    -------
    bytes
        The decoded raw bytes.
    """

    return base64.b64decode(base64_input)


def base64frombytes(bytes_input: bytes, encoding: str = "utf-8") -> str:
    """
    Convert bytes to a Base64-encoded string.

    Parameters
    ----------
    bytes_input : bytes
        Raw bytes of object.
    encoding : str, optional
        Codec used to turn the Base64 output bytes into a `str`.
        Defaults to "utf-8".

    Returns
    -------
    str
        Base64-encoded string (e.g. for storing bytes in cuDF).
    """

    return base64.b64encode(bytes_input).decode(encoding)
@langdetect_exception_handler
def detect_language(text):
    """
    Detect the language a piece of text is written in.

    Parameters
    ----------
    text : str
        A string of text.

    Returns
    -------
    LanguageEnum
        The `LanguageEnum` member for the detected language code, or
        `LanguageEnum.UNKNOWN` when the code is not a known enum value or
        `langdetect` raises `LangDetectException`.
    """

    try:
        detected_code = langdetect.detect(text)

        if not LanguageEnum.has_value(detected_code):
            return LanguageEnum.UNKNOWN

        # Member names are upper-case with underscores, while the detected
        # code may contain hyphens and lower-case letters.
        member_name = detected_code.upper().replace("-", "_")
        return LanguageEnum[member_name]
    except langdetect.lang_detect_exception.LangDetectException:
        return LanguageEnum.UNKNOWN
class LogLevel(str, Enum):
    """String-valued names of the standard `logging` levels."""

    DEBUG = "DEBUG"
    INFO = "INFO"
    WARNING = "WARNING"
    ERROR = "ERROR"
    CRITICAL = "CRITICAL"


def configure_logging(logger, level_name):
    """
    Configure root logging and set `logger` to the named level.

    Parameters:
    - logger (logging.Logger): The logger whose level is set.
    - level_name (str): The name of the logging level (e.g., "DEBUG", "INFO").
      It is resolved as an attribute of the `logging` module, so it is
      case-sensitive.

    Raises:
    - ValueError: If `level_name` does not name an integer logging level.
    """

    numeric_level = getattr(logging, level_name, None)
    # bool is a subclass of int, so module flags such as ``logging.raiseExceptions``
    # must be rejected explicitly or they would be accepted as a "level".
    if isinstance(numeric_level, bool) or not isinstance(numeric_level, int):
        raise ValueError(f"Invalid log level: {level_name}")

    # NOTE(review): basicConfig() installs a stderr handler by default and is a
    # no-op when the root logger already has handlers. Pass ``stream=sys.stdout``
    # here if stdout output is what is wanted.
    logging.basicConfig(level=numeric_level, format="%(asctime)s - %(levelname)s - %(message)s")
    logger.setLevel(numeric_level)
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | from .broker import SimpleMessageBroker 6 | from .broker import ResponseSchema 7 | from .simple_client import SimpleClient 8 | 9 | __all__ = ["SimpleMessageBroker", "SimpleClient", "ResponseSchema"] 10 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/util/metadata/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # Copyright (c) 2024, NVIDIA CORPORATION. 6 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/util/multi_processing/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | 6 | from .mp_pool_singleton import ProcessWorkerPoolSingleton 7 | 8 | __all__ = ["ProcessWorkerPoolSingleton"] 9 | -------------------------------------------------------------------------------- /api/src/nv_ingest_api/util/nim/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
def create_inference_client(
    endpoints: Tuple[str, str],
    model_interface: ModelInterface,
    auth_token: Optional[str] = None,
    infer_protocol: Optional[str] = None,
    timeout: float = 120.0,
    max_retries: int = 5,
) -> NimClient:
    """
    Build a NimClient for talking to a model inference server.

    Parameters
    ----------
    endpoints : tuple
        A ``(grpc_endpoint, http_endpoint)`` pair.
    model_interface : ModelInterface
        The model interface implementation to use.
    auth_token : str, optional
        Authorization token for HTTP requests (default: None).
    infer_protocol : str, optional
        The protocol to use ("grpc" or "http"). When omitted, "grpc" is chosen
        if the gRPC endpoint is non-blank, otherwise "http" if an HTTP endpoint
        is given.
    timeout : float, optional
        Forwarded unchanged to `NimClient` (default: 120.0).
    max_retries : int, optional
        Forwarded unchanged to `NimClient` (default: 5).

    Returns
    -------
    NimClient
        The initialized NimClient.

    Raises
    ------
    ValueError
        If the given or inferred protocol is neither "grpc" nor "http",
        including the case where none could be inferred.
    """

    grpc_endpoint, http_endpoint = endpoints

    if infer_protocol is None:
        # Prefer gRPC whenever a non-blank gRPC endpoint is configured.
        if grpc_endpoint and grpc_endpoint.strip():
            infer_protocol = "grpc"
        elif http_endpoint:
            infer_protocol = "http"

    if infer_protocol not in ("grpc", "http"):
        raise ValueError("Invalid infer_protocol specified. Must be 'grpc' or 'http'.")

    return NimClient(model_interface, infer_protocol, endpoints, auth_token, timeout, max_retries)
def remove_url_endpoints(url) -> str:
    """Some configurations provide the full endpoint in the URL.
    Ex: http://deplot:8000/v1/chat/completions. For hitting the
    health endpoint we need just the part before the API path so that
    the health/ready endpoint can be appended to it.

    Args:
        url str: Incoming URL

    Returns:
        str: Everything before the first "/v1" in the URL, or the URL
            unchanged when it contains no "/v1"
    """
    # Cuts at the first literal "/v1", wherever it occurs in the string.
    return url.split("/v1", 1)[0]


def generate_url(url) -> str:
    """Examines the user defined URL for http*://. If that
    pattern is detected the URL is used as provided by the user.
    If that pattern does not exist then the assumption is made that
    the endpoint is simply `http://` and that is prepended
    to the user supplied endpoint.

    Args:
        url str: Endpoint where the Rest service is running

    Returns:
        str: The URL, guaranteed to start with an http(s) scheme
    """
    # URI schemes are case-insensitive, so "HTTP://host" already carries a
    # scheme and must not be turned into "http://HTTP://host".
    if not re.match(r"https?://", url, re.IGNORECASE):
        # Add the default `http://` if it's not already present in the URL
        url = f"http://{url}"

    return url
def get_version():
    """
    Build the package version string from environment variables.

    Reads:
    - NV_INGEST_RELEASE_TYPE: "dev" (default) or "release".
    - NV_INGEST_VERSION: base version in ``YYYY.M.D`` form; defaults to today's date.
    - NV_INGEST_REV: integer revision, default "0".

    Returns:
    - str: ``<version>.dev<rev>`` for dev builds (today's date as ``YYYYMMDD``
      replaces a zero revision), ``<version>.post<rev>`` for releases with a
      positive revision, or ``<version>`` for releases otherwise.

    Raises:
    - ValueError: If the base version is not in ``YYYY.M.D`` form, the release
      type is unknown, or the revision is not an integer.
    """
    release_type = os.getenv("NV_INGEST_RELEASE_TYPE", "dev")
    version = os.getenv("NV_INGEST_VERSION")
    rev = os.getenv("NV_INGEST_REV", "0")

    # Take one timestamp so the default version and the default dev revision
    # cannot land on different dates when the build straddles midnight.
    now = datetime.datetime.now()

    if not version:
        version = now.strftime("%Y.%m.%d")

    # Ensure the version is PEP 440 compatible. fullmatch() is used because
    # ``$`` in re.match() would also accept a trailing newline.
    if not re.fullmatch(r"\d{4}\.\d{1,2}\.\d{1,2}", version):
        raise ValueError(f"Version '{version}' is not PEP 440 compatible")

    # Construct the final version string
    if release_type == "dev":
        # If rev is not specified and defaults to 0 lets create a more meaningful development
        # identifier that is pep440 compliant
        if int(rev) == 0:
            rev = now.strftime("%Y%m%d")
        return f"{version}.dev{rev}"

    if release_type == "release":
        return f"{version}.post{rev}" if int(rev) > 0 else version

    raise ValueError(f"Invalid release type: {release_type}")
VERSION_SUFFIX="${DATE}-dev" 33 | elif [[ "$TYPE" == "release" ]]; then 34 | VERSION_SUFFIX="${DATE}" 35 | else 36 | echo "Invalid type: $TYPE" 37 | usage 38 | fi 39 | 40 | # Set library-specific variables and paths 41 | if [[ "$LIBRARY" == "api" ]]; then 42 | NV_INGEST_VERSION_OVERRIDE="${VERSION_SUFFIX}" 43 | export NV_INGEST_VERSION_OVERRIDE 44 | SETUP_PATH="$SCRIPT_DIR/../../api/pyproject.toml" 45 | (cd "$(dirname "$SETUP_PATH")" && python -m build) 46 | elif [[ "$LIBRARY" == "client" ]]; then 47 | NV_INGEST_VERSION_OVERRIDE="${VERSION_SUFFIX}" 48 | export NV_INGEST_VERSION_OVERRIDE 49 | SETUP_PATH="$SCRIPT_DIR/../../client/pyproject.toml" 50 | (cd "$(dirname "$SETUP_PATH")" && python -m build) 51 | elif [[ "$LIBRARY" == "service" ]]; then 52 | NV_INGEST_SERVICE_VERSION_OVERRIDE="${VERSION_SUFFIX}" 53 | export NV_INGEST_SERVICE_VERSION_OVERRIDE 54 | SETUP_PATH="$SCRIPT_DIR/../../setup.py" 55 | (cd "$(dirname "$SETUP_PATH")" && python setup.py sdist bdist_wheel) 56 | else 57 | echo "Invalid library: $LIBRARY" 58 | usage 59 | fi 60 | -------------------------------------------------------------------------------- /client/MANIFEST.in: -------------------------------------------------------------------------------- 1 | exclude *.egg-info 2 | 3 | include README.md 4 | include LICENSE 5 | recursive-include src/nv_ingest_client 6 | include src/version.py 7 | global-exclude __pycache__ 8 | global-exclude *.pyc 9 | -------------------------------------------------------------------------------- /client/client_examples/docker/Dockerfile.client: -------------------------------------------------------------------------------- 1 | FROM ubuntu:22.04 2 | 3 | # Use tini init for container 4 | ENV TINI_VERSION v0.19.0 5 | ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /bin/tini 6 | RUN chmod +x /bin/tini 7 | 8 | # Include files to launch jupyter notebook 9 | RUN mkdir -p /workspace/docker 10 | COPY docker/start-jupyter.sh 
/workspace/docker/start-jupyter.sh 11 | COPY docker/entrypoint.sh /workspace/docker/entrypoint.sh 12 | RUN chmod +x /workspace/docker/start-jupyter.sh 13 | RUN chmod +x /workspace/docker/entrypoint.sh 14 | 15 | # Install some dependencies and useful utiliites 16 | RUN apt update && apt install -y python3-pip git tree \ 17 | && rm -rf /var/lib/apt/lists/* 18 | 19 | # Install the nv-ingest client library 20 | RUN cd /workspace \ 21 | && git clone https://github.com/NVIDIA/nv-ingest.git \ 22 | && cd /workspace/nv-ingest/client \ 23 | && pip install . 24 | COPY examples /workspace/client_examples/examples 25 | 26 | # Install jupyter lab 27 | RUN pip install jupyterlab 28 | 29 | WORKDIR /workspace/client_examples 30 | 31 | ENTRYPOINT [ "/bin/tini", "--", "/workspace/docker/entrypoint.sh" ] 32 | -------------------------------------------------------------------------------- /client/client_examples/docker/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Start jupyter server 4 | /workspace/docker/start-jupyter.sh > /dev/null 5 | echo "There was a jupyter-lab instance started on port 8888, http://127.0.0.1:8888" 6 | 7 | # Run whatever user wants 8 | exec "$@" 9 | -------------------------------------------------------------------------------- /client/client_examples/docker/start-jupyter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | nohup jupyter-lab --allow-root --ip=0.0.0.0 --port=8888 --no-browser --NotebookApp.token='' > /dev/null 2>&1 & 3 | -------------------------------------------------------------------------------- /client/client_tests/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /client/client_tests/cli/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /client/client_tests/cli/test_nv_ingest_cli.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # TODO(Devin): Just for coverage at the moment 6 | import nv_ingest_client.nv_ingest_cli as nv_ingest_cli # noqa: F401 7 | -------------------------------------------------------------------------------- /client/client_tests/cli/util/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /client/client_tests/client/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /client/client_tests/client/test_rest_client.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
class MockRestClient:
    """Minimal stand-in for the HTTP client object handed to ``RestClient``."""

    def __init__(self, host, port):
        self.host, self.port = host, port
        self.counter = 0

    def get_client(self):
        """Return this object itself as the client."""
        return self


@pytest.fixture
def mock_rest_client_allocator():
    """Allocator mock that always yields one ``MockRestClient`` for localhost:7670."""
    client = MockRestClient("localhost", 7670)
    return MagicMock(return_value=client)


@pytest.fixture
def rest_client(mock_rest_client_allocator):
    """``RestClient`` built on the mocked allocator, with retries disabled."""
    settings = dict(
        host="localhost",
        port=7670,
        max_retries=0,
        max_backoff=32,
        http_allocator=mock_rest_client_allocator,
    )
    return RestClient(**settings)


# Test generate_url function
def test_generate_url(rest_client):
    port = 7670
    expectations = [
        ("localhost", "http://localhost:7670"),
        ("http://localhost", "http://localhost:7670"),
        ("https://localhost", "https://localhost:7670"),
        # A few more complicated and possible tricks
        ("localhost-https-else", "http://localhost-https-else:7670"),
    ]

    for host, expected in expectations:
        assert rest_client._generate_url(host, port) == expected
# Initialization and Property Setting


def test_store_task_initialization():
    """The params handed to the constructor are readable back from ``_params``."""
    store_params = {
        "endpoint": "minio:9000",
        "access_key": "foo",
        "secret_key": "bar",
    }
    task = StoreEmbedTask(params=store_params)

    stored = task._params
    assert stored["endpoint"] == "minio:9000"
    assert stored["access_key"] == "foo"
    assert stored["secret_key"] == "bar"


# String Representation Tests


def test_store_task_str_representation():
    """``str(task)`` lists the task title followed by each param."""
    task = StoreEmbedTask(params={"endpoint": "minio:9000"})
    rendered = str(task)
    assert rendered == "Store Embed Task:\n endpoint: minio:9000\n"
# TaskType Enum Tests


def test_task_type_enum_valid_values():
    """Every member yielded by iterating ``TaskType`` is a ``TaskType``."""
    for member in TaskType:
        assert isinstance(member, TaskType), f"{member} should be an instance of TaskType Enum"


def test_task_type_enum_invalid_value():
    """An arbitrary string is not accepted as a task type."""
    invalid_task_type = "INVALID"
    accepted = is_valid_task_type(invalid_task_type)
    assert not accepted, f"'{invalid_task_type}' should not be recognized as a valid TaskType"


# is_valid_task_type Function Tests


@pytest.mark.parametrize("valid_task_type", [member.name for member in TaskType])
def test_is_valid_task_type_with_valid_types(valid_task_type):
    """Each ``TaskType`` member name is accepted."""
    accepted = is_valid_task_type(valid_task_type)
    assert accepted, f"{valid_task_type} should be recognized as a valid TaskType"


def test_is_valid_task_type_with_invalid_type():
    """A name that is not a ``TaskType`` member is rejected."""
    invalid_task_type = "NON_EXISTENT_TASK"
    accepted = is_valid_task_type(invalid_task_type)
    assert not accepted, f"{invalid_task_type} should not be recognized as a valid TaskType"


# Task Class Tests


def test_task_str_method():
    """A generic ``Task`` renders as its class name followed by a newline."""
    task = Task()
    rendered = str(task)
    assert rendered == f"{task.__class__.__name__}\n", (
        "The __str__ method of Task does not return the expected string format"
    )


def test_task_to_dict_method():
    """A generic ``Task`` serializes to an empty dict."""
    as_dict = Task().to_dict()
    assert as_dict == {}, "The to_dict method of Task should return an empty dictionary for a generic task"
"pymilvus==2.5.4", 36 | "pymilvus[bulk_writer,model]", 37 | "pypdfium2>=4.30.1", 38 | "python-docx>=1.1.2", 39 | "python-magic>=0.4.27", 40 | "python-pptx==0.6.23", 41 | "redis~=5.2.1", 42 | "requests>=2.28.2", 43 | "setuptools>=58.2.0", 44 | "tqdm>=4.67.1", 45 | ] 46 | 47 | [project.urls] 48 | homepage = "https://github.com/NVIDIA/nv-ingest" 49 | repository = "https://github.com/NVIDIA/nv-ingest" 50 | documentation = "https://docs.nvidia.com/nv-ingest" 51 | 52 | [project.scripts] 53 | nv-ingest-cli = "nv_ingest_client.nv_ingest_cli:main" 54 | process-json-files = "nv_ingest_client.util.process_json_files:main" 55 | 56 | [tool.setuptools] 57 | py-modules = ["nv_ingest_client"] 58 | 59 | [tool.setuptools.packages.find] 60 | where = ["src"] 61 | 62 | [tool.setuptools.dynamic] 63 | version = {attr = "version.get_version"} 64 | -------------------------------------------------------------------------------- /client/src/nv_ingest_client/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /client/src/nv_ingest_client/cli/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /client/src/nv_ingest_client/cli/util/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /client/src/nv_ingest_client/cli/util/tasks.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /client/src/nv_ingest_client/client/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | from nv_ingest_client.client.client import NvIngestClient 6 | from nv_ingest_client.client.interface import Ingestor 7 | 8 | __all__ = ["NvIngestClient", "Ingestor"] 9 | -------------------------------------------------------------------------------- /client/src/nv_ingest_client/primitives/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | from .jobs import BatchJobSpec 6 | from .jobs import JobSpec 7 | from .tasks import Task 8 | 9 | __all__ = ["BatchJobSpec", "JobSpec", "Task"] 10 | -------------------------------------------------------------------------------- /client/src/nv_ingest_client/primitives/exceptions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/client/src/nv_ingest_client/primitives/exceptions.py -------------------------------------------------------------------------------- /client/src/nv_ingest_client/primitives/jobs/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | from nv_ingest_client.primitives.jobs.job_spec import BatchJobSpec 6 | from nv_ingest_client.primitives.jobs.job_spec import JobSpec 7 | from nv_ingest_client.primitives.jobs.job_state import JobState 8 | from nv_ingest_client.primitives.jobs.job_state import JobStateEnum 9 | 10 | __all__ = ["BatchJobSpec", "JobSpec", "JobState", "JobStateEnum"] 11 | -------------------------------------------------------------------------------- /client/src/nv_ingest_client/primitives/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | from .audio_extraction import AudioExtractionTask 6 | from .caption import CaptionTask 7 | from .chart_extraction import ChartExtractionTask 8 | from .dedup import DedupTask 9 | from .embed import EmbedTask 10 | from .extract import ExtractTask 11 | from .filter import FilterTask 12 | from .infographic_extraction import InfographicExtractionTask 13 | from .split import SplitTask 14 | from .store import StoreTask 15 | from .store import StoreEmbedTask 16 | from .table_extraction import TableExtractionTask 17 | from .task_base import Task 18 | from .task_base import TaskType 19 | from .task_base import is_valid_task_type 20 | from .task_factory import task_factory 21 | 22 | __all__ = [ 23 | "AudioExtractionTask", 24 | "CaptionTask", 25 | "ChartExtractionTask", 26 | "ExtractTask", 27 | "is_valid_task_type", 28 | "InfographicExtractionTask", 29 | "SplitTask", 30 | "StoreEmbedTask", 31 | "StoreTask", 32 | "TableExtractionTask", 33 | "Task", 34 | "task_factory", 35 | "TaskType", 36 | "DedupTask", 37 | "FilterTask", 38 | "EmbedTask", 39 | ] 40 | -------------------------------------------------------------------------------- /client/src/nv_ingest_client/primitives/tasks/chart_extraction.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
class ChartExtractionSchema(BaseModel):
    """Parameter schema for chart extraction: declares no fields and rejects unknown ones."""

    # Pydantic v2 configuration; the class-based ``Config`` is deprecated there.
    model_config = {"extra": "forbid"}


class ChartExtractionTask(Task):
    """
    Object for chart extraction task
    """

    def __init__(self) -> None:
        """
        Setup Chart Extraction Task Config (the task takes no parameters)
        """
        super().__init__()

    def __str__(self) -> str:
        """
        Returns a string with the object's config and run time state
        """
        return "chart extraction task\n"

    def to_dict(self) -> Dict:
        """
        Convert to a dict for submission to redis
        """
        # The task carries no settings, so ``params`` is always empty.
        task_properties = {
            "params": {},
        }

        return {"type": "chart_data_extract", "task_properties": task_properties}
class InfographicExtractionSchema(BaseModel):
    # Schema for infographic extraction options: no fields are declared and
    # unknown keys are rejected via extra = "forbid".
    class Config:
        extra = "forbid"


class InfographicExtractionTask(Task):
    """
    Client-side task requesting infographic data extraction.

    The task has no options of its own; it serializes to the
    "infographic_data_extract" type with an empty params mapping.
    """

    def __init__(self) -> None:
        """
        Set up the infographic extraction task (nothing beyond the base Task setup)
        """
        super().__init__()

    def __str__(self) -> str:
        """
        Return a short, newline-terminated description of the task
        """
        return "infographic extraction task\n"

    def to_dict(self) -> Dict:
        """
        Build the dict payload used for submission to redis
        """
        return {
            "type": "infographic_data_extract",
            "task_properties": {"params": {}},
        }
class TableExtractionSchema(BaseModel):
    # Schema for table extraction options: no fields are declared and
    # unknown keys are rejected via extra = "forbid".
    class Config:
        extra = "forbid"


class TableExtractionTask(Task):
    """
    Client-side task requesting table data extraction.

    The task has no options of its own; it serializes to the
    "table_data_extract" type with an empty params mapping.
    """

    def __init__(self) -> None:
        """
        Set up the table extraction task (nothing beyond the base Task setup)
        """
        super().__init__()

    def __str__(self) -> str:
        """
        Return a short, newline-terminated description of the task
        """
        return "table extraction task\n"

    def to_dict(self) -> Dict:
        """
        Build the dict payload used for submission to redis
        """
        return {
            "type": "table_data_extract",
            "task_properties": {"params": {}},
        }
class VdbUploadTaskSchema(BaseModel):
    # Options accepted for a VDB upload task; unknown keys are rejected
    # via extra = "forbid".
    filter_errors: bool = False
    bulk_ingest: bool = False
    bulk_ingest_path: str = ""
    params: dict = None

    class Config:
        extra = "forbid"


class VdbUploadTask(Task):
    """
    Client-side task describing a vector database (VDB) upload.

    Holds the upload options and serializes them under the "vdb_upload" type.
    """

    def __init__(
        self,
        filter_errors: bool = False,
        bulk_ingest: bool = False,
        bulk_ingest_path: str = "embeddings/",
        params: dict = None,
    ) -> None:
        """
        Store the VDB upload options; a missing or empty ``params`` becomes ``{}``
        """
        super().__init__()
        self._params = params if params else {}
        self._bulk_ingest_path = bulk_ingest_path
        self._bulk_ingest = bulk_ingest
        self._filter_errors = filter_errors

    def __str__(self) -> str:
        """
        Return a description of the task; only ``filter_errors`` is reported
        """
        lines = [
            "VDB Upload Task:\n",
            f" filter_errors: {self._filter_errors}\n",
        ]
        return "".join(lines)

    def to_dict(self) -> Dict:
        """
        Build the dict payload used for submission to redis
        """
        return {
            "type": "vdb_upload",
            "task_properties": {
                "filter_errors": self._filter_errors,
                "bulk_ingest": self._bulk_ingest,
                "bulk_ingest_path": self._bulk_ingest_path,
                "params": self._params,
            },
        }
def get_version():
    """
    Build the package version string from environment variables.

    Environment variables read:
        NV_INGEST_RELEASE_TYPE: "dev" (default) or "release".
        NV_INGEST_VERSION: base version; defaults to today's date as YYYY.MM.DD.
        NV_INGEST_REV: integer revision; defaults to "0".

    Returns
    -------
    str
        "<version>.dev<rev>" for dev builds (a zero rev is replaced by today's
        date as YYYYMMDD), "<version>.post<rev>" for release builds with a
        positive rev, and the bare version for release builds otherwise.

    Raises
    ------
    ValueError
        If the release type is unknown, the revision is not an integer (or is
        negative for a dev build), or the version of a non-release build is not
        of the form YYYY.M.D.
    """
    release_type = os.getenv("NV_INGEST_RELEASE_TYPE", "dev")
    version = os.getenv("NV_INGEST_VERSION")
    rev = os.getenv("NV_INGEST_REV", "0")

    # Fail on an unknown release type first so the error names the real problem
    # instead of whichever later check happens to trip.
    if release_type not in ("dev", "release"):
        raise ValueError(f"Invalid release type: {release_type}")

    # Read the clock once so the date-based version and the date-based dev
    # identifier can never disagree when the build straddles midnight.
    now = datetime.datetime.now()

    if not version:
        version = now.strftime("%Y.%m.%d")

    # We only check this for dev, we assume for release the user knows what they are doing
    if release_type != "release":
        # fullmatch (rather than match with "$") also rejects a trailing newline.
        if not re.fullmatch(r"\d{4}\.\d{1,2}\.\d{1,2}", version):
            raise ValueError(f"Version '{version}' is not PEP 440 compatible")

    try:
        rev_number = int(rev)
    except ValueError:
        raise ValueError(f"Revision '{rev}' is not an integer") from None

    if release_type == "dev":
        if rev_number < 0:
            # A negative number would yield e.g. ".dev-1", which is not PEP 440.
            raise ValueError(f"Revision '{rev}' must not be negative")
        # If rev is not specified and defaults to 0 lets create a more meaningful
        # development identifier that is pep440 compliant
        dev_id = now.strftime("%Y%m%d") if rev_number == 0 else rev_number
        return f"{version}.dev{dev_id}"

    # Release build: only a positive revision adds a post-release suffix.
    return f"{version}.post{rev_number}" if rev_number > 0 else version
mitigation 17 | - httpx>=0.28.1 18 | - isodate>=0.7.2 19 | - langdetect>=1.0.9 20 | - minio>=7.2.12 21 | - morpheus-core=25.02 22 | - morpheus-llm=25.02 23 | - openai>=1.57.1 24 | - opentelemetry-api>=1.27.0 25 | - opentelemetry-exporter-otlp>=1.27.0 26 | - opentelemetry-sdk>=1.27.0 27 | - pydantic>2.0.0 28 | - pydantic-settings>2.0.0 29 | - pypdfium2>=4.30.0 30 | - pytest>=8.0.2 31 | - pytest-mock>=3.14.0 32 | - pytest-cov>=6.0.0 33 | - python>=3.10 34 | - python-build>=1.2.2 35 | - python-docx>=1.1.2 36 | - python-dotenv>=1.0.1 37 | - python-pptx>=1.0.2 38 | - pytorch 39 | - redis-py>=5.2.1 40 | - requests>=2.28.2 41 | - scipy>=1.15.1 42 | - setuptools>=58.2.0 43 | - tabulate>=0.9.0 44 | - torchvision 45 | - torchaudio 46 | - transformers>=4.47.0 47 | - tqdm>=4.67.1 48 | - uvicorn 49 | - pip 50 | - pip: 51 | - llama-index-embeddings-nvidia 52 | - opencv-python # For some reason conda cant solve our req set with py-opencv so we need to use pip 53 | - pymilvus>=2.5.0 54 | - pymilvus[bulk_writer, model] 55 | - nvidia-riva-client>=2.18.0 56 | - unstructured-client 57 | -------------------------------------------------------------------------------- /conda/packages/nv_ingest_api/meta.yaml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | {% set py_version = environ['CONDA_PY'] %} 6 | {% set GIT_SHA = environ['GIT_SHA'] %} 7 | 8 | # Determine Git root, falling back to default path ../../.. 
if Git is not available or the directory is not a Git repo 9 | {% set git_root = environ.get('GIT_ROOT', '../../../api') %} 10 | 11 | package: 12 | name: nv_ingest_api 13 | version: {{ environ.get('NV_INGEST_API_VERSION', 'Unknown') }} 14 | 15 | source: 16 | path: {{ git_root }} 17 | 18 | build: 19 | number: 0 20 | string: py{{ py_version }}_{{ GIT_SHA }} 21 | script: 22 | - {{ PYTHON }} -m pip install . --no-deps -vv 23 | 24 | requirements: 25 | build: 26 | - pip 27 | - python=3.10 28 | - setuptools>=58.2.0 29 | run: 30 | - azure-core>=1.32.0 31 | - fastparquet>=2024.11.0 32 | - fsspec>=2024.10.0 33 | - httpx>=0.28.1 34 | - isodate>=0.7.2 35 | - langdetect>=1.0.9 36 | - openai>=1.57.1 37 | - pydantic>=2.0.0 38 | - pypdfium2>=4.30.0 39 | - pytest>=8.0.2 40 | - pytest-mock>=3.14.0 41 | - python>=3.10 42 | - python-docx>=1.1.2 43 | - python-dotenv>=1.0.1 44 | - python-magic>=0.4.27 45 | - python-pptx>=1.0.2 46 | - pytorch 47 | - requests>=2.28.2 48 | - setuptools>=58.2.0 49 | - tabulate>=0.9.0 50 | - torchaudio 51 | - torchvision 52 | - transformers>=4.47.0 53 | # - unstructured-client>=0.25.9 54 | 55 | test: 56 | commands: 57 | - pytest ./tests 58 | 59 | about: 60 | home: "https://github.com/NVIDIA/nv-ingest" 61 | license: "Apache-2.0" 62 | summary: "Python module with core document ingestion functions." 63 | description: "Python module with core document ingestion functions." 64 | 65 | extra: 66 | recipe-maintainers: 67 | - jdyer@nvidia.com 68 | 69 | channels: 70 | - rapidsai 71 | - nvidia 72 | - conda-forge 73 | - pytorch 74 | -------------------------------------------------------------------------------- /conda/packages/nv_ingest_client/meta.yaml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | {% set py_version = environ['CONDA_PY'] %} 6 | {% set GIT_SHA = environ['GIT_SHA'] %} 7 | 8 | # Determine the package source root: use the GIT_ROOT environment variable when it is set, otherwise fall back to the relative path ../../../client 9 | {% set git_root = environ.get('GIT_ROOT', '../../../client') %} 10 | 11 | package: 12 | name: nv_ingest_client 13 | version: {{ environ.get('NV_INGEST_CLIENT_VERSION', 'Unknown') }} 14 | 15 | source: 16 | path: {{ git_root }} 17 | 18 | build: 19 | number: 0 20 | string: py{{ py_version }}_{{ GIT_SHA }} 21 | script: 22 | - {{ PYTHON }} -m pip install . --no-deps -vv 23 | 24 | requirements: 25 | build: 26 | - pip 27 | - python=3.10 28 | - setuptools>=58.2.0 29 | run: 30 | - click>=8.1.7 31 | - fsspec>=2024.10.0 32 | - httpx>=0.28.1 33 | - pydantic>=2.0.0 34 | - pypdfium2>=4.30.0 35 | - python>=3.10 36 | - python-docx>=1.1.2 37 | - python-pptx>=1.0.2 38 | - requests>=2.28.2 39 | - setuptools>=58.2.0 40 | - tqdm>=4.67.1 41 | 42 | test: 43 | commands: 44 | - pytest ./tests 45 | 46 | about: 47 | home: "https://github.com/NVIDIA/nv-ingest" 48 | license: "Apache-2.0" 49 | summary: "Python module supporting document ingestion." 50 | description: "Python module supporting document ingestion." 51 | 52 | extra: 53 | recipe-maintainers: 54 | - drobison@nvidia.com 55 | 56 | channels: 57 | - rapidsai 58 | - nvidia 59 | - conda-forge 60 | -------------------------------------------------------------------------------- /conda/scripts/helper_functions.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 3 | # All rights reserved.
# Verify that the tools needed for a conda package build are on PATH.
# Prints a diagnostic to stderr and exits with status 1 on the first missing tool.
validate_conda_build_environment() {
    ##############################
    # Validate Dependencies
    ##############################

    # Ensure conda is installed
    if ! command -v conda &> /dev/null; then
        echo "Error: conda not found in PATH. Please ensure Conda is installed and available." >&2
        exit 1
    fi

    # Ensure conda-build is installed
    if ! command -v conda-build &> /dev/null; then
        echo "Error: conda-build not found in PATH. Install it via: conda install conda-build" >&2
        exit 1
    fi

    # Ensure git is installed
    if ! command -v git &> /dev/null; then
        echo "Error: git not found in PATH. Please ensure Git is installed and available." >&2
        exit 1
    fi
}

# Print the top-level directory of the enclosing Git work tree on stdout.
# Outside a work tree, prints a diagnostic to stderr and exits with status 1.
determine_git_root() {
    ##############################
    # Determine Git Root
    ##############################

    if git rev-parse --is-inside-work-tree &> /dev/null; then
        git rev-parse --show-toplevel
    else
        # The message must go to stderr: stdout is the function's return channel,
        # so a caller using $(determine_git_root) would otherwise capture the
        # error text as if it were the repository path.
        echo "Error: Not inside a Git repository. Unable to determine the Git root." >&2
        exit 1
    fi
}
16 | zipkin: 17 | endpoint: "http://zipkin:9411/api/v2/spans" 18 | logging: 19 | verbosity: detailed 20 | prometheus: 21 | endpoint: "0.0.0.0:8889" 22 | 23 | processors: 24 | batch: 25 | tail_sampling: 26 | policies: [ 27 | { 28 | name: filter_http_url, 29 | type: string_attribute, 30 | string_attribute: { 31 | key: http.route, 32 | values: [ "/health/ready" ], 33 | enabled_regex_matching: true, 34 | invert_match: true 35 | } 36 | } 37 | ] 38 | 39 | extensions: 40 | health_check: 41 | zpages: 42 | 43 | service: 44 | extensions: [zpages, health_check] 45 | telemetry: 46 | logs: 47 | level: "debug" 48 | pipelines: 49 | traces: 50 | receivers: [otlp] 51 | processors: [batch, tail_sampling] 52 | exporters: [zipkin, logging] 53 | metrics: 54 | receivers: [otlp] 55 | processors: [batch] 56 | exporters: [prometheus, logging] 57 | logs: 58 | receivers: [otlp] 59 | processors: [batch] 60 | exporters: [logging] 61 | -------------------------------------------------------------------------------- /config/prometheus.yaml: -------------------------------------------------------------------------------- 1 | 2 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 3 | # All rights reserved. 
4 | # SPDX-License-Identifier: Apache-2.0 5 | 6 | scrape_configs: 7 | - job_name: "otel-collector" 8 | scrape_interval: 5s 9 | static_configs: 10 | - targets: ["otel-collector:8889"] 11 | - targets: ["otel-collector:9988"] 12 | -------------------------------------------------------------------------------- /data/chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/chart.png -------------------------------------------------------------------------------- /data/embedded_table.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/embedded_table.pdf -------------------------------------------------------------------------------- /data/functional_validation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/functional_validation.pdf -------------------------------------------------------------------------------- /data/multimodal_test.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/multimodal_test.bmp -------------------------------------------------------------------------------- /data/multimodal_test.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/multimodal_test.docx -------------------------------------------------------------------------------- /data/multimodal_test.jpeg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/multimodal_test.jpeg -------------------------------------------------------------------------------- /data/multimodal_test.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/multimodal_test.pdf -------------------------------------------------------------------------------- /data/multimodal_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/multimodal_test.png -------------------------------------------------------------------------------- /data/multimodal_test.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/multimodal_test.pptx -------------------------------------------------------------------------------- /data/multimodal_test.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/multimodal_test.tiff -------------------------------------------------------------------------------- /data/multimodal_test.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/multimodal_test.wav -------------------------------------------------------------------------------- /data/table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/table.png 
-------------------------------------------------------------------------------- /data/table_test.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/table_test.pdf -------------------------------------------------------------------------------- /data/test-page-form.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/test-page-form.pdf -------------------------------------------------------------------------------- /data/test-shapes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/test-shapes.pdf -------------------------------------------------------------------------------- /data/test.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/test.pdf -------------------------------------------------------------------------------- /data/woods_frost.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/woods_frost.docx -------------------------------------------------------------------------------- /data/woods_frost.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/data/woods_frost.pdf -------------------------------------------------------------------------------- /docker/scripts/entrypoint_devcontainer.sh: -------------------------------------------------------------------------------- 1 | 
#!/bin/bash --login 2 | # SPDX-FileCopyrightText: Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | #!/bin/bash 19 | 20 | # Activate the `nv_ingest_runtime` conda environment 21 | . /opt/conda/etc/profile.d/conda.sh 22 | conda activate nv_ingest_runtime 23 | 24 | # Source "source" file if it exists 25 | SRC_FILE="/opt/docker/bin/entrypoint_source" 26 | [ -f "${SRC_FILE}" ] && source "${SRC_FILE}" 27 | 28 | # Check if user supplied a command 29 | if [ "$#" -gt 0 ]; then 30 | # If a command is provided, run it 31 | exec "$@" 32 | else 33 | # If no command is provided, run the default startup launch 34 | if [ "${MESSAGE_CLIENT_TYPE}" != "simple" ]; then 35 | # Start uvicorn if MESSAGE_CLIENT_TYPE is not 'simple' 36 | uvicorn nv_ingest.api.main:app --workers 1 --host 0.0.0.0 --port 7670 --reload --app-dir /workspace/src/nv_ingest & 37 | fi 38 | 39 | python /workspace/src/microservice_entrypoint.py 40 | fi 41 | -------------------------------------------------------------------------------- /docker/scripts/entrypoint_source_ext.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 3 | # All rights reserved. 
4 | # SPDX-License-Identifier: Apache-2.0 5 | 6 | set -e 7 | 8 | # Run preparation tasks here 9 | 10 | if [ "$INSTALL_ADOBE_SDK" = "true" ]; then 11 | echo "Checking if Adobe PDF Services SDK is installed..." 12 | 13 | # Check if pdfservices-sdk is installed 14 | if ! python -c "import pkg_resources; pkg_resources.require('pdfservices-sdk~=4.0.0')" 2>/dev/null; then 15 | echo "Installing Adobe PDF Services SDK..." 16 | pip install "pdfservices-sdk~=4.0.0" 17 | fi 18 | fi 19 | 20 | # Check if audio dependencies should be installed 21 | if [ "$INSTALL_AUDIO_EXTRACTION_DEPS" = "true" ]; then 22 | echo "Checking if librosa is installed..." 23 | 24 | # Check if librosa is installed 25 | if ! python -c "import pkg_resources; pkg_resources.require('librosa')" 2>/dev/null; then 26 | echo "Installing librosa using conda..." 27 | mamba install -y -c conda-forge librosa 28 | fi 29 | fi 30 | 31 | # If MEM_TRACE is set in the environment, use mamba to install memray 32 | if [ -n "$MEM_TRACE" ]; then 33 | echo "MEM_TRACE is set. Installing memray via mamba..." 
"""Download a tokenizer and save it under MODEL_PREDOWNLOAD_PATH.

The Llama 3.2 1B tokenizer is fetched when DOWNLOAD_LLAMA_TOKENIZER is exactly
"True"; otherwise the e5-large-unsupervised tokenizer is fetched.
"""

import os

from transformers import AutoTokenizer

predownload_root = os.environ.get("MODEL_PREDOWNLOAD_PATH")
if predownload_root is None:
    # Without this check os.path.join(None, ...) fails with an opaque TypeError.
    raise RuntimeError("MODEL_PREDOWNLOAD_PATH must be set to the tokenizer download directory")

if os.getenv("DOWNLOAD_LLAMA_TOKENIZER") == "True":
    model_id = "meta-llama/Llama-3.2-1B"
    tokenizer_subdir = "llama-3.2-1b/tokenizer/"
    # The access token is only passed for the Llama download.
    load_kwargs = {"token": os.getenv("HF_ACCESS_TOKEN")}
else:
    model_id = "intfloat/e5-large-unsupervised"
    tokenizer_subdir = "e5-large-unsupervised/tokenizer/"
    load_kwargs = {}

tokenizer_path = os.path.join(predownload_root, tokenizer_subdir)
# exist_ok lets the script be re-run when the directory is already present.
os.makedirs(tokenizer_path, exist_ok=True)

tokenizer = AutoTokenizer.from_pretrained(model_id, **load_kwargs)
tokenizer.save_pretrained(tokenizer_path)
/*
 * NVIDIA Sans web font faces: light, regular and bold weights, each in
 * normal and italic styles.
 *
 * The light faces use the numeric weight 300 because "light" is not a valid
 * value for the font-weight descriptor.
 */
@font-face {
    font-family: "NVIDIA Sans";
    font-style: normal;
    src: url(https://brand-assets.cne.ngc.nvidia.com/assets/fonts/nvidia-sans/1.0.0/NVIDIASans_Lt.woff2);
    font-weight: 300;
}

@font-face {
    font-family: "NVIDIA Sans";
    font-style: italic;
    src: url(https://brand-assets.cne.ngc.nvidia.com/assets/fonts/nvidia-sans/1.0.0/NVIDIASans_LtIt.woff2);
    font-weight: 300;
}

@font-face {
    font-family: "NVIDIA Sans";
    font-style: normal;
    src: url(https://brand-assets.cne.ngc.nvidia.com/assets/fonts/nvidia-sans/1.0.0/NVIDIASans_Rg.woff2);
    font-weight: normal;
}

@font-face {
    font-family: "NVIDIA Sans";
    font-style: italic;
    src: url(https://brand-assets.cne.ngc.nvidia.com/assets/fonts/nvidia-sans/1.0.0/NVIDIASans_It.woff2);
    font-weight: normal;
}

@font-face {
    font-family: "NVIDIA Sans";
    font-style: normal;
    src: url(https://brand-assets.cne.ngc.nvidia.com/assets/fonts/nvidia-sans/1.0.0/NVIDIASans_Bd.woff2);
    font-weight: bold;
}

@font-face {
    font-family: "NVIDIA Sans";
    font-style: italic;
    src: url(https://brand-assets.cne.ngc.nvidia.com/assets/fonts/nvidia-sans/1.0.0/NVIDIASans_BdIt.woff2);
    font-weight: bold;
}
--jp-cell-editor-background: #f7f7f7; 4 | --jp-cell-editor-border-color: #cfcfcf; 5 | --jp-cell-prompt-fg-color: #303030; 6 | --jp-cell-prompt-bg-color: #f0f0f0; 7 | --jp-notebook-background: #ffffff; 8 | --jp-layout-color1: #ffffff; 9 | --jp-content-font-color1: #000000; 10 | } 11 | 12 | /* theme: dark */ 13 | body[data-md-color-scheme="dark"] .jupyter-notebook { 14 | --jp-cell-editor-background: #2b2b2b; 15 | --jp-cell-editor-border-color: #464646; 16 | --jp-cell-prompt-fg-color: #d7d7d7; 17 | --jp-cell-prompt-bg-color: #333333; 18 | --jp-notebook-background: #1e1e1e; 19 | --jp-layout-color1: #1e1e1e; 20 | --jp-content-font-color1: #d4d4d4; 21 | } 22 | -------------------------------------------------------------------------------- /docs/docs/extraction/contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing to NV-Ingest 2 | 3 | External contributions to NV-Ingest will be welcome soon, and they are greatly appreciated! 4 | For more information, refer to [Contributing to NV-Ingest](https://github.com/NVIDIA/nv-ingest/blob/main/CONTRIBUTING.md). 5 | -------------------------------------------------------------------------------- /docs/docs/extraction/example_processed_docs/text/multimodal_test.pdf.metadata.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/docs/docs/extraction/example_processed_docs/text/multimodal_test.pdf.metadata.json -------------------------------------------------------------------------------- /docs/docs/extraction/helm.md: -------------------------------------------------------------------------------- 1 | # Deploy With Helm for NeMo Retriever Extraction 2 | 3 | 4 | 5 | To deploy [NeMo Retriever extraction](overview.md) by using Helm, 6 | refer to [NV-Ingest Helm Charts](https://github.com/NVIDIA/nv-ingest/tree/main/helm). 
7 | -------------------------------------------------------------------------------- /docs/docs/extraction/images/audio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/docs/docs/extraction/images/audio.png -------------------------------------------------------------------------------- /docs/docs/extraction/images/generate_personal_key.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/docs/docs/extraction/images/generate_personal_key.png -------------------------------------------------------------------------------- /docs/docs/extraction/images/image_viewer_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/docs/docs/extraction/images/image_viewer_example.png -------------------------------------------------------------------------------- /docs/docs/extraction/images/overview-extraction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/docs/docs/extraction/images/overview-extraction.png -------------------------------------------------------------------------------- /docs/docs/extraction/images/overview-retriever.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/docs/docs/extraction/images/overview-retriever.png -------------------------------------------------------------------------------- /docs/docs/extraction/images/preview-image.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/docs/docs/extraction/images/preview-image.png -------------------------------------------------------------------------------- /docs/docs/extraction/images/prometheus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/docs/docs/extraction/images/prometheus.png -------------------------------------------------------------------------------- /docs/docs/extraction/images/test.pdf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/docs/docs/extraction/images/test.pdf.png -------------------------------------------------------------------------------- /docs/docs/extraction/images/zipkin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/docs/docs/extraction/images/zipkin.png -------------------------------------------------------------------------------- /docs/docs/extraction/ngc-api-key.md: -------------------------------------------------------------------------------- 1 | # Generate Your NGC Keys 2 | 3 | NGC contains many public images, models, and datasets that can be pulled immediately without authentication. 4 | To push and pull custom images, you must generate a key and authenticate with NGC. 5 | 6 | To create a key, go to [https://org.ngc.nvidia.com/setup/api-keys](https://org.ngc.nvidia.com/setup/api-keys). 7 | 8 | When you create an NGC key, select the following for **Services Included**. 9 | 10 | - **NGC Catalog** 11 | - **Public API Endpoints** 12 | 13 | !!! important 14 | 15 | Early Access participants must also select **Private Registry**. 
16 | 17 | ![Generate Personal Key](images/generate_personal_key.png) 18 | 19 | 20 | ## Docker Login to NGC 21 | 22 | To pull the NIM container image from NGC, use your key to log in to the NGC registry by entering the following command and then following the prompts. 23 | For the username, enter `$oauthtoken` exactly as shown. 24 | It is a special authentication key for all users. 25 | 26 | 27 | ```shell 28 | $ docker login nvcr.io 29 | Username: $oauthtoken 30 | Password: 31 | ``` 32 | -------------------------------------------------------------------------------- /docs/docs/extraction/notebooks.md: -------------------------------------------------------------------------------- 1 | # Notebooks for NeMo Retriever Extraction 2 | 3 | To get started using [NeMo Retriever extraction](overview.md), you can try one of the ready-made notebooks that are available. 4 | 5 | !!! note 6 | 7 | NeMo Retriever extraction is also known as NVIDIA Ingest and nv-ingest. 8 | 9 | 10 | To get started with the basics, try one of the following notebooks: 11 | 12 | - [NV-Ingest: CLI Client Quick Start Guide](https://github.com/NVIDIA/nv-ingest/blob/main/client/client_examples/examples/cli_client_usage.ipynb) 13 | - [NV-Ingest: Python Client Quick Start Guide](https://github.com/NVIDIA/nv-ingest/blob/main/client/client_examples/examples/python_client_usage.ipynb) 14 | 15 | For more advanced scenarios, try one of the following notebooks: 16 | 17 | - [Try out the NVIDIA Multimodal PDF Data Extraction Blueprint](https://github.com/NVIDIA/nv-ingest/blob/main/deploy/pdf-blueprint.ipynb) 18 | - [Evaluate bo767 retrieval recall accuracy with NV-Ingest and Milvus](https://github.com/NVIDIA/nv-ingest/blob/main/evaluation/bo767_recall.ipynb) 19 | - [Multimodal RAG with LangChain](https://github.com/NVIDIA/nv-ingest/blob/main/examples/langchain_multimodal_rag.ipynb) 20 | - [Multimodal RAG with LlamaIndex](https://github.com/NVIDIA/nv-ingest/blob/main/examples/llama_index_multimodal_rag.ipynb) 21 | 
22 | 23 | 24 | ## Related Topics 25 | 26 | - [Prerequisites](prerequisites.md) 27 | - [Support Matrix](support-matrix.md) 28 | - [Deploy Without Containers (Library Mode)](quickstart-library-mode.md) 29 | - [Deploy With Docker Compose (Self-Hosted)](quickstart-guide.md) 30 | - [Deploy With Helm](helm.md) 31 | -------------------------------------------------------------------------------- /docs/docs/extraction/prerequisites.md: -------------------------------------------------------------------------------- 1 | # Prerequisites for NeMo Retriever Extraction 2 | 3 | Before you begin using [NeMo Retriever extraction](overview.md), ensure the following software prerequisites are met. 4 | 5 | 6 | ## Software 7 | 8 | - Linux operating systems (Ubuntu 22.04 or later recommended) 9 | - [Docker](https://docs.docker.com/engine/install/) 10 | - [Docker Compose](https://docs.docker.com/compose/install/) 11 | - [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) (NVIDIA Driver >= `535`, CUDA >= `12.2`) 12 | - [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) 13 | - [Conda Python environment and package manager](https://github.com/conda-forge/miniforge) 14 | 15 | 16 | !!! note 17 | 18 | You install Python later. NV-Ingest only supports [Python version 3.10](https://www.python.org/downloads/release/python-3100/). 
19 | 20 | 21 | ## Related Topics 22 | 23 | - [Support Matrix](support-matrix.md) 24 | - [Deploy Without Containers (Library Mode)](quickstart-library-mode.md) 25 | - [Deploy With Docker Compose (Self-Hosted)](quickstart-guide.md) 26 | - [Deploy With Helm](helm.md) 27 | -------------------------------------------------------------------------------- /docs/docs/extraction/releasenotes-nv-ingest.md: -------------------------------------------------------------------------------- 1 | # Release Notes for NeMo Retriever Extraction 2 | 3 | This documentation contains the release notes for [NeMo Retriever extraction](overview.md). 4 | 5 | !!! note 6 | 7 | NeMo Retriever extraction is also known as NVIDIA Ingest and nv-ingest. 8 | 9 | ## Release 25.03 10 | 11 | ### Summary 12 | 13 | The NeMo Retriever extraction 25.03 release includes accuracy improvements, feature expansions, and throughput improvements. 14 | 15 | ### New Features 16 | 17 | - Consolidated NeMo Retriever extraction to run on a single GPU (H100, A100, L40S, or A10G). For details, refer to [Support Matrix](support-matrix.md). 18 | - Added Library Mode for a lightweight no-GPU deployment that uses NIM endpoints hosted on build.nvidia.com. For details, refer to [Deploy Without Containers (Library Mode)](quickstart-library-mode.md). 19 | - Added support for infographics extraction. 20 | - Added support for RIVA NIM for Audio extraction (Early Access). For details, refer to [Audio Processing](audio.md). 21 | - Added support for Llama-3.2 VLM for Image Captioning capability. 22 | - Added support for DOCX, PPTX, JPG, and PNG files for image detection and extraction. 23 | - Deprecated DePlot and CACHED NIMs. 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | ## Release 24.12.1 34 | 35 | ### Bug fixes 36 | 37 | Cases where .split() tasks fail during ingestion are now fixed. 38 | 39 | 40 | ## Release 24.12 41 | 42 | ### Known Issues 43 | 44 | We currently do not support OCR-based text extraction. 
This was discovered in an unsupported use case and is not a product functionality issue. 45 | -------------------------------------------------------------------------------- /docs/docs/extraction/telemetry.md: -------------------------------------------------------------------------------- 1 | # Telemetry with NeMo Retriever Extraction 2 | 3 | You can view telemetry data for [NeMo Retriever extraction](overview.md). 4 | 5 | !!! note 6 | 7 | NeMo Retriever extraction is also known as NVIDIA Ingest and nv-ingest. 8 | 9 | 10 | ## OpenTelemetry 11 | 12 | After OpenTelemetry and Zipkin are running, you can open your browser to explore traces: 13 | 14 | - **Docker** — Use http://$YOUR_DOCKER_HOST:9411/zipkin/ 15 | - **Kubernetes** — Use http://$YOUR_K8S_OTEL_POD:9411/zipkin/ 16 | 17 | ![](images/zipkin.png) 18 | 19 | ## Prometheus 20 | 21 | After Prometheus is running, you can open your browser to explore metrics: 22 | 23 | - **Docker** — Use http://$YOUR_DOCKER_HOST:9090/ 24 | - **Kubernetes** — Use http://$YOUR_K8S_OTEL_POD:9090/ 25 | 26 | ![](images/prometheus.png) 27 | -------------------------------------------------------------------------------- /docs/overrides/main.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | 4 | 5 | {% block extrahead %} 6 | 7 | 8 | 9 | {% endblock %} 10 | 11 | 12 | 13 | {% block footer %} 14 | 15 | {{ super() }} 16 | 17 | 18 | 19 | {% endblock %} 20 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | mkdocs-material 2 | mkdocs-macros-plugin 3 | mkdocs-minify-plugin 4 | mkdocstrings[python] 5 | mkdocs-gen-files 6 | pymdown-extensions 7 | mkdocs-jupyter 8 | mkdocs-include-dir-to-nav 9 | mkdocs-literate-nav 10 | mkdocs-site-urls 11 | mkdocs-redirects 12 | myst-parser 13 | nvidia-sphinx-theme 14 | sphinx 15 | 
sphinx-markdown-builder 16 | sphinx-rtd-theme 17 | swagger-plugin-for-sphinx 18 | -------------------------------------------------------------------------------- /docs/scripts/generate_openapi_docs.py: -------------------------------------------------------------------------------- 1 | import json 2 | import yaml 3 | import click 4 | import os 5 | 6 | from nv_ingest.api.main import app 7 | 8 | 9 | @click.command() 10 | @click.option("--output", default="openapi.yaml", help="Path to OpenAPI output file (default: openapi.yaml)") 11 | def write_openapi_schema(output): 12 | if os.path.isdir(output): 13 | print(f"Warning: '{output}' is a directory. Defaulting to '{output}/openapi.yaml'.") 14 | output = os.path.join(output, "openapi.yaml") 15 | 16 | # Determine format based on file extension 17 | if output.endswith(".yaml") or output.endswith(".yml"): 18 | with open(output, "w") as f: 19 | yaml.dump(app.openapi(), f, default_flow_style=False) 20 | print(f"OpenAPI YAML written to: {output}") 21 | else: 22 | with open(output, "w") as f: 23 | json.dump(app.openapi(), f, indent=4) 24 | print(f"OpenAPI JSON written to: {output}") 25 | 26 | 27 | if __name__ == "__main__": 28 | write_openapi_schema() 29 | -------------------------------------------------------------------------------- /docs/sphinx_docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | import os 6 | import sys 7 | 8 | sys.path.insert(0, os.path.abspath("../../../api/src")) # nv-ingest-api src 9 | sys.path.insert(1, os.path.abspath("../../../client/src")) # nv-ingest-client src 10 | sys.path.insert(2, os.path.abspath("../../../src")) # nv-ingest src 11 | 12 | project = "nv-ingest" 13 | copyright = "2025, Nvidia" 14 | author = "Nvidia" 15 | release = "24.12" 16 | 17 | # -- General configuration --------------------------------------------------- 18 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 19 | 20 | extensions = [ 21 | "myst_parser", 22 | "sphinx.ext.autodoc", 23 | "sphinx.ext.autosummary", 24 | "sphinx.ext.napoleon", 25 | "sphinx.ext.viewcode", 26 | "swagger_plugin_for_sphinx", 27 | ] 28 | 29 | templates_path = ["_templates"] 30 | exclude_patterns = [] 31 | 32 | 33 | # -- Options for HTML output ------------------------------------------------- 34 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 35 | 36 | html_theme = "nvidia_sphinx_theme" 37 | 38 | html_theme_options = { 39 | "header_links": [ 40 | ("Home", "index"), 41 | ("GitHub", "https://github.com/NVIDIA/nvidia-sphinx-theme", True, "fab fa-github"), 42 | ], 43 | "footer_links": [ 44 | ("Privacy Policy", "https://www.nvidia.com/en-us/about-nvidia/privacy-policy/"), 45 | ("Terms of Use", "https://www.nvidia.com/en-us/about-nvidia/legal-info/"), 46 | ], 47 | "show_prev_next": True, # Show next/previous buttons at bottom 48 | } 49 | 50 | html_static_path = ["_static"] 51 | -------------------------------------------------------------------------------- /docs/sphinx_docs/source/index.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | API reference 3 | =============== 4 | 5 | Provides API references for the `nv-ingest-api`, `nv-ingest-client`, and `nv-ingest` modules. 6 | 7 | .. 
toctree:: 8 | :maxdepth: 2 9 | :caption: NV-Ingest Packages 10 | 11 | nv-ingest-api/modules.rst 12 | nv-ingest-client/modules.rst 13 | nv-ingest/modules.rst 14 | 15 | -------------------------------------------------------------------------------- /docs/sphinx_docs/source/openapi.rst: -------------------------------------------------------------------------------- 1 | ================================== 2 | NV-Ingest OpenAPI reference 3 | ================================== 4 | 5 | .. swagger-plugin:: openapi.yaml 6 | :id: nv-ingest-openapi 7 | :page-title: NV-Ingest OpenAPI Reference 8 | -------------------------------------------------------------------------------- /examples/launch_libmode_service.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | import logging 6 | import os 7 | import sys 8 | 9 | from nv_ingest.framework.orchestration.morpheus.util.pipeline.pipeline_runners import ( 10 | PipelineCreationSchema, 11 | start_pipeline_subprocess, 12 | ) 13 | from nv_ingest_api.util.logging.configuration import configure_logging as configure_local_logging 14 | 15 | # Configure the logger 16 | logger = logging.getLogger(__name__) 17 | 18 | local_log_level = os.getenv("INGEST_LOG_LEVEL", "INFO") 19 | if local_log_level in ("DEFAULT",): 20 | local_log_level = "INFO" 21 | 22 | configure_local_logging(logger, local_log_level) 23 | 24 | 25 | def main(): 26 | try: 27 | # Possibly override config parameters 28 | config_data = {} 29 | 30 | # Filter out None values to let the schema defaults handle them 31 | config_data = {key: value for key, value in config_data.items() if value is not None} 32 | 33 | # Construct the pipeline configuration 34 | config = PipelineCreationSchema(**config_data) 35 | 36 | # Start the pipeline subprocess 37 | pipeline_process = 
start_pipeline_subprocess(config, stderr=sys.stderr, stdout=sys.stdout) 38 | 39 | pipeline_process.wait() 40 | 41 | # The main program will exit, and the atexit handler will terminate the subprocess group 42 | 43 | except Exception as e: 44 | logger.error(f"Error running pipeline subprocess or ingestion: {e}") 45 | 46 | # The atexit handler will ensure subprocess termination 47 | sys.exit(1) 48 | 49 | 50 | if __name__ == "__main__": 51 | main() 52 | -------------------------------------------------------------------------------- /helm/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | 25 | # Ignore shell scripts 26 | update_helm_readme.sh 27 | 28 | # Ignore temporary files 29 | *.tmp 30 | *.prefix 31 | *.suffix 32 | 33 | # Ignore editor files 34 | *.swp 35 | *.swo 36 | *~ 37 | 38 | # Ignore OS files 39 | .DS_Store 40 | Thumbs.db 41 | 42 | # Ignore time-slicing directory 43 | time-slicing/ 44 | 45 | # Ignore helm-docs template file 46 | README.md.gotmpl 47 | -------------------------------------------------------------------------------- /helm/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | 2 | # CHANGELOG 3 | 4 | ## v0.2.24 - 15 Aug 2024 5 | 6 | Update the handling of the default for the persistent volume name 7 | 8 | ## v0.2.23 - 15 Aug 2024 9 | 10 | Update for otel env vars 11 | -------------------------------------------------------------------------------- /helm/Chart.lock: 
-------------------------------------------------------------------------------- 1 | dependencies: 2 | - name: nvidia-nim-nemoretriever-page-elements-v2 3 | repository: https://helm.ngc.nvidia.com/nim/nvidia 4 | version: 1.2.0 5 | - name: nvidia-nim-nemoretriever-graphic-elements-v1 6 | repository: https://helm.ngc.nvidia.com/nim/nvidia 7 | version: 1.2.0 8 | - name: nvidia-nim-nemoretriever-table-structure-v1 9 | repository: https://helm.ngc.nvidia.com/nim/nvidia 10 | version: 1.2.0 11 | - name: nim-vlm 12 | repository: https://helm.ngc.nvidia.com/nvidia/nemo-microservices 13 | version: 1.2.0-ea-v2 14 | - name: nvidia-nim-paddleocr 15 | repository: https://helm.ngc.nvidia.com/nim/baidu 16 | version: 1.2.0 17 | - name: nvidia-nim-nv-embedqa-e5-v5 18 | repository: https://helm.ngc.nvidia.com/nim/nvidia 19 | version: 1.5.0 20 | - name: nvidia-nim-llama-32-nv-embedqa-1b-v2 21 | repository: https://helm.ngc.nvidia.com/nim/nvidia 22 | version: 1.5.0 23 | - name: riva-nim 24 | repository: https://helm.ngc.nvidia.com/nim/nvidia 25 | version: 1.0.0 26 | - name: milvus 27 | repository: https://zilliztech.github.io/milvus-helm 28 | version: 4.1.11 29 | - name: redis 30 | repository: oci://registry-1.docker.io/bitnamicharts 31 | version: 19.1.3 32 | - name: zipkin 33 | repository: https://zipkin.io/zipkin-helm 34 | version: 0.1.2 35 | - name: opentelemetry-collector 36 | repository: https://open-telemetry.github.io/opentelemetry-helm-charts 37 | version: 0.78.1 38 | digest: sha256:7675e65058740aa9ab90e4f3b458f226bd2dd9a992a3ea7353dd2de6f732a26f 39 | generated: "2025-05-01T10:32:44.178383534-04:00" 40 | -------------------------------------------------------------------------------- /helm/LICENSE: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /helm/mig/nv-ingest-mig-config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: nv-ingest-mig-config 5 | data: 6 | config.yaml: | 7 | version: v1 8 | mig-configs: 9 | all-disabled: 10 | - devices: all 11 | mig-enabled: false 12 | 13 | single-gpu-nv-ingest: 14 | - devices: [0] 15 | mig-enabled: true 16 | mig-devices: 17 | "1g.10gb": 7 18 | - devices: [1] 19 | mig-enabled: true 20 | mig-devices: 21 | "7g.80gb": 1 22 | - devices: [2] 23 | mig-enabled: true 24 | mig-devices: 25 | "7g.80gb": 1 26 | - devices: [3] 27 | mig-enabled: true 28 | mig-devices: 29 | "7g.80gb": 1 30 | - devices: [4] 31 | mig-enabled: true 32 | mig-devices: 33 | "7g.80gb": 1 34 | - devices: [5] 35 | mig-enabled: true 36 | mig-devices: 37 | "7g.80gb": 1 38 | - devices: [6] 39 | mig-enabled: true 40 | mig-devices: 41 | "7g.80gb": 1 42 | - devices: [7] 43 | mig-enabled: true 44 | mig-devices: 45 | "7g.80gb": 1 46 | -------------------------------------------------------------------------------- /helm/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | Installed {{ .Chart.Name }}-{{ .Chart.Version }}, named {{ .Release.Name }}. 
2 | Visit the application via: 3 | {{- if .Values.ingress.enabled }} 4 | {{- range $host := .Values.ingress.hosts }} 5 | {{- range .paths }} 6 | http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }} 7 | {{- end }} 8 | {{- end }} 9 | {{- else if and .Values.virtualService .Values.virtualService.enabled }} 10 | https://{{ .Values.virtualService.dnsName }} 11 | {{- end }} 12 | 13 | To learn more about the release, try: 14 | 15 | $ helm status {{ .Release.Name }} 16 | $ helm get {{ .Release.Name }} 17 | $ helm test {{ .Release.Name }} 18 | -------------------------------------------------------------------------------- /helm/templates/configmap.yaml: -------------------------------------------------------------------------------- 1 | {{- if not .Values.extraEnvVarsCM }} 2 | --- 3 | apiVersion: v1 4 | kind: ConfigMap 5 | metadata: 6 | name: {{ include "nv-ingest.fullname" . }} 7 | data: 8 | {{- end }} 9 | -------------------------------------------------------------------------------- /helm/templates/hpa.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.autoscaling.enabled }} 2 | --- 3 | apiVersion: autoscaling/v2 4 | kind: HorizontalPodAutoscaler 5 | metadata: 6 | name: {{ include "nv-ingest.fullname" . }} 7 | labels: 8 | {{- include "nv-ingest.labels" . | nindent 4 }} 9 | spec: 10 | scaleTargetRef: 11 | apiVersion: apps/v1 12 | kind: Deployment 13 | name: {{ include "nv-ingest.fullname" . }} 14 | minReplicas: {{ .Values.autoscaling.minReplicas }} 15 | maxReplicas: {{ .Values.autoscaling.maxReplicas }} 16 | metrics: 17 | {{- range .Values.autoscaling.metrics }} 18 | - {{- . 
| toYaml | nindent 10 }} 19 | {{- end }} 20 | {{- end }} 21 | -------------------------------------------------------------------------------- /helm/templates/secrets.yaml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | {{ if .Values.ngcImagePullSecret.create -}} 6 | --- 7 | apiVersion: v1 8 | kind: Secret 9 | metadata: 10 | name: ngc-secret # name expected by NIMs 11 | type: kubernetes.io/dockerconfigjson 12 | data: 13 | .dockerconfigjson: {{ template "nv-ingest.ngcImagePullSecret" . }} 14 | {{- end }} 15 | 16 | 17 | {{ if and .Values.ngcApiSecret.create -}} 18 | --- 19 | apiVersion: v1 20 | kind: Secret 21 | metadata: 22 | name: ngc-api # Name expected by NIMs 23 | type: Opaque 24 | stringData: 25 | NGC_CLI_API_KEY: {{ template "nv-ingest.ngcApiSecret" . }} 26 | NGC_API_KEY: {{ template "nv-ingest.ngcApiSecret" . }} 27 | {{- end }} 28 | -------------------------------------------------------------------------------- /helm/templates/service.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: Service 4 | metadata: 5 | name: {{ .Values.service.name | default (include "nv-ingest.fullname" .) }} 6 | labels: 7 | {{- include "nv-ingest.labels" . 
| nindent 4 }} 8 | {{- if .Values.service.labels }} 9 | {{- toYaml .Values.service.labels | nindent 4 }} 10 | {{- end }} 11 | annotations: 12 | {{- if .Values.service.annotations }} 13 | {{- toYaml .Values.service.annotations | nindent 4 }} 14 | {{- end }} 15 | spec: 16 | type: {{ .Values.service.type }} 17 | ports: 18 | {{- if .Values.service.port }} 19 | - port: {{ .Values.service.port }} 20 | targetPort: http 21 | protocol: TCP 22 | name: nv-ingest-http 23 | {{- end }} 24 | {{- if .Values.service.nodePort }} 25 | {{- with .Values.service.nodePort }} 26 | nodePort: {{ . }} 27 | {{- end }} 28 | {{- end }} 29 | selector: 30 | {{- include "nv-ingest.selectorLabels" . | nindent 4 }} 31 | -------------------------------------------------------------------------------- /helm/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.serviceAccount.create -}} 2 | --- 3 | apiVersion: v1 4 | kind: ServiceAccount 5 | metadata: 6 | name: {{ include "nv-ingest.serviceAccountName" . }} 7 | labels: 8 | {{- include "nv-ingest.labels" . | nindent 4 }} 9 | {{- with .Values.serviceAccount.annotations }} 10 | annotations: 11 | {{- toYaml . 
| nindent 4 }} 12 | {{- end }} 13 | automountServiceAccountToken: {{ .Values.serviceAccount.automount }} 14 | {{- end }} 15 | -------------------------------------------------------------------------------- /helm/time-slicing/time-slicing-config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: time-slicing-config 5 | data: 6 | any: |- 7 | version: v1 8 | flags: 9 | migStrategy: none 10 | sharing: 11 | timeSlicing: 12 | renameByDefault: false 13 | failRequestsGreaterThanOne: false 14 | resources: 15 | - name: nvidia.com/gpu 16 | replicas: 16 17 | -------------------------------------------------------------------------------- /helm/update_helm_readme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # script for updating the helm README.md file with the helm-docs 4 | # More complicated tasks are envisioned and hence the existence of this script given its simple nature 5 | 6 | helm-docs 7 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 120 3 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = 3 | api/tests 4 | tests 5 | markers = 6 | integration: mark a test as an integration test 7 | addopts = -m "not integration" 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0

import datetime
import os
import re

from setuptools import find_packages
from setuptools import setup


def get_version():
    """Build the package version string from environment variables.

    Environment variables read:
        NV_INGEST_RELEASE_TYPE: ``"dev"`` (default) or ``"release"``.
        NV_INGEST_VERSION: Base version in ``YYYY.M.D`` / ``YYYY.MM.DD`` form.
            Defaults to today's date when unset or empty.
        NV_INGEST_REV: Non-negative integer revision. Defaults to ``0`` when
            unset or empty.

    Returns:
        ``<version>.dev<rev>`` for dev builds. For release builds,
        ``<version>.post<rev>`` when the revision is greater than zero,
        otherwise ``<version>`` alone.

    Raises:
        ValueError: If the base version, the revision, or the release type is
            not valid.
    """
    release_type = os.getenv("NV_INGEST_RELEASE_TYPE", "dev")
    version = os.getenv("NV_INGEST_VERSION")
    # An empty NV_INGEST_REV is treated like an unset one.
    rev = os.getenv("NV_INGEST_REV") or "0"

    if not version:
        version = datetime.datetime.now().strftime("%Y.%m.%d")

    # Only date-style base versions are accepted; this is a strict subset of PEP 440.
    # fullmatch() is used because match() with a "$" anchor would also accept a
    # trailing newline, and re.ASCII keeps "\d" from matching non-ASCII digits.
    if not re.fullmatch(r"\d{4}\.\d{1,2}\.\d{1,2}", version, flags=re.ASCII):
        raise ValueError(f"Version '{version}' is not PEP 440 compatible")

    # Validate the revision up front so both release types fail the same way,
    # instead of dev builds silently producing an invalid version such as "...devabc".
    if not re.fullmatch(r"\d+", rev, flags=re.ASCII):
        raise ValueError(f"Revision '{rev}' is not a non-negative integer")

    # Construct the final version string
    if release_type == "dev":
        final_version = f"{version}.dev{rev}"
    elif release_type == "release":
        final_version = f"{version}.post{rev}" if int(rev) > 0 else version
    else:
        raise ValueError(f"Invalid release type: {release_type}")

    return final_version


def read_requirements(file_name):
    """Read a requirements file and return a list of its requirement lines.

    Surrounding whitespace is stripped, and blank lines and ``#`` comment lines
    are skipped so they never end up in ``install_requires``.
    """
    with open(file_name, encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        return [line for line in stripped_lines if line and not line.startswith("#")]


# Specify your requirements files
requirements_files = []

# Read and combine requirements from all specified files
combined_requirements = []
for file in requirements_files:
    combined_requirements.extend(read_requirements(file))

# De-duplicate, and sort so the resulting metadata is deterministic between builds.
combined_requirements = sorted(set(combined_requirements))

setup(
    author="Devin Robison",
    author_email="drobison@nvidia.com",
    classifiers=[],
    description="Python module supporting document ingestion",
    install_requires=combined_requirements,
    license="Apache-2.0",
    name="nv_ingest",
    package_dir={"": "src"},
    packages=find_packages(where="src"),
    python_requires=">=3.10",
    version=get_version(),
)
-------------------------------------------------------------------------------- /skaffold/README.md: -------------------------------------------------------------------------------- 1 | # Skaffold - NV-Ingest Development Team Only 2 | 3 | 4 | Skaffold is intended to support the NV-Ingest development team with Kubernetes development and testing. It is not meant to be used in production deployments nor for local testing. 5 | 6 | We offer Kubernetes support through Helm and you can find those instructions at [Helm Documentation](../helm/README.md). 7 | -------------------------------------------------------------------------------- /skaffold/sensitive/.gitignore: -------------------------------------------------------------------------------- 1 | *.yaml 2 | -------------------------------------------------------------------------------- /src/ingest_pipeline_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "image_caption_extraction_module": {}, 3 | "image_storage_module": {}, 4 | "metadata_injection_module": {}, 5 | "pdf_extractor_module": {}, 6 | "redis_task_sink": {}, 7 | "redis_task_source": {}, 8 | "text_splitting_module": {}, 9 | "otel_meter_module": {}, 10 | "embed_extractions_module": {}, 11 | "vdb_task_sink_module": {} 12 | } 13 | -------------------------------------------------------------------------------- /src/nv_ingest/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | import warnings 6 | 7 | 8 | # Suppressing CUDA-related warnings when running NV-Ingest on a CPU-only system. 9 | # 10 | # The warnings originate from Numba, which attempts to initialize CUDA even if no GPU is available. 11 | # These warnings include errors about missing CUDA drivers or failing to dlopen `libcuda.so.1`. 
12 | # 13 | # By temporarily ignoring `UserWarning` during the import, we prevent unnecessary clutter in logs 14 | # while ensuring that cuDF still functions in CPU mode. 15 | # 16 | # Note: This does not affect cuDF behavior - it will still fall back to CPU execution if no GPU is detected. 17 | with warnings.catch_warnings(): 18 | warnings.simplefilter("ignore", category=UserWarning) 19 | import cudf 20 | -------------------------------------------------------------------------------- /src/nv_ingest/api/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /src/nv_ingest/api/main.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | import logging 6 | import os 7 | 8 | from fastapi import FastAPI 9 | from opentelemetry import trace 10 | from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter 11 | from opentelemetry.sdk.resources import Resource 12 | from opentelemetry.sdk.trace import TracerProvider 13 | from opentelemetry.sdk.trace.export import BatchSpanProcessor 14 | 15 | from .v1.health import router as HealthApiRouter 16 | from .v1.ingest import router as IngestApiRouter 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | # nv-ingest FastAPI app declaration 21 | app = FastAPI( 22 | title="NV-Ingest Microservice", 23 | description="Service for ingesting heterogenous datatypes", 24 | version="25.4.2", 25 | contact={ 26 | "name": "NVIDIA Corporation", 27 | "url": "https://nvidia.com", 28 | }, 29 | docs_url="/docs", 30 | ) 31 | 32 | app.include_router(IngestApiRouter, prefix="/v1") 33 | app.include_router(HealthApiRouter, prefix="/v1/health") 34 | 35 | # Set up the tracer provider and add a processor for exporting traces 36 | resource = Resource(attributes={"service.name": "nv-ingest"}) 37 | trace.set_tracer_provider(TracerProvider(resource=resource)) 38 | tracer = trace.get_tracer(__name__) 39 | 40 | otel_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "otel-collector:4317") 41 | exporter = OTLPSpanExporter(endpoint=otel_endpoint, insecure=True) 42 | span_processor = BatchSpanProcessor(exporter) 43 | trace.get_tracer_provider().add_span_processor(span_processor) 44 | -------------------------------------------------------------------------------- /src/nv_ingest/api/v1/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /src/nv_ingest/framework/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /src/nv_ingest/framework/orchestration/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /src/nv_ingest/framework/orchestration/morpheus/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /src/nv_ingest/framework/orchestration/morpheus/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /src/nv_ingest/framework/orchestration/morpheus/modules/injectors/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0


import logging

import mrc
from morpheus.utils.module_utils import ModuleLoaderFactory
from morpheus.utils.module_utils import register_module

from nv_ingest.framework.schemas.framework_task_injection_schema import TaskInjectionSchema
from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_context_manager
from nv_ingest.framework.orchestration.morpheus.util.modules.config_validator import (
    fetch_and_validate_module_config,
)
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage

logger = logging.getLogger(__name__)

MODULE_NAME = "task_injection"
MODULE_NAMESPACE = "nv_ingest"

TaskInjectorLoaderFactory = ModuleLoaderFactory(MODULE_NAME, MODULE_NAMESPACE, TaskInjectionSchema)


def on_data(message: IngestControlMessage):
    """
    Look up the ``task_meta`` metadata entry on the message and pass the message through.

    Parameters
    ----------
    message : IngestControlMessage
        The control message flowing through the node.

    Returns
    -------
    IngestControlMessage
        The same message object that was passed in.
    """
    # The looked-up value is discarded; only the lookup itself is performed.
    # NOTE(review): this looks like a placeholder -- confirm whether task
    # injection is expected to act on the metadata.
    message.get_metadata("task_meta")

    return message


@register_module(MODULE_NAME, MODULE_NAMESPACE)
def _task_injection(builder: mrc.Builder):
    """
    Build the task-injection module.

    Validates the module configuration against ``TaskInjectionSchema``, wraps
    ``on_data`` with the failure-annotation and tracing decorators, and
    registers a single node that serves as both the module input and output.

    Parameters
    ----------
    builder : mrc.Builder
        Builder used to read the module configuration and create the node.

    Raises
    ------
    ValueError
        Propagated from ``fetch_and_validate_module_config`` when the module
        configuration is invalid.
    """
    validated_config = fetch_and_validate_module_config(builder, TaskInjectionSchema)

    @nv_ingest_node_failure_context_manager(
        annotation_id=MODULE_NAME,
        raise_on_failure=validated_config.raise_on_failure,
    )
    @traceable(MODULE_NAME)
    def _on_data(ctrl_msg: IngestControlMessage):
        return on_data(ctrl_msg)

    # Register the decorated wrapper. Passing the bare ``on_data`` here would
    # bypass both decorators and leave ``raise_on_failure`` without effect.
    # NOTE(review): the node name "vdb_resource_tagging" does not match
    # MODULE_NAME and looks copied from another module; kept unchanged so the
    # node's registered name stays the same.
    node = builder.make_node("vdb_resource_tagging", _on_data)

    builder.register_module_input("input", node)
    builder.register_module_output("output", node)
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | from .message_broker_task_source import MessageBrokerTaskSourceLoaderFactory 6 | 7 | __all__ = ["MessageBrokerTaskSourceLoaderFactory"] 8 | -------------------------------------------------------------------------------- /src/nv_ingest/framework/orchestration/morpheus/modules/storages/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/src/nv_ingest/framework/orchestration/morpheus/modules/storages/__init__.py -------------------------------------------------------------------------------- /src/nv_ingest/framework/orchestration/morpheus/modules/telemetry/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/src/nv_ingest/framework/orchestration/morpheus/modules/telemetry/__init__.py -------------------------------------------------------------------------------- /src/nv_ingest/framework/orchestration/morpheus/modules/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | from .text_splitter import TextSplitterLoaderFactory 6 | 7 | __all__ = ["TextSplitterLoaderFactory"] 8 | -------------------------------------------------------------------------------- /src/nv_ingest/framework/orchestration/morpheus/stages/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /src/nv_ingest/framework/orchestration/morpheus/stages/extractors/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /src/nv_ingest/framework/orchestration/morpheus/stages/meta/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /src/nv_ingest/framework/orchestration/morpheus/stages/meta/linear_module_source_stage_cpu.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0

from morpheus.config import ExecutionMode
from morpheus.stages.general.linear_modules_source import LinearModuleSourceStage
from morpheus.stages.general.linear_modules_stage import LinearModulesStage


class LinearModuleSourceStageCPU(LinearModuleSourceStage):
    """``LinearModuleSourceStage`` variant that reports CPU as its only execution mode."""

    def supported_execution_modes(self) -> tuple[ExecutionMode]:
        """Return a one-element tuple containing ``ExecutionMode.CPU``."""
        cpu_only = (ExecutionMode.CPU,)
        return cpu_only


class LinearModuleStageCPU(LinearModulesStage):
    """``LinearModulesStage`` variant that reports CPU as its only execution mode."""

    def supported_execution_modes(self) -> tuple[ExecutionMode]:
        """Return a one-element tuple containing ``ExecutionMode.CPU``."""
        cpu_only = (ExecutionMode.CPU,)
        return cpu_only
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging

import mrc
from pydantic import ValidationError

logger = logging.getLogger(__name__)


def fetch_and_validate_module_config(builder: mrc.Builder, schema_class):
    """
    Validates the configuration of a module using a specified Pydantic schema class.

    Parameters
    ----------
    builder : object
        The builder object used to access the current module's configuration.
    schema_class : Pydantic BaseModel
        The schema class to be used for validating the module configuration.

    Returns
    -------
    object
        An instance of ``schema_class`` built from the module configuration.

    Raises
    ------
    ValueError
        If the module configuration fails validation according to the schema class.
        The original ``ValidationError`` is attached as ``__cause__``.
    """
    module_config = builder.get_current_module_config()
    try:
        validated_config = schema_class(**module_config)
    except ValidationError as e:
        # ``loc`` can be an empty tuple for errors not tied to a single field;
        # indexing it unconditionally would raise IndexError and hide the real
        # validation failure.
        error_messages = "; ".join(
            f"{error['loc'][0] if error['loc'] else '__root__'}: {error['msg']}" for error in e.errors()
        )
        log_error_message = f"Invalid configuration: {error_messages}"
        logger.error(log_error_message)
        raise ValueError(log_error_message) from e

    return validated_config
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Imported explicitly so this module no longer depends on the wildcard import
# below to supply ``logging`` and ``os``.
import logging
import os

from morpheus.utils.logger import configure_logging

from nv_ingest_api.util.logging.configuration import configure_logging as configure_local_logging

# Kept for backward compatibility: ``logger`` (used in ``setup_logging``) is
# not defined in this module, so it is presumably supplied by this wildcard
# import, along with any other names callers may import from here.
from nv_ingest.framework.orchestration.morpheus.util.pipeline.stage_builders import *  # noqa: F401,F403

# Convert log level from string to logging level
_log_level_mapping = {
    "DEBUG": logging.DEBUG,
    "INFO": logging.INFO,
    "WARNING": logging.WARNING,
    "ERROR": logging.ERROR,
    "CRITICAL": logging.CRITICAL,
}


def get_log_level(str_level):
    """
    Converts the log level from a string to a logging level.

    The lookup is case-insensitive; unrecognised names map to ``logging.INFO``.
    """
    return _log_level_mapping.get(str_level.upper(), logging.INFO)


def setup_logging(log_level):
    """
    Configures logging based on the provided log level or the INGEST_LOG_LEVEL environment variable.

    A non-empty ``INGEST_LOG_LEVEL`` takes precedence over ``log_level``. A
    missing or empty level, or the literal ``"DEFAULT"``, is treated as ``"INFO"``.

    Parameters
    ----------
    log_level : str or None
        Requested level name, e.g. ``"DEBUG"``.
    """
    # Check for INGEST_LOG_LEVEL environment variable
    env_log_level = os.getenv("INGEST_LOG_LEVEL", log_level)
    if env_log_level:
        log_level = env_log_level
    # ``not log_level`` covers None, which would otherwise fail on ``.upper()``.
    if not log_level or log_level in ("DEFAULT",):
        log_level = "INFO"

    log_level_value = get_log_level(log_level)
    logging.basicConfig(level=log_level_value, format="%(asctime)s - %(levelname)s - %(message)s")
    configure_logging(log_level=log_level_value)
    configure_local_logging(logger, log_level_value)  # noqa: F405
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | 6 | from pydantic import Field, BaseModel 7 | 8 | from typing_extensions import Annotated 9 | 10 | from nv_ingest_api.internal.schemas.message_brokers.message_broker_client_schema import MessageBrokerClientSchema 11 | 12 | 13 | class MessageBrokerTaskSinkSchema(BaseModel): 14 | broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema() 15 | 16 | raise_on_failure: bool = False 17 | 18 | progress_engines: Annotated[int, Field(ge=1)] = 6 19 | -------------------------------------------------------------------------------- /src/nv_ingest/framework/schemas/framework_message_broker_source_schema.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | 6 | from pydantic import Field, BaseModel 7 | 8 | from typing_extensions import Annotated 9 | 10 | from nv_ingest_api.internal.schemas.message_brokers.message_broker_client_schema import MessageBrokerClientSchema 11 | 12 | 13 | class MessageBrokerTaskSourceSchema(BaseModel): 14 | broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema() 15 | 16 | task_queue: str = "morpheus_task_queue" 17 | raise_on_failure: bool = False 18 | 19 | progress_engines: Annotated[int, Field(ge=1)] = 6 20 | -------------------------------------------------------------------------------- /src/nv_ingest/framework/schemas/framework_message_wrapper_schema.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class MessageWrapper(BaseModel): 5 | payload: str 6 | -------------------------------------------------------------------------------- /src/nv_ingest/framework/schemas/framework_metadata_injector_schema.py: -------------------------------------------------------------------------------- 1 | # 
SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | 6 | import logging 7 | 8 | from pydantic import ConfigDict, BaseModel 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class MetadataInjectorSchema(BaseModel): 14 | raise_on_failure: bool = False 15 | model_config = ConfigDict(extra="forbid") 16 | -------------------------------------------------------------------------------- /src/nv_ingest/framework/schemas/framework_otel_meter_schema.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | 6 | from pydantic import ConfigDict, BaseModel 7 | 8 | from nv_ingest_api.internal.schemas.message_brokers.message_broker_client_schema import MessageBrokerClientSchema 9 | 10 | 11 | class OpenTelemetryMeterSchema(BaseModel): 12 | broker_client: MessageBrokerClientSchema = MessageBrokerClientSchema() 13 | 14 | otel_endpoint: str = "localhost:4317" 15 | raise_on_failure: bool = False 16 | model_config = ConfigDict(extra="forbid") 17 | -------------------------------------------------------------------------------- /src/nv_ingest/framework/schemas/framework_otel_tracer_schema.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
class ConversionStatus(str, Enum):
    """
    Status values for a processing job.

    Members are also ``str`` instances, so they compare equal to their raw
    string values.
    """

    # ``model_config`` is a pydantic model setting, not an enum value. Assigning
    # it inside an Enum body creates an extra member, so it must not appear here.
    IN_PROGRESS = "in_progress"
    SUCCESS = "success"
    FAILED = "failed"
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | 6 | import logging 7 | 8 | from pydantic import ConfigDict, BaseModel 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class TaskInjectionSchema(BaseModel): 14 | raise_on_failure: bool = False 15 | model_config = ConfigDict(extra="forbid") 16 | -------------------------------------------------------------------------------- /src/nv_ingest/framework/util/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /src/nv_ingest/framework/util/flow_control/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | 6 | from .filter_by_task import filter_by_task 7 | 8 | __all__ = ["filter_by_task"] 9 | -------------------------------------------------------------------------------- /src/nv_ingest/framework/util/service/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /src/nv_ingest/framework/util/service/impl/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /src/nv_ingest/framework/util/service/impl/ingest/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /src/nv_ingest/framework/util/service/meta/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /src/nv_ingest/framework/util/service/meta/ingest/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /src/nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0

from abc import ABC
from abc import abstractmethod
from typing import List, Optional

from nv_ingest.framework.schemas.framework_message_wrapper_schema import MessageWrapper
from nv_ingest.framework.schemas.framework_processing_job_schema import ProcessingJob
from nv_ingest_api.util.service_clients.client_base import FetchMode


class IngestServiceMeta(ABC):
    # Abstract interface for an ingest service. Every method is an
    # ``async`` abstract method; concrete subclasses must implement all of them.

    @abstractmethod
    async def submit_job(self, job_spec: MessageWrapper, trace_id: str) -> str:
        """Submit the job described by ``job_spec`` with the given ``trace_id`` and return a string."""
        # NOTE(review): the returned string is presumably the job id -- confirm against implementations.

    @abstractmethod
    async def fetch_job(self, job_id: str):
        """Fetch a job from the ingestion service based on ``job_id``."""

    @abstractmethod
    async def set_processing_cache(self, job_id: str, jobs_data: List[ProcessingJob]) -> None:
        """Store ``jobs_data`` in the processing cache under ``job_id``."""

    @abstractmethod
    async def get_processing_cache(self, job_id: str) -> List[ProcessingJob]:
        """Return the list of ``ProcessingJob`` entries held in the processing cache for ``job_id``."""

    @abstractmethod
    async def set_job_state(self, job_id: str, state: str, ttl: int = 86400):
        """Record ``state`` for ``job_id``; ``ttl`` defaults to 86400."""
        # NOTE(review): ttl is presumably in seconds (86400 = one day) -- confirm.

    @abstractmethod
    async def get_job_state(self, job_id: str) -> Optional[str]:
        """Return the state for ``job_id``, or ``None``."""

    @abstractmethod
    async def get_fetch_mode(self) -> FetchMode:
        """Return the ``FetchMode`` used by this service."""
3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /src/util/image_model_validation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nv-ingest/be6d07cc2f0c4b300467baa4d0e5244d54f86f61/src/util/image_model_validation/__init__.py -------------------------------------------------------------------------------- /src/util/image_model_validation/deplot.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | import logging 6 | 7 | import click 8 | from util import display_image 9 | from util import initialize_triton_client 10 | from util import load_and_preprocess_image 11 | from util import perform_inference 12 | from util import prepare_input_tensor 13 | from util import prepare_output_tensor 14 | from util import print_output_results 15 | from util import validate_output 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | @click.command() 21 | @click.argument("image_path", type=click.Path(exists=True)) 22 | @click.option("--display", is_flag=True, help="Display the image before sending it for inference.") 23 | def main(image_path, display): 24 | # Configuration 25 | url = "localhost:8004" 26 | model_name = "deplot" 27 | batch_size = 1 28 | target_img_size = (1024, 1024) 29 | 30 | # Workflow 31 | triton_client = initialize_triton_client(url) 32 | resized_image, input_data = load_and_preprocess_image(image_path, target_img_size) 33 | 34 | if display: 35 | display_image(resized_image) 36 | 37 | input_dims = input_data.shape[1:] # Exclude batch dimension 38 | logger.info(f"Detected input dimensions: {input_dims}") 39 | 40 | inputs = prepare_input_tensor(input_data) 41 | outputs = prepare_output_tensor() 42 | 43 | 
results = perform_inference(triton_client, model_name, inputs, outputs) 44 | output_data = results.as_numpy("output") 45 | 46 | validate_output(output_data, batch_size) 47 | print_output_results(output_data) 48 | 49 | 50 | if __name__ == "__main__": 51 | main() 52 | -------------------------------------------------------------------------------- /src/util/image_model_validation/paddle.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | import click 6 | import numpy as np 7 | from util import display_image 8 | from util import initialize_triton_client 9 | from util import load_and_preprocess_image 10 | from util import perform_inference 11 | from util import prepare_input_tensor 12 | from util import prepare_output_tensor 13 | from util import print_output_results 14 | from util import validate_output 15 | 16 | 17 | @click.command() 18 | @click.argument("image_path", type=click.Path(exists=True)) 19 | @click.option("--display", is_flag=True, help="Display the image before sending it for inference.") 20 | def main(image_path, display): 21 | # Triton server URL and Model details 22 | url = "localhost:8010" 23 | model_name = "paddle" 24 | batch_size = 1 25 | target_img_size = (1024, 1024) 26 | 27 | # Load and preprocess image 28 | resized_image, input_data = load_and_preprocess_image(image_path, target_img_size) 29 | resized_images = np.expand_dims(resized_image, axis=0) # Add batch dimension 30 | 31 | # Optionally display the image 32 | if display: 33 | display_image(resized_image) 34 | 35 | # Detect input dimensions from the loaded image 36 | input_dims = input_data.shape[1:] # Exclude the batch dimension 37 | print(f"Detected input dimensions: {input_dims}") 38 | 39 | # Initialize Triton gRPC client 40 | triton_client = initialize_triton_client(url) 41 | 42 | # Prepare input 
and output tensors 43 | inputs = prepare_input_tensor(resized_images) 44 | outputs = prepare_output_tensor() 45 | 46 | # Call the Triton server for inference 47 | results = perform_inference(triton_client, model_name, inputs, outputs) 48 | 49 | # Get output data 50 | output_data = results.as_numpy("output") 51 | 52 | # Validate output size 53 | validate_output(output_data, batch_size) 54 | 55 | # Print the output results 56 | print_output_results(output_data) 57 | 58 | 59 | if __name__ == "__main__": 60 | main() 61 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /tests/functional/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /tests/functional/test_ingest_pipeline.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0

# redis config
_DEFAULT_REDIS_HOST = "redis"
_DEFAULT_REDIS_PORT = 6379

# job config
_DEFAULT_TASK_QUEUE = "morpheus_task_queue"
_DEFAULT_JOB_TIMEOUT = 90

# extract_config
_DEFAULT_EXTRACT_PAGE_DEPTH = "document"
_DEFAULT_EXTRACT_TABLES_METHOD = "yolox"

# split config
_DEFAULT_SPLIT_BY = "word"
_DEFAULT_SPLIT_LENGTH = 300
_DEFAULT_SPLIT_OVERLAP = 10
_DEFAULT_SPLIT_MAX_CHARACTER_LENGTH = 5000
_DEFAULT_SPLIT_SENTENCE_WINDOW_SIZE = 0

# file config
_VALIDATION_PDF = "data/functional_validation.pdf"
_VALIDATION_JSON = "data/functional_validation.json"


def remove_keys(data, keys_to_remove):
    """Return a copy of ``data`` with the listed keys stripped at every depth.

    Dicts are rebuilt without any entry whose key is in ``keys_to_remove``,
    lists are rebuilt element by element, and every other value is returned
    unchanged.
    """
    if isinstance(data, list):
        return [remove_keys(element, keys_to_remove) for element in data]
    if not isinstance(data, dict):
        return data

    pruned = {}
    for key, value in data.items():
        if key in keys_to_remove:
            continue
        pruned[key] = remove_keys(value, keys_to_remove)
    return pruned

# --------------------------------------------------------------------------------
# /tests/import_checks.py:
# --------------------------------------------------------------------------------
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0


def check_morpheus_import():
    """Return True when ``morpheus`` imports and exposes ``_version``.

    Any exception raised while probing is printed and reported as False.
    """
    try:
        import morpheus

        _ = morpheus._version

        return True
    except Exception as e:
        print(f"\nError: {e}\n", flush=True)
        return False


def check_cuda_driver():
    """Return True when ``cupy`` and ``cudf`` import and basic calls succeed.

    The probe queries the driver version through cupy and builds a small
    cudf DataFrame; any exception is printed and reported as False.
    """
    try:
        import cupy

        import cudf

        _ = cupy.cuda.runtime.driverGetVersion()
        _ = cudf.DataFrame({"a": [1, 2, 3]})
        return True
    except Exception as e:
        print(f"\nError: {e}\n", flush=True)
        return False


def check_adobe_import():
    """Report whether the Adobe dependency is usable.

    The probe performs no import, so nothing here can raise ``ImportError``
    and the result is always True.
    """
    return True


ADOBE_IMPORT_OK = check_adobe_import()
CUDA_DRIVER_OK = check_cuda_driver()
MORPHEUS_IMPORT_OK = check_morpheus_import()

# --------------------------------------------------------------------------------
# /tests/integration/test_examples.py:
# --------------------------------------------------------------------------------
import subprocess
import sys

import pytest


@pytest.mark.integration
def test_launch_libmode_and_run_ingestor():
    """Run the libmode example script and check its exit code and output."""
    process = subprocess.run(
        [sys.executable, "./examples/launch_libmode_and_run_ingestor.py"], capture_output=True, text=True
    )

    try:
        assert process.returncode == 0
        # pdfium text
        assert "A sample document with headings and placeholder text" in process.stdout
    except AssertionError:
        # Dump the child's output so a failed check can be diagnosed, then
        # let the original assertion failure propagate.
        print(process.stdout)
        print(process.stderr)
        raise

# --------------------------------------------------------------------------------
# /tests/integration/test_extract_audio.py:
# --------------------------------------------------------------------------------
import subprocess
import sys
import time

import pytest
from nv_ingest_api.util.message_brokers.simple_message_broker import SimpleClient
from nv_ingest_client.client import Ingestor
from nv_ingest_client.client import NvIngestClient


@pytest.mark.integration
def test_audio_extract_only(
    pipeline_process,
):
    """Ingest a WAV file with extraction only and compare the transcript.

    ``pipeline_process`` is requested as a fixture and not used directly;
    presumably it starts the pipeline this client talks to - defined outside
    this file, TODO confirm.
    """
    client = NvIngestClient(
        message_client_allocator=SimpleClient,
        message_client_port=7671,
        message_client_hostname="localhost",
    )

    ingestor = Ingestor(client=client).files("./data/multimodal_test.wav").extract()

    results = ingestor.ingest()
    assert len(results) == 1

    transcript = results[0][0]["metadata"]["audio_metadata"]["audio_transcript"]
    expected = (
        "Section one, this is the first section of the document. "
        "It has some more placeholder text to show how the document looks like. "
        "The text is not meant to be meaningful or informative, "
        "but rather to demonstrate the layout and formatting of the document."
    )
    assert transcript == expected

# --------------------------------------------------------------------------------
# /tests/service_tests/__init__.py:
# --------------------------------------------------------------------------------
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# --------------------------------------------------------------------------------
# /tests/service_tests/modules/__init__.py:
# --------------------------------------------------------------------------------
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# --------------------------------------------------------------------------------
# /tests/service_tests/modules/injectors/__init__.py:
# --------------------------------------------------------------------------------
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# --------------------------------------------------------------------------------
# /tests/service_tests/modules/injectors/test_task_injector.py:
# --------------------------------------------------------------------------------
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from unittest.mock import MagicMock

import pytest

try:
    from nv_ingest.framework.orchestration.morpheus.modules.injectors.task_injection import on_data

    morpheus_import = True
except Exception:
    # Best-effort probe: any ordinary failure while importing the module
    # disables the tests below. KeyboardInterrupt and SystemExit are no
    # longer swallowed, unlike with a bare ``except:``.
    morpheus_import = False


@pytest.fixture
def mock_message():
    """Fixture to create and return a mock IngestControlMessage object."""
    return MagicMock()


@pytest.mark.skipif(not morpheus_import, reason="Morpheus modules are not available")
def test_on_data_returns_message(mock_message):
    """Test that on_data returns the same IngestControlMessage object it receives."""
    result = on_data(mock_message)
    assert result is mock_message, "on_data should return the input IngestControlMessage object."


@pytest.mark.skipif(not morpheus_import, reason="Morpheus modules are not available")
def test_on_data_calls_get_metadata_with_correct_arguments(mock_message):
    """Test that on_data calls get_metadata on the IngestControlMessage object with correct arguments."""
    on_data(mock_message)
    mock_message.get_metadata.assert_called_once_with("task_meta")

# --------------------------------------------------------------------------------
# /tests/service_tests/modules/sinks/__init__.py:
# --------------------------------------------------------------------------------
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /tests/service_tests/modules/sources/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /tests/service_tests/modules/storages/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /tests/service_tests/modules/storages/test_image_storage.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0

import pandas as pd
import pytest
from minio import Minio

from nv_ingest_api.internal.enums.common import ContentTypeEnum
from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage


class MockMinioClient:
    """Stand-in for ``Minio`` whose methods accept any arguments and do no work."""

    def __init__(self, *args, **kwargs):
        """Accept and ignore every constructor argument."""

    def make_bucket(self, *args, **kwargs):
        """Ignore the request and return None."""
        return None

    def put_object(self, *args, **kwargs):
        """Ignore the request and return None."""
        return None

    def bucket_exists(self, *args, **kwargs):
        """Report that the bucket exists, whatever was asked for."""
        return True


@pytest.fixture
def mock_minio(mocker):
    """Patch ``Minio.__new__`` so constructing ``Minio`` builds a ``MockMinioClient``.

    Yields the object returned by ``mocker.patch.object``.
    """

    def _build_mock_client(cls, *args, **kwargs):
        # ``cls`` is received because this replaces ``__new__``; it is unused.
        return MockMinioClient(*args, **kwargs)

    yield mocker.patch.object(Minio, "__new__", new=_build_mock_client)

# --------------------------------------------------------------------------------
# /tests/service_tests/modules/telemetry/__init__.py:
# --------------------------------------------------------------------------------
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# --------------------------------------------------------------------------------
# /tests/service_tests/schemas/__init__.py:
# --------------------------------------------------------------------------------
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# --------------------------------------------------------------------------------
# /tests/service_tests/schemas/test_image_dedup_schema.py:
# --------------------------------------------------------------------------------
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0


import pytest
from pydantic import ValidationError

from nv_ingest_api.internal.schemas.mutate.mutate_image_dedup_schema import ImageDedupSchema


def valid_module_config():
    """Build a fresh config dict that the schema is expected to accept."""
    config = {}
    config["raise_on_failure"] = True
    return config


def test_task_type_str_bool():
    """A genuine bool for ``raise_on_failure`` constructs the schema without error."""
    config = valid_module_config()
    flag = config["raise_on_failure"]
    config["raise_on_failure"] = bool(flag)
    ImageDedupSchema(**config)


@pytest.mark.parametrize("dtype", [int, float, str])
def test_task_type_str_bool_sensitivity(dtype):
    """Casting the flag to int, float or str must raise ``ValidationError``."""
    config = valid_module_config()
    flag = config["raise_on_failure"]
    config["raise_on_failure"] = dtype(flag)

    with pytest.raises(ValidationError):
        ImageDedupSchema(**config)

# --------------------------------------------------------------------------------
# /tests/service_tests/schemas/test_image_filter_schema.py:
# --------------------------------------------------------------------------------
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0


import pytest
from pydantic import ValidationError

from nv_ingest_api.internal.schemas.transform.transform_image_filter_schema import ImageFilterSchema


def valid_module_config():
    """Build a fresh config dict that the schema is expected to accept."""
    config = {}
    config["raise_on_failure"] = True
    config["cpu_only"] = True
    return config


def test_task_type_str_bool():
    """Genuine bools for both flags construct the schema without error."""
    config = valid_module_config()
    config["raise_on_failure"] = bool(config["raise_on_failure"])
    config["cpu_only"] = bool(config["cpu_only"])
    ImageFilterSchema(**config)


@pytest.mark.parametrize("dtype", [int, float, str])
def test_task_type_str_bool_sensitivity(dtype):
    """Casting both flags to int, float or str must raise ``ValidationError``."""
    config = valid_module_config()
    config["raise_on_failure"] = dtype(config["raise_on_failure"])
    config["cpu_only"] = dtype(config["cpu_only"])

    with pytest.raises(ValidationError):
        ImageFilterSchema(**config)

# --------------------------------------------------------------------------------
# /tests/service_tests/schemas/test_injection_schema.py:
# --------------------------------------------------------------------------------
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import pytest
from pydantic import ValidationError

from nv_ingest.framework.schemas.framework_task_injection_schema import TaskInjectionSchema


def test_task_injection_schema_default():
    """
    A schema built with no arguments reports ``raise_on_failure`` as False.
    """
    injection_schema = TaskInjectionSchema()
    assert injection_schema.raise_on_failure is False, "Default value for raise_on_failure should be False."


def test_task_injection_schema_explicit_value():
    """
    An explicitly supplied ``raise_on_failure=True`` is kept by the schema.
    """
    injection_schema = TaskInjectionSchema(raise_on_failure=True)
    assert (
        injection_schema.raise_on_failure is True
    ), "raise_on_failure should respect the explicitly provided value."


def test_task_injection_schema_forbids_extra():
    """
    Passing an unknown field raises ``ValidationError`` with the expected message.
    """
    with pytest.raises(ValidationError) as excinfo:
        TaskInjectionSchema(raise_on_failure=False, unexpected_field="value")

    error_text = str(excinfo.value)
    assert "Extra inputs are not permitted" in error_text, "Schema should not allow extra fields."

# --------------------------------------------------------------------------------
# /tests/service_tests/schemas/test_job_counter_schema.py:
# --------------------------------------------------------------------------------
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from nv_ingest.framework.schemas.framework_job_counter_schema import JobCounterSchema


def test_job_counter_schema_defaults():
    """A schema built with no arguments uses the default name and flag."""
    counter_schema = JobCounterSchema()
    assert counter_schema.name == "job_counter", "Default value for name should be 'job_counter'."
    assert counter_schema.raise_on_failure is False, "Default value for raise_on_failure should be False."


def test_job_counter_schema_custom_values():
    """Explicit ``name`` and ``raise_on_failure`` values are kept by the schema."""
    overrides = {"name": "foo", "raise_on_failure": True}
    counter_schema = JobCounterSchema(**overrides)

    assert counter_schema.name == "foo", "Custom value for name should be respected."
    assert counter_schema.raise_on_failure is True, "Custom value for raise_on_failure should be respected."
# --------------------------------------------------------------------------------
# /tests/service_tests/schemas/test_metadata_injector_schema.py:
# --------------------------------------------------------------------------------
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import pytest
from pydantic import ValidationError

from nv_ingest.framework.schemas.framework_metadata_injector_schema import MetadataInjectorSchema


def test_metadata_injector_schema_default():
    """
    Test the MetadataInjectorSchema with default values.
    """
    schema = MetadataInjectorSchema()
    assert schema.raise_on_failure is False, "Default value for raise_on_failure should be False."


def test_metadata_injector_schema_explicit_value():
    """
    Test the MetadataInjectorSchema with an explicit value for raise_on_failure.
    """
    schema = MetadataInjectorSchema(raise_on_failure=True)
    assert schema.raise_on_failure is True, "raise_on_failure should respect the explicitly provided value."


def test_metadata_injector_schema_forbids_extra():
    """
    Test that the MetadataInjectorSchema forbids extra fields due to the 'extra = "forbid"' configuration.
    """
    with pytest.raises(ValidationError) as excinfo:
        MetadataInjectorSchema(raise_on_failure=False, unexpected_field="value")
    assert "Extra inputs are not permitted" in str(excinfo.value), "Schema should not allow extra fields."


@pytest.mark.parametrize("input_value", [True, False])
def test_metadata_injector_schema_raise_on_failure_parametrized(input_value):
    """
    Parametrized test for different boolean values of raise_on_failure.
    """
    schema = MetadataInjectorSchema(raise_on_failure=input_value)
    assert schema.raise_on_failure is input_value, f"raise_on_failure should be {input_value}."

# --------------------------------------------------------------------------------
# /tests/service_tests/schemas/test_otel_meter_schema.py:
# --------------------------------------------------------------------------------
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from nv_ingest.framework.schemas.framework_otel_meter_schema import OpenTelemetryMeterSchema
from nv_ingest_api.internal.schemas.message_brokers.message_broker_client_schema import MessageBrokerClientSchema


def test_otel_meter_schema_defaults():
    """A schema built with no arguments has a broker client and a False flag."""
    schema = OpenTelemetryMeterSchema()
    assert isinstance(
        schema.broker_client, MessageBrokerClientSchema
    ), "broker_client should be an instance of MessageBrokerClientSchema."
    assert schema.raise_on_failure is False, "Default value for raise_on_failure should be False."


def test_otel_meter_schema_custom_values():
    """An explicit broker client and flag are kept by the schema.

    The failure messages name ``broker_client``, the field actually being
    checked, rather than the stale ``redis_client`` wording.
    """
    custom_broker_client = MessageBrokerClientSchema(host="custom_host", port=12345, broker_params={"use_ssl": True})
    schema = OpenTelemetryMeterSchema(broker_client=custom_broker_client, raise_on_failure=True)

    assert schema.broker_client.host == "custom_host", "Custom host value for broker_client should be respected."
    assert schema.broker_client.port == 12345, "Custom port value for broker_client should be respected."
    assert (
        schema.broker_client.broker_params["use_ssl"] is True
    ), "Custom use_ssl value for broker_client should be True."
    assert schema.raise_on_failure is True, "Custom value for raise_on_failure should be respected."
# --------------------------------------------------------------------------------
# /tests/service_tests/schemas/test_otel_tracer_schema.py:
# --------------------------------------------------------------------------------
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from nv_ingest.framework.schemas.framework_otel_tracer_schema import OpenTelemetryTracerSchema


def test_otel_tracer_schema_defaults():
    """A schema built with no arguments reports ``raise_on_failure`` as False."""
    tracer_schema = OpenTelemetryTracerSchema()
    flag = tracer_schema.raise_on_failure
    assert flag is False, "Default value for raise_on_failure should be False."


def test_otel_tracer_schema_custom_values():
    """An explicitly supplied ``raise_on_failure=True`` is kept by the schema."""
    tracer_schema = OpenTelemetryTracerSchema(raise_on_failure=True)
    flag = tracer_schema.raise_on_failure
    assert flag is True, "Custom value for raise_on_failure should be respected."

# --------------------------------------------------------------------------------
# /tests/service_tests/stages/__init__.py:
# --------------------------------------------------------------------------------
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# --------------------------------------------------------------------------------
# /tests/service_tests/util/__init__.py:
# --------------------------------------------------------------------------------
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# --------------------------------------------------------------------------------
# /tests/service_tests/util/flow_control/__init__.py:
# --------------------------------------------------------------------------------
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /tests/service_tests/util/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /tests/service_tests/util/telemetry/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | --------------------------------------------------------------------------------