├── .dockerignore ├── .env ├── .gitattributes ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── actions │ └── greplint │ │ └── action.yml └── workflows │ ├── codeql.yml │ ├── dependency-review.yml │ ├── doclint.yml │ ├── draft_release.yml │ ├── linting.yml │ ├── pypi_release.yml │ ├── remote-integ-tests.yml │ └── testing.yml ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── apps ├── Makefile ├── docker-base │ ├── Dockerfile.buildx │ ├── Makefile.docker-base │ ├── gen-pyproject-copy.sh │ └── poetry-install.sh ├── git │ └── git-credential-from-env.py ├── integration │ ├── README.md │ ├── integration │ │ ├── __init__.py │ │ ├── automation │ │ │ ├── __init__.py │ │ │ └── runtests.sh │ │ ├── conftest.py │ │ ├── containers │ │ │ ├── __init__.py │ │ │ ├── running.py │ │ │ └── stack.py │ │ ├── ingests │ │ │ ├── __init__.py │ │ │ ├── crawler.py │ │ │ ├── index.py │ │ │ ├── index_info.py │ │ │ └── jupyter.py │ │ ├── queries │ │ │ ├── __init__.py │ │ │ ├── opensearch.py │ │ │ ├── options.py │ │ │ └── queries.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ └── test_basic.py │ ├── poetry.lock │ └── pyproject.toml ├── jupyter │ ├── Dockerfile.buildx │ ├── Makefile.jupyter │ ├── README.md │ ├── bind_dir │ │ ├── BIND_MOUNT_BETWEEN_DOCKER_AND_HOST │ │ └── setup.sh │ ├── entrypoint.py │ ├── poetry.lock │ ├── profile │ ├── pyproject.toml │ ├── run-jupyter.sh │ └── sudoers ├── opensearch │ ├── 2.11 │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── opensearch.yml │ │ └── sycamore-opensearch.sh │ ├── Dockerfile │ ├── README.md │ ├── authority.pem │ ├── config.yml │ ├── opensearch.yml │ ├── roles_mapping.yml │ ├── setup_models.py │ └── sycamore-opensearch.sh ├── remote-processor-service │ ├── Dockerfile.build │ ├── Dockerfile.buildx │ ├── Makefile │ ├── README.md │ ├── config │ │ └── pipelines.yml │ ├── poetry.lock │ ├── pyproject.toml │ └── remote_processor_service │ │ ├── 
cli.py │ │ └── rps_docker_entrypoint.sh └── timetrace │ ├── Makefile │ ├── README.md │ ├── ttanal │ ├── ttcat │ └── ttviz.cpp ├── autogen-groups.py ├── compose.yaml ├── docs ├── Makefile ├── make.bat └── source │ ├── conf.py │ ├── images │ ├── ArynArchitecture_APS+Sycamorev2.png │ ├── SycamoreDataflowDiagramv2.png │ ├── SycamoreDiagram2.png │ ├── favicon.ico │ ├── query_execution.svg │ └── sycamore_logo.svg │ ├── index.rst │ └── sycamore │ ├── APIs.rst │ ├── APIs │ ├── context.rst │ ├── docset.rst │ ├── docsetreader.rst │ ├── docsetwriter.rst │ ├── document.rst │ ├── functions.rst │ ├── gen │ ├── llm.rst │ ├── low_level_transforms.rst │ ├── low_level_transforms │ │ ├── assign_doc_properties.rst │ │ ├── augment_text.rst │ │ ├── basics.rst │ │ ├── bbox_merge.rst │ │ ├── detr_partitioner.rst │ │ ├── embed.rst │ │ ├── explode.rst │ │ ├── extract_entity.rst │ │ ├── extract_schema.rst │ │ ├── extract_table.rst │ │ ├── extract_table_properties.rst │ │ ├── llm_map.rst │ │ ├── llm_query.rst │ │ ├── map.rst │ │ ├── mark_misc.rst │ │ ├── merge_elements.rst │ │ ├── partition.rst │ │ ├── query.rst │ │ ├── random_sample.rst │ │ ├── regex_replace.rst │ │ ├── sketcher.rst │ │ ├── split_elements.rst │ │ ├── spread_properties.rst │ │ ├── standardizer.rst │ │ ├── summarize.rst │ │ └── summarize_images.rst │ ├── node.rst │ ├── prompts.rst │ └── query.rst │ ├── connectors.rst │ ├── connectors │ ├── duckdb.md │ ├── elasticsearch.md │ ├── neo4j.md │ ├── opensearch.md │ ├── pinecone.md │ ├── qdrant.md │ └── weaviate.md │ ├── get_started.rst │ ├── get_started │ ├── ai_configuration.md │ ├── concepts.md │ └── hardware.md │ ├── query.rst │ ├── querying_data │ └── using_aryn_opensearch_stack │ │ ├── APIs │ │ ├── conversation_memory.rst │ │ ├── conversation_memory │ │ │ └── functions.md │ │ └── gen │ │ ├── architecture.md │ │ ├── conversation_memory │ │ ├── imgs │ │ │ ├── ConversationMemoryMultiAgent.jpg │ │ │ ├── ConversationMemoryMultiAgent.png │ │ │ └── resource-diagram.png │ │ ├── 
overview.md │ │ ├── storage_for_genai_agents.md │ │ └── using_with_conversational_search.md │ │ ├── dedup.md │ │ ├── demo_query_ui.md │ │ ├── encryption.md │ │ ├── hybrid_search.md │ │ ├── imgs │ │ ├── pipeline-architecture.png │ │ └── xlarge_DemoUI_FollowUpQuestion.png │ │ ├── integrate_your_application.md │ │ ├── launching_with_Docker.md │ │ ├── load_data.md │ │ ├── remote_processors.md │ │ ├── reranking.md │ │ ├── running_a_data_preparation_job.md │ │ ├── using_jupyter_container.md │ │ └── using_rag_pipelines.md │ ├── transforms.rst │ ├── transforms │ ├── embed.md │ ├── explode.md │ ├── extract_entity.md │ ├── extract_schema.md │ ├── filter.md │ ├── flatmap.md │ ├── llm_query.md │ ├── map.md │ ├── map_batch.md │ ├── materialize.md │ ├── merge.md │ ├── partition.md │ ├── sketch.md │ └── summarize.md │ ├── tutorials.rst │ ├── tutorials │ ├── conversational_memory_with_langchain.md │ ├── etl_for_opensearch.md │ ├── etl_for_weaviate_tutorial.md │ ├── etl_pinecone_tutorial.md │ └── sycamore_jupyter_dev_example.md │ └── using_jupyter.md ├── examples ├── bench.py ├── html_ingest.py ├── markdown.py ├── ndd_debug.py ├── query │ ├── ntsb_loader.py │ ├── ntsb_loader_materialized.py │ └── simple_ntsb.py ├── s3_ingest.py ├── simple_config.py ├── simple_duckdb.py ├── simple_ingest.py ├── simple_neo4j.py ├── simple_pinecone.py ├── simple_qdrant.py └── simple_weaviate.py ├── lib ├── import_timer │ ├── README.md │ ├── import_timer.py │ └── pyproject.toml ├── poetry-lock │ ├── README.md │ ├── poetry-lock-all.sh │ ├── poetry.lock │ ├── pyproject.toml │ └── sycamore_poetry_lock │ │ └── noop.py ├── remote-processors │ ├── Makefile │ ├── README.md │ ├── img │ │ └── RPS_Architecture.svg │ ├── poetry.lock │ ├── pyproject.toml │ └── remote_processors │ │ ├── __init__.py │ │ ├── processors │ │ ├── __init__.py │ │ ├── debug_processor.py │ │ ├── dedup_processor.py │ │ └── processor.py │ │ ├── search_request.py │ │ ├── search_response.py │ │ ├── server │ │ ├── __init__.py │ │ ├── 
pipeline.py │ │ ├── processor_registry.py │ │ └── remote_processor_service.py │ │ └── test │ │ ├── __init__.py │ │ ├── integration │ │ ├── conftest.py │ │ └── test_integ_debug.py │ │ ├── resources │ │ ├── configs │ │ │ ├── malformed │ │ │ │ ├── dupe_pipeline_names.yml │ │ │ │ ├── not_a_list.yml │ │ │ │ ├── pipeline_not_a_map.yml │ │ │ │ └── pipeline_with_many_keys.yml │ │ │ └── valid.yml │ │ └── sb_processed.jsonl │ │ ├── unit │ │ ├── __init__.py │ │ ├── processors │ │ │ ├── __init__.py │ │ │ ├── test_debug.py │ │ │ ├── test_dedup.py │ │ │ └── test_library.py │ │ ├── service │ │ │ ├── __init__.py │ │ │ ├── test_pipeline.py │ │ │ ├── test_processor_registry.py │ │ │ └── test_remote_processor_service.py │ │ └── test_base.py │ │ └── utils.py └── sycamore │ ├── README.md │ ├── poetry.lock │ ├── pyproject.toml │ └── sycamore │ ├── README.md │ ├── __init__.py │ ├── connectors │ ├── aryn │ │ ├── ArynReader.py │ │ ├── ArynWriter.py │ │ └── client.py │ ├── base_reader.py │ ├── base_writer.py │ ├── common.py │ ├── doc_reconstruct.py │ ├── duckdb │ │ ├── __init__.py │ │ ├── duckdb_reader.py │ │ └── duckdb_writer.py │ ├── elasticsearch │ │ ├── __init__.py │ │ ├── elasticsearch_reader.py │ │ └── elasticsearch_writer.py │ ├── file │ │ ├── __init__.py │ │ ├── file_scan.py │ │ ├── file_writer.py │ │ ├── file_writer_ray.py │ │ └── materialized_scan.py │ ├── neo4j │ │ ├── __init__.py │ │ └── neo4j_writer.py │ ├── opensearch │ │ ├── __init__.py │ │ ├── opensearch_reader.py │ │ ├── opensearch_writer.py │ │ └── utils.py │ ├── pinecone │ │ ├── __init__.py │ │ ├── pinecone_reader.py │ │ └── pinecone_writer.py │ ├── qdrant │ │ ├── __init__.py │ │ ├── qdrant_reader.py │ │ └── qdrant_writer.py │ └── weaviate │ │ ├── __init__.py │ │ ├── weaviate_reader.py │ │ └── weaviate_writer.py │ ├── context.py │ ├── data │ ├── __init__.py │ ├── bbox.py │ ├── docid.py │ ├── document.py │ ├── element.py │ ├── metadata.py │ └── table.py │ ├── decorators.py │ ├── docset.py │ ├── evaluation │ ├── __init__.py 
│ ├── data.py │ ├── datasets.py │ ├── evaluate.py │ ├── metrics │ │ ├── __init__.py │ │ ├── generated_answer.py │ │ └── retrieval.py │ ├── ocr │ │ ├── __main__.py │ │ ├── data.py │ │ ├── llm_ocr.py │ │ ├── metrics.py │ │ └── models.py │ ├── pipeline.py │ └── subtasks.py │ ├── executor.py │ ├── functions │ ├── __init__.py │ ├── basic_filters.py │ ├── chunker.py │ ├── document.py │ ├── elements.py │ ├── rabin_karp.py │ ├── simhash.py │ └── tokenizer.py │ ├── grouped_data.py │ ├── llms │ ├── __init__.py │ ├── anthropic.py │ ├── bedrock.py │ ├── config.py │ ├── gemini.py │ ├── llms.py │ ├── openai.py │ └── prompts │ │ ├── __init__.py │ │ ├── default_prompts.py │ │ ├── jinja_fragments.py │ │ └── prompts.py │ ├── materialize.py │ ├── materialize_config.py │ ├── plan_nodes.py │ ├── query │ ├── README.md │ ├── __init__.py │ ├── client.py │ ├── execution │ │ ├── __init__.py │ │ ├── aggregation.py │ │ ├── metrics.py │ │ ├── operations.py │ │ ├── physical_operator.py │ │ ├── sycamore_executor.py │ │ └── sycamore_operator.py │ ├── logical_plan.py │ ├── operators │ │ ├── basic_filter.py │ │ ├── clustering.py │ │ ├── count.py │ │ ├── field_in.py │ │ ├── groupby.py │ │ ├── limit.py │ │ ├── llm_extract_entity.py │ │ ├── llm_filter.py │ │ ├── math.py │ │ ├── query_database.py │ │ ├── sort.py │ │ ├── summarize_data.py │ │ ├── top_k.py │ │ └── unroll.py │ ├── planner.py │ ├── planner_prompt.py │ ├── result.py │ ├── schema.py │ └── strategy.py │ ├── reader.py │ ├── rules │ ├── __init__.py │ └── optimize_resource_args.py │ ├── schema.py │ ├── tests │ ├── README.md │ ├── __init__.py │ ├── config.py │ ├── conftest.py │ ├── integration │ │ ├── connectors │ │ │ ├── aryn │ │ │ │ ├── test_aryn_reader.py │ │ │ │ ├── test_aryn_writer.py │ │ │ │ └── test_client.py │ │ │ ├── common.py │ │ │ ├── duckdb │ │ │ │ ├── test_duckdb_read.py │ │ │ │ └── test_pdf_to_duckdb.py │ │ │ ├── elasticsearch │ │ │ │ ├── test_elasticsearch_read.py │ │ │ │ └── test_pdf_to_elasticsearch.py │ │ │ ├── file │ │ │ │ ├── 
test_file_writer.py │ │ │ │ └── test_file_writer_to_s3.py │ │ │ ├── neo4j │ │ │ │ ├── test_docset_to_neo4j.py │ │ │ │ └── test_neo4j_writer_methods.py │ │ │ ├── opensearch │ │ │ │ ├── test_html_to_opensearch.py │ │ │ │ ├── test_opensearch_read.py │ │ │ │ └── test_pdf_to_opensearch.py │ │ │ ├── pinecone │ │ │ │ ├── test_pdf_to_pinecone.py │ │ │ │ └── test_pinecone_read.py │ │ │ ├── qdrant │ │ │ │ └── test_qdrant.py │ │ │ └── weaviate │ │ │ │ ├── test_pdf_to_weaviate.py │ │ │ │ └── test_weaviate_read.py │ │ ├── evaluation │ │ │ ├── test_datasets.py │ │ │ ├── test_evaluate.py │ │ │ └── test_pipeline.py │ │ ├── functions │ │ │ ├── __init__.py │ │ │ └── test_document.py │ │ ├── llms │ │ │ ├── test_anthropic.py │ │ │ ├── test_bedrock.py │ │ │ ├── test_gemini.py │ │ │ └── test_openai.py │ │ ├── query │ │ │ ├── conftest.py │ │ │ ├── execution │ │ │ │ ├── test_operations.py │ │ │ │ └── test_sycamore_query.py │ │ │ ├── test_planner.py │ │ │ └── test_query_opensearch.py │ │ ├── test_docset.py │ │ ├── test_executor.py │ │ ├── test_image_utils.py │ │ ├── test_materialize.py │ │ ├── textractor │ │ │ └── test_textractor.py │ │ └── transforms │ │ │ ├── test_base.py │ │ │ ├── test_data_extraction.py │ │ │ ├── test_embed.py │ │ │ ├── test_llm_filter.py │ │ │ ├── test_map.py │ │ │ ├── test_partition.py │ │ │ ├── test_random_sample.py │ │ │ ├── test_rerank.py │ │ │ ├── test_sort.py │ │ │ ├── test_summarize_images.py │ │ │ └── test_table_extraction.py │ ├── manual │ │ ├── pdf2image_memusage.py │ │ └── test_fast_sycamore_import.py │ ├── pytest.ini │ ├── resources │ │ ├── data │ │ │ ├── docx │ │ │ │ └── aryn_website_sample.docx │ │ │ ├── htmls │ │ │ │ └── wikipedia_binary_search.html │ │ │ ├── imgs │ │ │ │ └── sample-detr-image.png │ │ │ ├── json │ │ │ │ ├── example.json │ │ │ │ ├── model_server_output_transformer.json │ │ │ │ └── model_server_output_transformer_extract_tables.json │ │ │ ├── materialize │ │ │ │ ├── json_writer │ │ │ │ │ ├── 3fe9913e-60e2-11ef-90e5-e40d36f1e1ae.pickle │ │ 
│ │ │ ├── materialize.success │ │ │ │ │ └── md-9e6e68ee-ad8e-4e39-a2e1-7ef5befc588c.pickle │ │ │ │ └── llmfilter-ntsb-temp │ │ │ │ │ ├── doc-f-rhcfgmzrgifspjxjnl8vhh8.4fd48370db59b408b2700abd89bfe92e43009fde4ec216cfd112cdf17b7dfb35.pickle │ │ │ │ │ ├── materialize.clean │ │ │ │ │ ├── materialize.success │ │ │ │ │ ├── md-d-07bgl12pc0intnh2y74po4c.a98b82d885005500fb664fb18283cf80864de0fb6116e5acd9c3060dd91086a7.pickle │ │ │ │ │ ├── md-d-6wi2aqr0b504zojkdqnyybf.97ae3a37a86700c4601f89551892eb79faeaa0c9afe66f26ee20d8b7f996f929.pickle │ │ │ │ │ ├── md-d-75qp993ysz87aa1c1tsc5o0.0db762bad4ffb81bdde0f754087a65594f0cc8001986729f6ded86fa7ca7803e.pickle │ │ │ │ │ ├── md-d-89ww3vta189zsw6ac4vsjem.253ac204a7d81cc730b4a90202afaf3f78dbf27c87a17bcb94a23212b159cce6.pickle │ │ │ │ │ ├── md-d-a8pg30d1wqbff3zsf6c4l0w.c75fd5cba213cc7888ffe30f52d9fb7b637acd227a728b07ee7e1c50545c884e.pickle │ │ │ │ │ ├── md-d-ax65fkm5dy3cmkxtcfu6dv6.219182e0864996b3bb9855bd027940ff5989729f1e3ae80db8b2574f64eceb89.pickle │ │ │ │ │ ├── md-d-bx7u8xo31r49lnxi7r3thdr.7d4ab219ffa3c8c219fdcef0f384cac80e8e692a5669da4712f9076489aade58.pickle │ │ │ │ │ ├── md-d-g9dk50t6tir9sxomqvxpwhh.6f10891f3f44b8376061256d2a3d56b91945e3082abec53a5e8f1d3394492a37.pickle │ │ │ │ │ ├── md-d-k85dhbsu6n4rtp1vd62vkik.ef6d0f2992ef663296ab39f158b0a648f8bb9f4b679cfffc7df262ce49ef61e8.pickle │ │ │ │ │ ├── md-d-kl9t1to3b6t73hhclt9ke6q.26bb7d98dbcd581fa0873ea58b4cca247691b0aff527886dd69c56c407e4de6e.pickle │ │ │ │ │ ├── md-d-m3vkixw9pdmcbrfs3g6hhhf.2cb8ce11ce5feed99eacd754f8279868824d3266a59feaf92325a33a305bedc0.pickle │ │ │ │ │ ├── md-d-ml5ks4t608vrlz00gjk4fum.8d4b49675fa3c7236c71c93dc2fafd1eb95690c74a18d1594f588e953366887a.pickle │ │ │ │ │ ├── md-d-n3oawwwjt1hxatavnpjlbjb.7b9fa911fb9108ed58b8f80ce54106f0c4e98c87299a4df6519e0e9ebf142b07.pickle │ │ │ │ │ ├── md-d-poffmrto89o7t642owz0uqd.d864e9972c347cbc8900c223ff27a67815bf4cff87e812130969e73ae840ced1.pickle │ │ │ │ │ ├── 
md-d-qasv594a8qoyh0pwlmhljih.f3dc469668aade55c348445e4cbdfe9cec1c205e24506ee4cfc1124bad05dabb.pickle │ │ │ │ │ ├── md-d-rve5tvzjsb8qh6hcwdiq6qm.1022022f6d1712c26c86b3d4bc9152aeebc39806b9ea36285b5f3135cc53d672.pickle │ │ │ │ │ ├── md-d-swwvf71e2zvm5due6ongmjx.db0edaa58fcb926c43f191eca5b35e24721662924f3a3f28e544c49ed06c0fc4.pickle │ │ │ │ │ ├── md-d-u2q5b6b3k1liz40fdy8ibi9.a1761cb4fab284b222950d77e6deda85b44a607bcc588b9f38bff80d97b2a6dc.pickle │ │ │ │ │ ├── md-d-utp720v7v9ufcvce5tro2su.921a82b7ea58fbda1f36ce3aa828ef96efb10836a04560a6e4739b3182db4820.pickle │ │ │ │ │ ├── md-d-xjnfbj6qqys50k9pi988awd.fb4d50848f84e0f145c1d091aee688dbae19b8ef420ac7b4158042785fe24578.pickle │ │ │ │ │ └── md-d-zd8xb8wbpv6nit9wmujsjpi.d24c185d792566b7731a2d8aad7dc5cca24c4a1ac9fcd814f96fa9d7d0087a53.pickle │ │ │ ├── nested_json │ │ │ │ └── example.json │ │ │ ├── ocr_pdfs │ │ │ │ └── test_simple_ocr.pdf │ │ │ ├── pdfs │ │ │ │ ├── Ray.pdf │ │ │ │ ├── Ray_page1.pdf │ │ │ │ ├── Ray_page11.pdf │ │ │ │ ├── Transformer.pdf │ │ │ │ ├── basic_table.pdf │ │ │ │ ├── doctor_testimonial.pdf │ │ │ │ ├── ntsb-report.pdf │ │ │ │ ├── ntsb0.pdf │ │ │ │ ├── ntsb1.pdf │ │ │ │ ├── ntsb3.pdf │ │ │ │ └── visit_aryn.pdf │ │ │ ├── pptx │ │ │ │ └── design.pptx │ │ │ └── texts │ │ │ │ └── Ray.txt │ │ └── objects │ │ │ └── weaviate │ │ │ └── collection_params_b.pickle │ └── unit │ │ ├── __init__.py │ │ ├── connectors │ │ ├── common │ │ │ ├── __init__.py │ │ │ ├── test_base_reader.py │ │ │ ├── test_base_writer.py │ │ │ └── test_common.py │ │ ├── duckdb │ │ │ ├── test_duckdb_reader.py │ │ │ └── test_duckdb_writer.py │ │ ├── elasticsearch │ │ │ ├── test_elasticsearch_reader.py │ │ │ └── test_elasticsearch_writer.py │ │ ├── file │ │ │ └── test_file_writer.py │ │ ├── opensearch │ │ │ └── test_opensearch.py │ │ ├── pinecone │ │ │ ├── test_pinecone_reader.py │ │ │ └── test_pinecone_writer.py │ │ ├── qdrant │ │ │ └── test_qdrant_reader.py │ │ └── weaviate │ │ │ ├── test_weaviate_reader.py │ │ │ └── test_weaviate_writer.py │ │ 
├── data │ │ ├── test_bbox.py │ │ ├── test_docid.py │ │ ├── test_document.py │ │ ├── test_element.py │ │ └── test_table.py │ │ ├── evaluation │ │ ├── __init__.py │ │ ├── test_evaluate.py │ │ └── test_metrics.py │ │ ├── functions │ │ ├── __init__.py │ │ ├── test_elements.py │ │ ├── test_field_to_value.py │ │ ├── test_rabin_karp.py │ │ ├── test_simhash.py │ │ ├── test_text_chunker.py │ │ └── test_tokenizer.py │ │ ├── inmempyarrowfs.py │ │ ├── llms │ │ ├── prompts │ │ │ └── test_prompts.py │ │ ├── test_bedrock.py │ │ └── test_llms.py │ │ ├── query │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── execution │ │ │ ├── __init__.py │ │ │ ├── test_sycamore_executor.py │ │ │ └── test_sycamore_operator.py │ │ ├── test_node.py │ │ ├── test_operations.py │ │ ├── test_plan.py │ │ ├── test_planner.py │ │ ├── test_result.py │ │ ├── test_schema.py │ │ └── test_strategy.py │ │ ├── scans │ │ ├── __init__.py │ │ ├── test_file_scan.py │ │ └── test_materialized_scan.py │ │ ├── test_context.py │ │ ├── test_docset.py │ │ ├── test_executor.py │ │ ├── test_grouped_data.py │ │ ├── test_import_speed.py │ │ ├── test_materialize.py │ │ ├── test_node.py │ │ ├── test_reader.py │ │ ├── test_rewriter.py │ │ ├── test_writer.py │ │ ├── transforms │ │ ├── __init__.py │ │ ├── check_partition_impl.py │ │ ├── test_aryn_partitioner.py │ │ ├── test_assign_doc_properties.py │ │ ├── test_augment_text.py │ │ ├── test_base.py │ │ ├── test_base_llm.py │ │ ├── test_basics.py │ │ ├── test_bbox_merge.py │ │ ├── test_clustering.py │ │ ├── test_detr_partitioner.py │ │ ├── test_embed.py │ │ ├── test_explode.py │ │ ├── test_extract_document_structure.py │ │ ├── test_extract_entity.py │ │ ├── test_extract_table_properties.py │ │ ├── test_graph_entity_extractor.py │ │ ├── test_graph_relationship_extractor.py │ │ ├── test_llm_filter.py │ │ ├── test_llm_query.py │ │ ├── test_mapping.py │ │ ├── test_merge_elements.py │ │ ├── test_partition.py │ │ ├── test_random_sample.py │ │ ├── test_regex_replace.py │ │ ├── 
test_resolve_graph_entities.py │ │ ├── test_schema.py │ │ ├── test_similarity.py │ │ ├── test_sketcher.py │ │ ├── test_sort.py │ │ ├── test_split_elements.py │ │ ├── test_spread_properties.py │ │ ├── test_standardizer.py │ │ ├── test_summarize.py │ │ ├── test_summarize_images.py │ │ ├── test_table_extractor.py │ │ ├── test_table_transfomers.py │ │ ├── test_term_frequency.py │ │ └── text_extraction │ │ │ └── test_ocr_models.py │ │ └── utils │ │ ├── test_bbox_sort.py │ │ ├── test_cache.py │ │ ├── test_deep_eq.py │ │ ├── test_extract_json.py │ │ ├── test_fileformat_tools.py │ │ ├── test_import_utils.py │ │ ├── test_jupyter.py │ │ ├── test_markdown.py │ │ ├── test_nested.py │ │ ├── test_pdf_utils.py │ │ ├── test_pydantic_pickling.py │ │ ├── test_ray_utils.py │ │ ├── test_similarity.py │ │ ├── test_sycamore_logger.py │ │ ├── test_time_trace.py │ │ └── test_xycut_sort.py │ ├── transforms │ ├── __init__.py │ ├── assign_doc_properties.py │ ├── augment_text.py │ ├── base.py │ ├── base_llm.py │ ├── basics.py │ ├── bbox_merge.py │ ├── clustering.py │ ├── dataset_scan.py │ ├── detr_partitioner.py │ ├── detr_partitioner_config.py │ ├── embed.py │ ├── explode.py │ ├── extract_document_structure.py │ ├── extract_entity.py │ ├── extract_graph_entities.py │ ├── extract_graph_relationships.py │ ├── extract_schema.py │ ├── extract_table.py │ ├── extract_table_properties.py │ ├── groupby_count.py │ ├── llm_filter.py │ ├── llm_query.py │ ├── map.py │ ├── mark_misc.py │ ├── markdown.py │ ├── merge_elements.py │ ├── partition.py │ ├── query.py │ ├── random_sample.py │ ├── regex_replace.py │ ├── resolve_graph_entities.py │ ├── similarity.py │ ├── sketcher.py │ ├── sort.py │ ├── split_elements.py │ ├── spread_properties.py │ ├── standardizer.py │ ├── summarize.py │ ├── summarize_images.py │ ├── table_structure │ │ ├── extract.py │ │ └── table_transformers.py │ ├── term_frequency.py │ └── text_extraction │ │ ├── __init__.py │ │ ├── ocr_models.py │ │ ├── pdf_miner.py │ │ └── 
text_extractor.py │ ├── utils │ ├── __init__.py │ ├── aryn_config.py │ ├── bbox_sort.py │ ├── cache.py │ ├── deep_eq.py │ ├── deprecate.py │ ├── element_sort.py │ ├── extract_json.py │ ├── fileformat_tools.py │ ├── html_utils.py │ ├── http.py │ ├── image_utils.py │ ├── import_utils.py │ ├── jupyter.py │ ├── lineage_utils.py │ ├── llm_utils.py │ ├── markdown.py │ ├── memory_debugging.py │ ├── merge_utils.py │ ├── model_load.py │ ├── nested.py │ ├── pdf.py │ ├── pdf_utils.py │ ├── pickle_pydantic.py │ ├── pyarrow.py │ ├── pytorch_dir.py │ ├── ray_utils.py │ ├── similarity.py │ ├── sycamore_logger.py │ ├── thread_local.py │ ├── time_trace.py │ └── xycut.py │ └── writer.py ├── notebooks ├── ArynPartitionerExample.ipynb ├── ArynPartitionerPython.ipynb ├── ArynPartitionerWithLangchain.ipynb ├── EBGaramond-Bold.ttf ├── OpenAI-logprob.ipynb ├── VisualizePartitioner.ipynb ├── aryn-opensearch-bedrock-rag-example.ipynb ├── default-prep-script.ipynb ├── docprep │ ├── minilm-l6-v2_greedy-section-merger_duckdb.ipynb │ ├── minilm-l6-v2_greedy-section-merger_opensearch.ipynb │ ├── minilm-l6-v2_greedy-text-element-merger_duckdb.ipynb │ ├── minilm-l6-v2_marked-merger_duckdb.ipynb │ ├── text-embedding-3-small_greedy-section-merger_duckdb.ipynb │ ├── text-embedding-3-small_greedy-section-merger_pinecone.ipynb │ ├── text-embedding-3-small_greedy-text-element-merger_opensearch.ipynb │ └── text-embedding-3-small_marked-merger_pinecone.ipynb ├── duckdb-writer.ipynb ├── earnings_calls_notebooks │ ├── workshop_nb_0.ipynb │ ├── workshop_nb_1.ipynb │ ├── workshop_nb_2.ipynb │ ├── workshop_nb_3.ipynb │ └── workshop_nb_4.ipynb ├── elasticsearch-writer.ipynb ├── financial-docs-10k-example.ipynb ├── jupyter_dev_example.ipynb ├── metadata-extraction.ipynb ├── ndd_example.ipynb ├── ntsb-demo.ipynb ├── opensearch-writer.ipynb ├── opensearch_docs_etl.ipynb ├── pinecone-writer.ipynb ├── query-demo.ipynb ├── run-notebook-tests.sh ├── subtask-sample.ipynb ├── sycamore-tutorial-intermediate-etl.ipynb ├── 
sycamore_demo.ipynb ├── tutorial.ipynb ├── unpickle_query.ipynb └── weaviate-writer.ipynb ├── poetry.lock └── pyproject.toml /.dockerignore: -------------------------------------------------------------------------------- 1 | .scrapy 2 | .ruff_cache 3 | .mypy_cache 4 | .pytest_cache 5 | downloads 6 | **/*~ 7 | **/.venv 8 | cache_dir 9 | -------------------------------------------------------------------------------- /.env: -------------------------------------------------------------------------------- 1 | VERSION=stable 2 | UI_VERSION=${VERSION} 3 | OPENSEARCH_VERSION=${VERSION} 4 | SYCAMORE_VERSION=${VERSION} 5 | SYCAMORE_CRAWLER_HTTP_VERSION=${VERSION} 6 | SYCAMORE_CRAWLER_S3_VERSION=${VERSION} 7 | JUPYTER_VERSION=${VERSION} 8 | RPS_VERSION=${VERSION} 9 | QUERY_UI_VERSION=${VERSION} 10 | 11 | UI_PORT=3000 12 | OPENSEARCH_PORT=9200 13 | RAY_CONSOLE_PORT=8265 14 | JUPYTER_PORT=8888 15 | RPS_PORT=2796 16 | QUERY_UI_PORT=8501 17 | SSL=0 18 | 19 | JUPYTER_BIND_DIR=./apps/jupyter/bind_dir 20 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | sycamore/tests/resources/data/** -linguist-detectable 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 
22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/actions/greplint/action.yml: -------------------------------------------------------------------------------- 1 | name: Grep Lint 2 | description: Checks the docs for unwanted strings 3 | 4 | branding: 5 | icon: eye 6 | color: blue 7 | 8 | runs: 9 | using: composite 10 | steps: 11 | - name: Find notion.so references 12 | run: | 13 | set +e 14 | find . 
\( -path ./.github -o -path ./lib/sycamore/sycamore/tests \) -prune -o -type f \( -iname '*.md' -o -iname '*.rst' -o -iname '*.html' \) -print0 | xargs -r0 grep -i 'notion[.]so/' >> "${GITHUB_OUTPUT}" 15 | exit $(wc -l < "${GITHUB_OUTPUT}") 16 | shell: bash 17 | -------------------------------------------------------------------------------- /.github/workflows/dependency-review.yml: -------------------------------------------------------------------------------- 1 | # Dependency Review Action 2 | # 3 | # This Action will scan dependency manifest files that change as part of a Pull Request, 4 | # surfacing known-vulnerable versions of the packages declared or updated in the PR. 5 | # Once installed, if the workflow run is marked as required, PRs introducing known-vulnerable 6 | # packages will be blocked from merging. 7 | # 8 | # Source repository: https://github.com/actions/dependency-review-action 9 | # Public documentation: https://docs.github.com/en/code-security/supply-chain-security/understanding-your-software-supply-chain/about-dependency-review#dependency-review-enforcement 10 | name: 'Dependency review' 11 | on: 12 | pull_request: 13 | branches: [ "main" ] 14 | 15 | permissions: 16 | contents: read 17 | pull-requests: write 18 | 19 | jobs: 20 | dependency-review: 21 | runs-on: ubuntu-latest 22 | steps: 23 | - name: 'Checkout repository' 24 | uses: actions/checkout@v4 25 | - name: 'Dependency Review' 26 | uses: actions/dependency-review-action@v4 27 | with: 28 | comment-summary-in-pr: on-failure 29 | -------------------------------------------------------------------------------- /.github/workflows/doclint.yml: -------------------------------------------------------------------------------- 1 | name: Doc Lint 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | workflow_dispatch: 9 | 10 | jobs: 11 | doclint: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout 15 | uses: actions/checkout@v4 16 | 17 | - name: Grep Lint 18 | uses: 
./.github/actions/greplint 19 | -------------------------------------------------------------------------------- /.github/workflows/draft_release.yml: -------------------------------------------------------------------------------- 1 | name: Draft release 2 | 3 | on: 4 | push: 5 | tags: 6 | - "v*" 7 | 8 | permissions: 9 | contents: write 10 | 11 | jobs: 12 | draft-release: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: Checkout 16 | uses: actions/checkout@v4 17 | - name: Install poetry 18 | run: pipx install poetry 19 | - uses: actions/setup-python@v4 20 | with: 21 | python-version: '3.11' 22 | cache: 'poetry' 23 | - name: Install 24 | run: poetry install 25 | working-directory: lib/sycamore 26 | - name: Copy README 27 | working-directory: lib/sycamore 28 | run: cp ../../README.md README.md 29 | - name: build 30 | run: poetry build 31 | working-directory: lib/sycamore 32 | - name: Release 33 | uses: softprops/action-gh-release@v1 34 | with: 35 | draft: true 36 | generate_release_notes: true 37 | files: | 38 | lib/sycamore/dist/*.tar.gz 39 | -------------------------------------------------------------------------------- /.github/workflows/pypi_release.yml: -------------------------------------------------------------------------------- 1 | on: 2 | release: 3 | types: [published] 4 | 5 | jobs: 6 | test-pypi-release: 7 | name: Upload to PyPI 8 | runs-on: ubuntu-latest 9 | environment: 10 | name: PyPI 11 | url: https://pypi.org/project/sycamore-ai/ 12 | permissions: 13 | id-token: write 14 | steps: 15 | - name: Checkout 16 | uses: actions/checkout@v4 17 | - name: Install poetry 18 | run: pipx install poetry 19 | - uses: actions/setup-python@v4 20 | with: 21 | python-version: "3.11" 22 | cache: "poetry" 23 | - name: Install 24 | run: poetry install 25 | working-directory: lib/sycamore 26 | - name: build 27 | run: poetry build 28 | working-directory: lib/sycamore 29 | - name: Publish package distributions to PyPI 30 | uses: pypa/gh-action-pypi-publish@release/v1 
31 | with: 32 | verbose: true 33 | packages-dir: lib/sycamore/dist 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *#*# 2 | *.#* 3 | *.iml 4 | *.ipr 5 | *.iws 6 | *.pyc 7 | *.pyo 8 | *.pyd 9 | *.swp 10 | *~ 11 | .DS_Store 12 | .cache 13 | .idea/ 14 | .idea_modules/ 15 | .mypy_cache 16 | .project 17 | .pydevproject 18 | .pytest_cache 19 | .ruff_cache 20 | .scrapy 21 | .settings 22 | .vscode 23 | venv/ 24 | python/dist/ 25 | __pycache__/ 26 | bin 27 | etc 28 | pyenv.cfg 29 | share 30 | tmp 31 | .ipynb_checkpoints 32 | pyvenv.cfg 33 | apps/jupyter/bind_dir/redirect.html 34 | .venv 35 | lib/remote-processors/remote_processors/*pb2* 36 | poetry.toml 37 | dist/ 38 | lib/aryn-sdk/dist/ 39 | lib/sycamore/dist/ 40 | luna_traces/ 41 | traces/ 42 | apps/jupyter/bind_dir/poetry_cache 43 | docs/build 44 | cache 45 | notebooks/default-prep-data 46 | .python-version 47 | apps/timetrace/ttviz 48 | notebooks/pc-tutorial 49 | notebooks/tmp 50 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "opensearch-remote-processor"] 2 | path = lib/remote-processors/opensearch-remote-processor 3 | url = git@github.com:aryn-ai/opensearch-remote-processor.git 4 | branch = 2.x 5 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the OS, Python version and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.11" 13 | jobs: 14 | post_create_environment: 15 | # Install 
poetry 16 | - pip install poetry 17 | post_install: 18 | # Install sycamore with poetry (instead of pip) 19 | - VIRTUAL_ENV=$READTHEDOCS_VIRTUALENV_PATH poetry install --extras "docs" --no-root 20 | 21 | # Build documentation in the "docs/" directory with Sphinx 22 | sphinx: 23 | configuration: docs/source/conf.py 24 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | help: 2 | @echo "make all -- make all in ${DIRS}" 3 | @echo "make clean -- make clean in ${DIRS}" 4 | @echo "make serve-docs -- serve the sycamore docs at http://localhost:8000/" 5 | 6 | DIRS := apps 7 | .PHONY: $(DIRS) 8 | 9 | all: $(DIRS:%=subdir-all-%) 10 | 11 | clean: $(DIRS:%=subdir-clean-%) 12 | 13 | subdir-all-%: 14 | $(MAKE) -C $* all 15 | 16 | subdir-clean-%: 17 | $(MAKE) -C $* clean 18 | 19 | serve-docs: 20 | (cd docs && make serve-docs) 21 | -------------------------------------------------------------------------------- /apps/Makefile: -------------------------------------------------------------------------------- 1 | DIRS := timetrace 2 | .PHONY: $(DIRS) 3 | 4 | all: $(DIRS:%=subdir-all-%) 5 | 6 | clean: $(DIRS:%=subdir-clean-%) 7 | 8 | subdir-all-%: 9 | $(MAKE) -C $* all 10 | 11 | subdir-clean-%: 12 | $(MAKE) -C $* clean 13 | -------------------------------------------------------------------------------- /apps/docker-base/Makefile.docker-base: -------------------------------------------------------------------------------- 1 | # -*- makefile -*- 2 | help: 3 | @echo "This should be run as part of the Dockerfile" 4 | false 5 | 6 | user-setup: 7 | groupadd --gid 1000 app 8 | useradd -d /app --uid 1000 --gid app app 9 | chown -R app:app /app 10 | 11 | apt-setup: 12 | rm -f /etc/apt/apt.conf.d/docker-clean 13 | echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache 14 | 15 | apt-install: 16 | DEBIAN_FRONTEND=noninteractive apt 
update 17 | # gcc and python3-dev needed on arm for guidance 18 | DEBIAN_FRONTEND=noninteractive apt -y install --no-install-recommends python3-poetry gcc python3-dev 19 | 20 | non-root-files-check: 21 | find . -uid 0 -ls 22 | test $$(find . -uid 0 -print | wc -w) = 0 23 | 24 | record-version: 25 | test "$(GIT_COMMIT)" != "" 26 | test "$(GIT_COMMIT)" != "unknown" 27 | touch .git.commit.$(GIT_COMMIT) 28 | 29 | # Allow images that depend on the docker base image to verify that the version for their 30 | # source code is consistent with the version in the base image. If the code is inconsistent, 31 | # the resulting image could behave unexpectedly. 32 | check-version-compatibility: 33 | test "$(GIT_COMMIT)" != "" 34 | test "$(GIT_COMMIT)" != "unknown" 35 | ls .git.commit.* 36 | test -f .git.commit.$(GIT_COMMIT) 37 | -------------------------------------------------------------------------------- /apps/docker-base/gen-pyproject-copy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [[ "$0" != "./apps/docker-base/gen-pyproject-copy.sh" ]]; then 3 | echo "Run me as ./apps/docker-base/gen-pyproject-copy.sh" 4 | exit 1 5 | fi 6 | echo "# Begin generated by $0" 7 | # lib/sycamore is copied in its entirety, so don't include here 8 | for i in $(find . -name pyproject.toml | fgrep -v .venv | fgrep -v lib/sycamore/pyproject.toml | fgrep -v './pyproject.toml' | sort); do 9 | dir="$(dirname "$i")" 10 | readme="${dir}/README.md" 11 | echo "COPY --link --chown=1000:1000 $i $readme ${dir}/" 12 | done 13 | echo "COPY --link --chown=app:app ./pyproject.toml ./poetry.lock ./README.md ./" 14 | echo "# End generated by $0" 15 | -------------------------------------------------------------------------------- /apps/docker-base/poetry-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -x 4 | if [[ ! 
-d /app/.venv ]]; then 5 | echo "Missing /app/.venv; assuming this is the first poetry install" 6 | elif [[ $(grep -c VIRTUAL_ENV=./app/.venv .venv/bin/activate) != 1 ]]; then 7 | echo "ERROR: Broken venv" 8 | grep VIRTUAL_ENV .venv/bin/activate 9 | exit 1 10 | fi 11 | 12 | cd /app 13 | 14 | if [[ -d /app/work/bind_dir ]]; then 15 | # Running in a container. Cache to the bind dir for maximum preservation 16 | export POETRY_CACHE_DIR=/app/work/bind_dir/poetry_cache 17 | else 18 | # Running during a build. Use the build poetry cache. 19 | export POETRY_CACHE_DIR=/tmp/poetry_cache 20 | fi 21 | 22 | if [[ $# = 0 ]]; then 23 | echo "Usage: $0 [group1] [group2] [group3]" 24 | exit 1 25 | fi 26 | for i in "$@"; do 27 | if [[ "$i" == main ]]; then 28 | : 29 | elif [[ $(fgrep "[tool.poetry.group.${i}.dependencies]" pyproject.toml | wc -l) != 1 ]]; then 30 | echo "Unable to find dependencies for $i in pyproject.toml" 31 | exit 1 32 | fi 33 | done 34 | only=$(echo "$@" | sed 's/ /,/g') 35 | echo "$only" >>/app/.poetry.install 36 | poetry install --only $only --no-root -v 37 | 38 | -------------------------------------------------------------------------------- /apps/integration/README.md: -------------------------------------------------------------------------------- 1 | # Container Integration Testing for Sycamore 2 | 3 | To run these locally today: 4 | ```bash 5 | cd sycamore 6 | docker compose up reset 7 | docker compose up -d 8 | poetry run pytest apps/integration 9 | ``` 10 | -------------------------------------------------------------------------------- /apps/integration/integration/__init__.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | SYCAMORE_ROOT = Path(__file__).parent.parent.parent.parent 4 | 5 | __all__ = ["SYCAMORE_ROOT"] 6 | -------------------------------------------------------------------------------- /apps/integration/integration/automation/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/apps/integration/integration/automation/__init__.py -------------------------------------------------------------------------------- /apps/integration/integration/conftest.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: F401 2 | from integration.containers.running import container_handles, container_urls, opensearch_client 3 | from integration.ingests.index import ingested_index, ingest_profile 4 | from integration.queries.queries import DEFAULT_OPTIONS, QueryConfigGenerator, query_generator 5 | from integration.containers.stack import stack, tag 6 | 7 | 8 | QUERY_FIXTURE_NAME = "os_query" 9 | 10 | 11 | def pytest_addoption(parser): 12 | parser.addoption("--docker-tag", action="store", default="stable", help="docker tag to test") 13 | 14 | 15 | def pytest_generate_tests(metafunc): 16 | """ 17 | Generate all a test for every query configuration in the Query Config Generator 18 | """ 19 | if QUERY_FIXTURE_NAME in metafunc.fixturenames: 20 | metafunc.parametrize(QUERY_FIXTURE_NAME, list(QueryConfigGenerator(DEFAULT_OPTIONS))) 21 | -------------------------------------------------------------------------------- /apps/integration/integration/containers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/apps/integration/integration/containers/__init__.py -------------------------------------------------------------------------------- /apps/integration/integration/ingests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/apps/integration/integration/ingests/__init__.py 
-------------------------------------------------------------------------------- /apps/integration/integration/ingests/index_info.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class IndexInfo: 6 | name: str 7 | num_docs: int 8 | -------------------------------------------------------------------------------- /apps/integration/integration/ingests/jupyter.py: -------------------------------------------------------------------------------- 1 | from docker.models.containers import Container 2 | from opensearchpy import OpenSearch 3 | 4 | from integration.ingests.index_info import IndexInfo 5 | 6 | 7 | JUPYTER_NB_INFO = { 8 | "default-prep-script.ipynb": IndexInfo(name="demoindex0", num_docs=2), 9 | "jupyter_dev_example.ipynb": IndexInfo(name="local_development_example_index_withentity", num_docs=2), 10 | } 11 | 12 | 13 | class JupyterIndex: 14 | def __init__(self, nb_name: str, opensearch: OpenSearch, jupyter: Container): 15 | if nb_name not in JUPYTER_NB_INFO: 16 | raise ValueError(f"Unrecognized notebook name: {nb_name}") 17 | self._nb_name = nb_name 18 | self._opensearch = opensearch 19 | self._jupyter = jupyter 20 | self._index_info = JUPYTER_NB_INFO[nb_name] 21 | 22 | def __enter__(self): 23 | command = [ 24 | "poetry", 25 | "run", 26 | "jupyter", 27 | "nbconvert", 28 | "--execute", 29 | "--stdout", 30 | "--debug", 31 | "--to", 32 | "markdown", 33 | f"work/examples/{self._nb_name}", 34 | ] 35 | exc, logs = self._jupyter.exec_run(command, stream=True) 36 | for log in logs: 37 | print(log.decode("utf-8").rstrip()) 38 | return self._index_info 39 | 40 | def __exit__(self, exc_type, exc_val, exc_tb): 41 | self._opensearch.indices.delete(index=self._index_info.name) 42 | -------------------------------------------------------------------------------- /apps/integration/integration/queries/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/apps/integration/integration/queries/__init__.py -------------------------------------------------------------------------------- /apps/integration/integration/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/apps/integration/integration/tests/__init__.py -------------------------------------------------------------------------------- /apps/integration/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "integration" 3 | version = "0.1.0" 4 | description = "Integration Testing for Sycamore" 5 | authors = ["aryn.ai "] 6 | license = "Apache 2.0" 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | python = ">=3.9.2,<3.13" 11 | docker = "^7.0.0" 12 | pytest = "7.4.0" 13 | opensearch-py = "^2.5.0" 14 | testcontainers = "^4.3.1" 15 | sycamore-poetry-lock = { path = "../../lib/poetry-lock", develop = true } 16 | 17 | 18 | [build-system] 19 | requires = ["poetry-core"] 20 | build-backend = "poetry.core.masonry.api" 21 | -------------------------------------------------------------------------------- /apps/jupyter/Makefile.jupyter: -------------------------------------------------------------------------------- 1 | # -*- makefile -*- 2 | help: 3 | @echo "This should be run as part of the Dockerfile" 4 | false 5 | 6 | apt-install: 7 | DEBIAN_FRONTEND=noninteractive apt update 8 | DEBIAN_FRONTEND=noninteractive apt -y install --no-install-recommends fonts-liberation less sudo groff-base awscli 9 | 10 | fix-notebooks: 11 | for i in /app/work/examples/*ipynb; do \ 12 | perl -i -pe 's/localhost/opensearch/ if /9200/;s,tmp/sycamore/data,/app/work/docker_volume,' $$i; \ 13 | done 14 | 
-------------------------------------------------------------------------------- /apps/jupyter/README.md: -------------------------------------------------------------------------------- 1 | # Aryn Jupyter integration notes 2 | 3 | * The directory /app/work/crawl_data contains data crawled by the sycamore crawlers 4 | 5 | * The directory /app/work/docker_volume is a docker volume for storing Jupyter scripts. It will 6 | persist if you delete the quickstart directory on your host, but is not directly accessible. 7 | 8 | * The directory /app/work/bind_dir is a bind mount to the quickstart/jupyter/bind_dir directory on 9 | your host. It will be deleted if you delete the quickstart directory on your host, but is 10 | directly accessible on the host. 11 | 12 | * The directory /app/work/examples contains an example from the [Aryn Quickstart local development 13 | guide](https://github.com/aryn-ai/quickstart/blob/main/sycamore-local-development-example.md) 14 | -------------------------------------------------------------------------------- /apps/jupyter/bind_dir/BIND_MOUNT_BETWEEN_DOCKER_AND_HOST: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/apps/jupyter/bind_dir/BIND_MOUNT_BETWEEN_DOCKER_AND_HOST -------------------------------------------------------------------------------- /apps/jupyter/bind_dir/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "Starting setup script." 3 | echo "Usually used to enable optional sycamore features."
4 | set -x 5 | set -e 6 | 7 | # Examples of installing additional OS packages: 8 | export DEBIAN_FRONTEND=noninteractive 9 | # sudo apt update && sudo apt -y install --no-install-recommends less 10 | 11 | # Examples of setting up optional features: 12 | # ./poetry-install.sh duckdb 13 | # ./poetry-install.sh weaviate 14 | 15 | # Setup a mapped in copy for a git checkout of sycamore 16 | maybelink() { 17 | if [[ -L $2 ]]; then 18 | : 19 | else 20 | if [[ -e $2 ]]; then 21 | mv $2 $2.orig 22 | ln -s $1 $2 23 | fi 24 | fi 25 | } 26 | 27 | if [[ -f /app/sycamore.git/README.md ]]; then 28 | echo "Found a sycamore git checkout; replacing the preinstalled docker one..." 29 | for i in README.md apps lib poetry.lock pyproject.toml; do 30 | maybelink sycamore.git/$i /app/$i 31 | done 32 | for i in examples notebooks; do 33 | maybelink ../sycamore.git/$i /app/work/$i 34 | done 35 | # Reinstall everything that's been installed 36 | for i in $(cat /app/.poetry.install); do 37 | ./poetry-install.sh $(echo $i | sed 's/,/ /') 38 | done 39 | fi 40 | -------------------------------------------------------------------------------- /apps/jupyter/profile: -------------------------------------------------------------------------------- 1 | PATH=/app/.venv/bin:/usr/local/bin:/usr/bin:/bin:/usr/local/games:/usr/games 2 | -------------------------------------------------------------------------------- /apps/jupyter/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sycamore-jupyter" 3 | version = "0.0.1" 4 | description = "Jupyter container with sycamore preinstalled" 5 | authors = ["aryn.ai "] 6 | readme = "README.md" 7 | repository = "https://github.com/aryn-ai/sycamore.git" 8 | 9 | [tool.poetry.dependencies] 10 | python = ">=3.9.2,<3.13" 11 | sycamore-ai = {extras = ["opensearch"], version = "^0.1.30"} 12 | 13 | jupyterlab = "^4.0.11" 14 | jupyter-lsp = "^2.2.2" 15 | ipywidgets = "^8.1.0" 16 | notebook = 
"^7.1.2" 17 | 18 | 19 | [tool.poetry.group.dev.dependencies] 20 | sycamore-ai = { path = "../../lib/sycamore", extras = ["opensearch"], develop = true } 21 | 22 | [tool.poetry.group.sycamore_poetry_lock.dependencies] 23 | sycamore-poetry-lock = { path = "../../lib/poetry-lock", develop = true } 24 | -------------------------------------------------------------------------------- /apps/jupyter/sudoers: -------------------------------------------------------------------------------- 1 | # 2 | # This file MUST be edited with the 'visudo' command as root. 3 | # 4 | # Please consider adding local content in /etc/sudoers.d/ instead of 5 | # directly modifying this file. 6 | # 7 | # See the man page for details on how to write a sudoers file. 8 | # 9 | Defaults env_reset 10 | Defaults mail_badpass 11 | Defaults secure_path="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin" 12 | 13 | # This fixes CVE-2005-4890 and possibly breaks some versions of kdesu 14 | # (#1011624, https://bugs.kde.org/show_bug.cgi?id=452532) 15 | Defaults use_pty 16 | 17 | # User privilege specification 18 | root ALL=(ALL:ALL) ALL 19 | 20 | # Members of the admin group may gain root privileges 21 | %admin ALL=(ALL) ALL 22 | 23 | # Allow members of group sudo to execute any command 24 | %sudo ALL=(ALL:ALL) ALL 25 | 26 | # Since you can do docker run -u root, there is little point in restricting what can be sudoed 27 | app ALL = (ALL:ALL) NOPASSWD:SETENV: ALL 28 | 29 | app1000 ALL = (ALL:ALL) NOPASSWD:SETENV: ALL 30 | -------------------------------------------------------------------------------- /apps/opensearch/2.11/Dockerfile: -------------------------------------------------------------------------------- 1 | # Repo name: arynai/sycamore-opensearch 2 | 3 | ARG TAG=2.11.0 4 | FROM opensearchproject/opensearch:$TAG 5 | USER 0 6 | RUN yum install -y jq && yum clean all 7 | USER 1000 8 | COPY opensearch/opensearch.yml /usr/share/opensearch/config/ 9 | COPY 
opensearch/sycamore-opensearch.sh . 10 | ENV discovery.type=single-node 11 | ENV DISABLE_SECURITY_PLUGIN=true 12 | ENV DISABLE_INSTALL_DEMO_CONFIG=true 13 | 14 | ARG GIT_BRANCH="main" 15 | ARG GIT_COMMIT="unknown" 16 | ARG GIT_DIFF="unknown" 17 | 18 | ENV GIT_BRANCH=${GIT_BRANCH} 19 | ENV GIT_COMMIT=${GIT_COMMIT} 20 | ENV GIT_DIFF=${GIT_DIFF} 21 | 22 | LABEL org.opencontainers.image.authors="opensource@aryn.ai" 23 | LABEL git_branch=${GIT_BRANCH} 24 | LABEL git_commit=${GIT_COMMIT} 25 | LABEL git_diff=${GIT_DIFF} 26 | 27 | CMD ./sycamore-opensearch.sh 28 | -------------------------------------------------------------------------------- /apps/opensearch/2.11/README.md: -------------------------------------------------------------------------------- 1 | # Compose setup 2 | 3 | See ../docker_compose/README.md for instructions 4 | 5 | # Manual setup 6 | 7 | ## Run once: 8 | 9 | ``` 10 | docker volume create opensearch_data 11 | ``` 12 | 13 | ## Run if you change the source code: 14 | ``` 15 | cd pyscripts/docker_service/opensearch 16 | docker build -t aryn_opensearch . 
17 | ``` 18 | 19 | ## Run and shutdown as you see fit: 20 | ``` 21 | docker run -it --rm --name aryn_opensearch --network aryn-app -p 9200:9200 -e OPENAI_API_KEY --volume opensearch_data:/usr/share/opensearch/data aryn/opensearch 22 | ``` 23 | 24 | You can replace -it with --detach if you want it to run in the background 25 | 26 | ## To reset the opensearch state 27 | 28 | ``` 29 | docker run --volume opensearch_data:/tmp/osd ubuntu /bin/sh -c 'rm -rf /tmp/osd/*' 30 | ``` 31 | -------------------------------------------------------------------------------- /apps/opensearch/2.11/opensearch.yml: -------------------------------------------------------------------------------- 1 | network.host: 0.0.0.0 2 | http.cors.enabled : true 3 | http.cors.allow-origin: "*" 4 | http.cors.allow-methods: OPTIONS,HEAD,GET,POST,PUT,DELETE 5 | http.cors.allow-headers: X-Requested-With,X-Auth-Token,Content-Type,Content-Length 6 | -------------------------------------------------------------------------------- /apps/opensearch/README.md: -------------------------------------------------------------------------------- 1 | # Compose setup 2 | 3 | See ../docker_compose/README.md for instructions 4 | 5 | # Manual setup 6 | 7 | ## Run once: 8 | 9 | ``` 10 | docker volume create opensearch_data 11 | ``` 12 | 13 | ## Run if you change the source code: 14 | ``` 15 | cd pyscripts/docker_service/opensearch 16 | docker build -t aryn_opensearch . 
17 | ``` 18 | 19 | ## Run and shutdown as you see fit: 20 | ``` 21 | docker run -it --rm --name aryn_opensearch --network aryn-app -p 9200:9200 -e OPENAI_API_KEY --volume opensearch_data:/usr/share/opensearch/data aryn/opensearch 22 | ``` 23 | 24 | You can replace -it with --detach if you want it to run in the background 25 | 26 | ## To reset the opensearch state 27 | 28 | ``` 29 | docker run --volume opensearch_data:/tmp/osd ubuntu /bin/sh -c 'rm -rf /tmp/osd/*' 30 | ``` 31 | -------------------------------------------------------------------------------- /apps/opensearch/config.yml: -------------------------------------------------------------------------------- 1 | _meta: 2 | type: config 3 | config_version: 2 4 | 5 | # Automatically authenticate clients without credentials as 6 | # 'opendistro_security_anonymous_backendrole'. 7 | config: 8 | dynamic: 9 | http: 10 | anonymous_auth_enabled: true 11 | -------------------------------------------------------------------------------- /apps/opensearch/opensearch.yml: -------------------------------------------------------------------------------- 1 | network.host: 0.0.0.0 2 | http.cors.enabled : true 3 | http.cors.allow-origin: "*" 4 | http.cors.allow-methods: OPTIONS,HEAD,GET,POST,PUT,DELETE 5 | http.cors.allow-headers: X-Requested-With,X-Auth-Token,Content-Type,Content-Length 6 | # We are using self-signed certificates for now, so... 
7 | plugins.security.ssl.transport.enforce_hostname_verification: false 8 | plugins.security.ssl.transport.pemcert_filepath: node-cert.pem 9 | plugins.security.ssl.transport.pemkey_filepath: node-key.pem 10 | plugins.security.ssl.transport.pemtrustedcas_filepath: cacert.pem 11 | plugins.security.ssl.http.enabled: true 12 | plugins.security.ssl.http.pemcert_filepath: http-cert.pem 13 | plugins.security.ssl.http.pemkey_filepath: http-key.pem 14 | plugins.security.ssl.http.pemtrustedcas_filepath: http-ca.pem 15 | plugins.security.authcz.admin_dn: 16 | - 'CN=Admin,O=Aryn.ai,ST=California,C=US' 17 | -------------------------------------------------------------------------------- /apps/opensearch/roles_mapping.yml: -------------------------------------------------------------------------------- 1 | _meta: 2 | type: rolesmapping 3 | config_version: 2 4 | 5 | # Grant all access to clients automatically authenticated. 6 | all_access: 7 | backend_roles: 8 | - opendistro_security_anonymous_backendrole 9 | -------------------------------------------------------------------------------- /apps/remote-processor-service/Makefile: -------------------------------------------------------------------------------- 1 | ### Docker steps sorted in the same order as the Dockerfile 2 | ### Not for general use so undocumented 3 | 4 | aryn_user: 5 | groupadd --gid 1000 aryn 6 | useradd --uid 1000 --gid 1000 --home-dir /aryn --password=y --no-create-home aryn 7 | chown -R aryn:aryn /aryn 8 | 9 | install_poetry: 10 | touch /var/lib/apt/.cache_var_lib_apt # make it possible to find the cache directory for buildx builds 11 | touch /var/cache/apt/.cache_var_cache_apt 12 | apt update 13 | DEBIAN_FRONTEND=noninteractive apt -y install --no-install-recommends python3-poetry gcc python3-dev protobuf-compiler 14 | 15 | common_build: 16 | test "$(POETRY_CACHE_DIR)" = /tmp/poetry_cache # catch a bug where putting ARG too early in Dockerfile doesn't get the env var 17 | touch 
/tmp/poetry_cache/.poetry_cache_dir 18 | poetry install --no-root --only main 19 | poetry env info 20 | 21 | docker_build_proto: 22 | test "$(POETRY_CACHE_DIR)" = /tmp/poetry_cache 23 | poetry install --no-root --only build 24 | make -f ./Makefile build_proto 25 | 26 | server_build: 27 | poetry install --only main,dev 28 | 29 | user_check: 30 | FILES=$$(find /aryn -print | wc -l); \ 31 | ARYN_FILES=$$(find /aryn -user aryn -print | wc -l); \ 32 | echo "Found $${ARYN_FILES}/$${FILES} owned by aryn"; \ 33 | find /aryn ! -user aryn -print; \ 34 | test $${FILES} -ge 1000 && \ 35 | test $${ARYN_FILES} -eq $${FILES} 36 | -------------------------------------------------------------------------------- /apps/remote-processor-service/config/pipelines.yml: -------------------------------------------------------------------------------- 1 | - debug: 2 | processors: 3 | - debug-response: 4 | - dedup00: 5 | processors: 6 | - dedup-response: 7 | threshold: 0.01 8 | - dedup01: 9 | processors: 10 | - dedup-response: 11 | threshold: 0.1 12 | - dedup02: 13 | processors: 14 | - dedup-response: 15 | threshold: 0.15 16 | - dedup03: 17 | processors: 18 | - dedup-response: 19 | threshold: 0.2 20 | - dedup04: 21 | processors: 22 | - dedup-response: 23 | threshold: 0.3 24 | - dedup05: 25 | processors: 26 | - dedup-response: 27 | threshold: 0.35 28 | - dedup06: 29 | processors: 30 | - dedup-response: 31 | threshold: 0.4 32 | - dedup07: 33 | processors: 34 | - dedup-response: 35 | threshold: 0.45 36 | - dedup08: 37 | processors: 38 | - dedup-response: 39 | threshold: 0.55 40 | -------------------------------------------------------------------------------- /apps/remote-processor-service/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "remote-processor-service" 3 | version = "0.1.0" 4 | description = "Service that runs remote processors" 5 | authors = ["Henry Lindeman "] 6 | readme = "README.md" 7 | packages = 
[{include = "remote_processor_service"}] 8 | 9 | [tool.poetry.dependencies] 10 | python = ">=3.9.2,<3.13" 11 | click = "^8.1.7" 12 | 13 | [tool.poetry.group.dev.dependencies] 14 | remote-processors = { path = "../../lib/remote-processors", develop = true } 15 | sycamore-ai = { path = "../../lib/sycamore", develop = true } 16 | sycamore-poetry-lock = { path = "../../lib/poetry-lock", develop = true } 17 | 18 | [tool.poetry.scripts] 19 | test_cfg = "remote_processor_service.cli:read_cfg" 20 | server = "remote_processor_service.cli:serve" 21 | 22 | [build-system] 23 | requires = ["poetry-core"] 24 | build-backend = "poetry.core.masonry.api" 25 | -------------------------------------------------------------------------------- /apps/remote-processor-service/remote_processor_service/cli.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | from remote_processors.server.remote_processor_service import RemoteProcessorService 4 | 5 | 6 | @click.command() 7 | @click.argument("config", type=click.Path(exists=True)) 8 | def read_cfg(config): 9 | """ 10 | Construct the server with the configuration provided and print info about it 11 | 12 | Args: 13 | config (filepath): A yaml file that describes all the search processors to run in the RPS 14 | """ 15 | service = RemoteProcessorService(config) 16 | print(service) 17 | print(service._pipelines) 18 | 19 | 20 | @click.command() 21 | @click.argument("config", type=click.Path(exists=True)) 22 | @click.option("--certfile", type=click.Path(exists=True), default=None) 23 | @click.option("--keyfile", type=click.Path(exists=True), default=None) 24 | def serve(config, certfile, keyfile): 25 | """ 26 | Start the server on port 2796 with the configuration provided 27 | 28 | Args: 29 | config (filepath): A yaml file that describes all the search processors to run in the RPS 30 | """ 31 | service = RemoteProcessorService(config) 32 | if certfile is None or keyfile is None: 33 | assert 
keyfile == certfile, "You must either specify both certfile and keyfile or specify neither" 34 | server = service.start(certfile, keyfile) 35 | server.wait_for_termination() 36 | -------------------------------------------------------------------------------- /apps/remote-processor-service/remote_processor_service/rps_docker_entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | main() { 4 | create_certificates 5 | poetry run server config/pipelines.yml --keyfile config/rps-key.pem --certfile config/rps-cert.pem 6 | } 7 | 8 | die() { 9 | echo "ERROR:" "$@" 1>&2 10 | if [[ ${NOEXIT} -gt 0 ]]; then 11 | echo "Not dying due to NOEXIT. Feel free to poke around container." 12 | sleep inf 13 | fi 14 | exit 1 15 | } 16 | 17 | create_certificates() { 18 | local HOST="${SSL_HOSTNAME:-localhost}" 19 | local DAYS=10000 20 | local LOG="config/openssl.err" 21 | truncate -s 0 "${LOG}" 22 | 23 | # Create RPS certificate. 24 | if [[ (! -f config/rps-key.pem) || (! 
-f config/rps-cert.pem) ]]; then 25 | openssl req -batch -x509 -newkey rsa:4096 -days "${DAYS}" \ 26 | -subj "/C=US/ST=California/O=Aryn.ai/CN=${HOST}" \ 27 | -extensions v3_req -addext "subjectAltName=DNS:${HOST}" \ 28 | -noenc -keyout "config/rps-key.pem" -out "config/rps-cert.pem" 2> /dev/null 29 | echo "Created RPS certificate" 30 | fi 31 | 32 | for X in rps-key.pem rps-cert.pem; do 33 | chmod 600 "config/${X}" 34 | done 35 | } 36 | 37 | main 38 | -------------------------------------------------------------------------------- /apps/timetrace/Makefile: -------------------------------------------------------------------------------- 1 | CXX := g++ --std=c++20 -W -Wall -O3 2 | 3 | EXES := ttviz 4 | 5 | all: $(EXES) 6 | 7 | clean: 8 | rm -f $(EXES) *.o *.a 9 | 10 | ttviz: ttviz.cpp 11 | $(CXX) $^ -lgd -o $@ 12 | -------------------------------------------------------------------------------- /apps/timetrace/README.md: -------------------------------------------------------------------------------- 1 | The ttviz program is a small utility for visualizing the TimeTrace output for Sycamore scripts. 2 | 3 | ## Basic Usage 4 | 5 | Compile ttviz 6 | 7 | make 8 | 9 | Run a Sycamore script with TimeTrace enabled: 10 | 11 | TIMETRACE=/path/to/output/ poetry run python my_sycamore_script.py 12 | 13 | Run ttviz on output. 14 | 15 | ./ttviz /path/to/output/* 16 | 17 | The visualization is written to `viz.png` 18 | 19 | ## Mac OS X Setup Instructions 20 | 21 | This program depends on the `gd` utility. The easiest way to install this on the Mac is using Homebrew: 22 | 23 | brew install libgd 24 | 25 | In order for the compiler to find the new library, you need to set the `CPATH` and `LIBRARY_PATH` environment variables to pick up Homebrew installed libraries.
Depending on your shell and setup, the following may work 26 | 27 | export CPATH=$HOMEBREW_PREFIX:$CPATH 28 | export LIBRARY_PATH=$HOMEBREW_PREFIX:$LIBRARY_PATH 29 | 30 | Typically `HOMEBREW_PREFIX` should be set to `/opt/homebrew`. 31 | -------------------------------------------------------------------------------- /apps/timetrace/ttcat: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import sys 4 | import struct 5 | from typing import NamedTuple 6 | 7 | FMT = "BxxxIQQQQQ48s" 8 | SIZE = struct.calcsize(FMT) 9 | 10 | 11 | class TimeTraceRec(NamedTuple): 12 | thread: int 13 | t0: int 14 | t1: int 15 | utime: int 16 | stime: int 17 | rss: int 18 | name: str 19 | 20 | 21 | def buf_to_rec(buf) -> TimeTraceRec: 22 | tup = struct.unpack(FMT, buf) 23 | ver, thr, t0, t1, user, syst, rss, name = tup 24 | assert ver == 0 25 | name = name.decode().rstrip("\0") 26 | return TimeTraceRec(thr, t0, t1, user, syst, rss, name) 27 | 28 | 29 | def tt_reader(paths: list[str]): 30 | for path in paths: 31 | with open(path, "rb") as fp: 32 | while True: 33 | buf = fp.read(SIZE) 34 | if not buf: 35 | break 36 | yield buf_to_rec(buf) 37 | 38 | 39 | class Cat: 40 | def run(self, paths): 41 | for rec in tt_reader(paths): 42 | t0 = rec.t0 / 1000000000.0 43 | t1 = rec.t1 / 1000000000.0 44 | wall = t1 - t0 45 | rss = rec.rss / 1048576.0 46 | print(f"{t0:.3f} {rec.name} {wall:.6f} {rss:.0f}") 47 | 48 | 49 | def main(args=None): 50 | if args is None: 51 | args = sys.argv[1:] 52 | cat = Cat() 53 | cat.run(args) 54 | return 0 55 | 56 | 57 | if __name__ == "__main__": 58 | sys.exit(main()) 59 | -------------------------------------------------------------------------------- /autogen-groups.py: -------------------------------------------------------------------------------- 1 | import tomllib 2 | 3 | with open("lib/sycamore/pyproject.toml", "rb") as f: 4 | pyproject = tomllib.load(f) 5 | 6 | extras = [e for e in 
pyproject["tool"]["poetry"]["extras"].keys()] 7 | extras.sort() 8 | print() 9 | print("### BEGIN Auto-generated by autogen-groups.py") 10 | for e in extras: 11 | print(f"[tool.poetry.group.{e}.dependencies]") 12 | print(f'sycamore-ai = {{ path = "lib/sycamore", develop = true, extras = ["{e}"] }}') 13 | print() 14 | print("# END Auto-generated by autogen-groups.py") 15 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= poetry run sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @echo "make serve-docs -- make and serve the docs on http://localhost:8000/" 14 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 15 | 16 | serve-docs: 17 | make html 18 | (cd build/html && poetry run python -m http.server) 19 | 20 | 21 | .PHONY: help serve-docs Makefile 22 | 23 | # Catch-all target: route all unknown targets to Sphinx using the new 24 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 25 | %: Makefile 26 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 27 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/images/ArynArchitecture_APS+Sycamorev2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/docs/source/images/ArynArchitecture_APS+Sycamorev2.png -------------------------------------------------------------------------------- /docs/source/images/SycamoreDataflowDiagramv2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/docs/source/images/SycamoreDataflowDiagramv2.png -------------------------------------------------------------------------------- /docs/source/images/SycamoreDiagram2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/docs/source/images/SycamoreDiagram2.png -------------------------------------------------------------------------------- /docs/source/images/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/docs/source/images/favicon.ico 
-------------------------------------------------------------------------------- /docs/source/sycamore/APIs.rst: -------------------------------------------------------------------------------- 1 | Sycamore APIs 2 | ============= 3 | 4 | This is the API reference for Sycamore, and it contains the functions you can use when writing 5 | Sycamore scripts to process or query data. If you are interested in contributing new transforms 6 | to the Sycamore project, please visit the Low-Level Transforms section in the API docs. 7 | 8 | .. toctree:: 9 | :maxdepth: 1 10 | 11 | ./APIs/config.rst 12 | ./APIs/context.rst 13 | ./APIs/docset.rst 14 | ./APIs/docsetreader.rst 15 | ./APIs/docsetwriter.rst 16 | ./APIs/llm.rst 17 | ./APIs/prompts.rst 18 | ./APIs/document.rst 19 | ./APIs/functions.rst 20 | ./APIs/node.rst 21 | ./APIs/query.rst 22 | ./APIs/low_level_transforms.rst 23 | -------------------------------------------------------------------------------- /docs/source/sycamore/APIs/context.rst: -------------------------------------------------------------------------------- 1 | .. _Ref-Data: 2 | 3 | Context 4 | ================ 5 | 6 | .. automodule:: sycamore.context 7 | :members: 8 | -------------------------------------------------------------------------------- /docs/source/sycamore/APIs/docset.rst: -------------------------------------------------------------------------------- 1 | .. _Ref-Data: 2 | 3 | DocSet 4 | ================ 5 | 6 | .. automodule:: sycamore.docset 7 | :members: 8 | -------------------------------------------------------------------------------- /docs/source/sycamore/APIs/docsetreader.rst: -------------------------------------------------------------------------------- 1 | .. _Ref-Data: 2 | 3 | DocSetReader 4 | ================ 5 | 6 | .. 
#!/usr/bin/python3

"""
Auto-generate RST files from Python source.

Usage: ./gen
"""

import os
import sys
import ast


srcRoot = "../../../lib/sycamore/sycamore"
docRoot = "."


def _base_name(base) -> str:
    """Return the textual name of a class base expression.

    Handles plain names (``ABC``) and dotted attributes (``abc.ABC``);
    returns "" for anything else (e.g. subscripted generics).
    """
    if isinstance(base, ast.Name):
        return base.id
    if isinstance(base, ast.Attribute):
        return base.attr
    return ""


def shouldEmit(node) -> bool:
    """Decide whether a module-level AST node gets autoclass documentation.

    Only class definitions are documented.  A class with a docstring is
    always emitted; an undocumented class deriving from ABC is skipped.
    """
    if not isinstance(node, ast.ClassDef):
        return False
    if ast.get_docstring(node):
        return True
    for base in node.bases:
        # The original read base.id unconditionally, which raised
        # AttributeError for dotted bases like `abc.ABC` (ast.Attribute
        # nodes have no .id).  Use the helper to cover both forms.
        if _base_name(base) == "ABC":
            return False  # skip abstract base classes
    return True


def doFile(name, dir, ent):
    """Generate one RST stub for source file ent in dir, if it has classes to document."""
    with open(f"{dir}/{ent}") as fp:
        top = ast.parse(fp.read())

    ary = []
    base = ent[:-3]  # strip ".py"
    for node in top.body:  # iterate module-level nodes only
        if shouldEmit(node):
            ary.append(f"sycamore.{name}.{base}.{node.name}")

    if ary:
        with open(f"{docRoot}/{name}/{base}.rst", "w") as fp:
            title = base.replace("_", " ").title()
            line = "=" * len(title)  # RST underline must cover the title
            fp.write(f"{title}\n{line}\n\n")
            for sym in sorted(ary):
                fp.write(f".. autoclass:: {sym}\n :members:\n :show-inheritance:\n")
            print(f" /APIs/{name}/{base}.rst")


def doDir(name):
    """Run doFile over every .py file in srcRoot/name, in sorted order."""
    dir = f"{srcRoot}/{name}"
    for ent in sorted(os.listdir(dir)):
        if not ent.endswith(".py"):
            continue
        doFile(name, dir, ent)


def main():
    doDir("transforms")
    return 0


if __name__ == "__main__":
    sys.exit(main())
automodule:: sycamore.llms.bedrock 21 | :members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /docs/source/sycamore/APIs/low_level_transforms.rst: -------------------------------------------------------------------------------- 1 | .. _Ref-low_level_Transforms: 2 | 3 | Low-Level Transforms (for Sycamore development) 4 | =========== 5 | 6 | .. note:: 7 | Users of Sycamore won't need to interact with these classes and should instead use the classes in the top-level API docs. These transform classes are primarily of interest to developers looking to extend Sycamore or contribute to the project. 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | 12 | ./low_level_transforms/assign_doc_properties.rst 13 | ./low_level_transforms/augment_text.rst 14 | ./low_level_transforms/basics.rst 15 | ./low_level_transforms/bbox_merge.rst 16 | ./low_level_transforms/embed.rst 17 | ./low_level_transforms/explode.rst 18 | ./low_level_transforms/extract_entity.rst 19 | ./low_level_transforms/extract_schema.rst 20 | ./low_level_transforms/extract_table.rst 21 | ./low_level_transforms/extract_table_properties.rst 22 | ./low_level_transforms/llm_map.rst 23 | ./low_level_transforms/llm_query.rst 24 | ./low_level_transforms/map.rst 25 | ./low_level_transforms/mark_misc.rst 26 | ./low_level_transforms/merge_elements.rst 27 | ./low_level_transforms/partition.rst 28 | ./low_level_transforms/query.rst 29 | ./low_level_transforms/random_sample.rst 30 | ./low_level_transforms/regex_replace.rst 31 | ./low_level_transforms/sketcher.rst 32 | ./low_level_transforms/split_elements.rst 33 | ./low_level_transforms/spread_properties.rst 34 | ./low_level_transforms/standardizer.rst 35 | ./low_level_transforms/summarize.rst 36 | ./low_level_transforms/summarize_images.rst 37 | -------------------------------------------------------------------------------- /docs/source/sycamore/APIs/low_level_transforms/assign_doc_properties.rst: 
-------------------------------------------------------------------------------- 1 | Assign Doc Properties 2 | ============= 3 | 4 | .. autoclass:: sycamore.transforms.assign_doc_properties.AssignDocProperties 5 | :members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /docs/source/sycamore/APIs/low_level_transforms/augment_text.rst: -------------------------------------------------------------------------------- 1 | Augment Text 2 | ============ 3 | 4 | .. autoclass:: sycamore.transforms.augment_text.AugmentText 5 | :members: 6 | :show-inheritance: 7 | .. autoclass:: sycamore.transforms.augment_text.JinjaTextAugmentor 8 | :members: 9 | :show-inheritance: 10 | .. autoclass:: sycamore.transforms.augment_text.UDFTextAugmentor 11 | :members: 12 | :show-inheritance: 13 | -------------------------------------------------------------------------------- /docs/source/sycamore/APIs/low_level_transforms/basics.rst: -------------------------------------------------------------------------------- 1 | Basics 2 | ====== 3 | 4 | .. autoclass:: sycamore.transforms.basics.Filter 5 | :members: 6 | :show-inheritance: 7 | .. autoclass:: sycamore.transforms.basics.Limit 8 | :members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/source/sycamore/APIs/low_level_transforms/bbox_merge.rst: -------------------------------------------------------------------------------- 1 | Bbox Merge 2 | ========== 3 | 4 | .. autoclass:: sycamore.transforms.bbox_merge.MarkBreakByColumn 5 | :members: 6 | :show-inheritance: 7 | .. autoclass:: sycamore.transforms.bbox_merge.MarkDropHeaderFooter 8 | :members: 9 | :show-inheritance: 10 | .. 
autoclass:: sycamore.transforms.bbox_merge.SortByPageBbox 11 | :members: 12 | :show-inheritance: 13 | -------------------------------------------------------------------------------- /docs/source/sycamore/APIs/low_level_transforms/detr_partitioner.rst: -------------------------------------------------------------------------------- 1 | Detr Partitioner 2 | ================ 3 | 4 | .. autoclass:: sycamore.transforms.detr_partitioner.DeformableDetr 5 | :members: 6 | :show-inheritance: 7 | .. autoclass:: sycamore.transforms.detr_partitioner.PDFMinerExtractor 8 | :members: 9 | :show-inheritance: 10 | .. autoclass:: sycamore.transforms.detr_partitioner.SycamoreObjectDetection 11 | :members: 12 | :show-inheritance: 13 | .. autoclass:: sycamore.transforms.detr_partitioner.ArynPDFPartitioner 14 | :members: 15 | :show-inheritance: 16 | -------------------------------------------------------------------------------- /docs/source/sycamore/APIs/low_level_transforms/embed.rst: -------------------------------------------------------------------------------- 1 | Embed 2 | ===== 3 | 4 | .. autoclass:: sycamore.transforms.embed.BedrockEmbedder 5 | :members: 6 | :show-inheritance: 7 | .. autoclass:: sycamore.transforms.embed.BedrockEmbeddingModels 8 | :members: 9 | :show-inheritance: 10 | .. autoclass:: sycamore.transforms.embed.Embed 11 | :members: 12 | :show-inheritance: 13 | .. autoclass:: sycamore.transforms.embed.OpenAIEmbedder 14 | :members: 15 | :show-inheritance: 16 | .. autoclass:: sycamore.transforms.embed.OpenAIEmbeddingModels 17 | :members: 18 | :show-inheritance: 19 | .. autoclass:: sycamore.transforms.embed.SentenceTransformerEmbedder 20 | :members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /docs/source/sycamore/APIs/low_level_transforms/explode.rst: -------------------------------------------------------------------------------- 1 | Explode 2 | ======= 3 | 4 | .. 
autoclass:: sycamore.transforms.explode.Explode 5 | :members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /docs/source/sycamore/APIs/low_level_transforms/extract_entity.rst: -------------------------------------------------------------------------------- 1 | Extract Entity 2 | ============== 3 | 4 | .. autoclass:: sycamore.transforms.extract_entity.ExtractEntity 5 | :members: 6 | :show-inheritance: 7 | .. autoclass:: sycamore.transforms.extract_entity.OpenAIEntityExtractor 8 | :members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/source/sycamore/APIs/low_level_transforms/extract_schema.rst: -------------------------------------------------------------------------------- 1 | Extract Schema 2 | ============== 3 | 4 | .. autoclass:: sycamore.transforms.extract_schema.ExtractBatchSchema 5 | :members: 6 | :show-inheritance: 7 | .. autoclass:: sycamore.transforms.extract_schema.ExtractProperties 8 | :members: 9 | :show-inheritance: 10 | .. autoclass:: sycamore.transforms.extract_schema.ExtractSchema 11 | :members: 12 | :show-inheritance: 13 | .. autoclass:: sycamore.transforms.extract_schema.LLMPropertyExtractor 14 | :members: 15 | :show-inheritance: 16 | .. autoclass:: sycamore.transforms.extract_schema.OpenAIPropertyExtractor 17 | :members: 18 | :show-inheritance: 19 | .. autoclass:: sycamore.transforms.extract_schema.OpenAISchemaExtractor 20 | :members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /docs/source/sycamore/APIs/low_level_transforms/extract_table.rst: -------------------------------------------------------------------------------- 1 | Extract Table 2 | ============= 3 | 4 | .. autoclass:: sycamore.transforms.extract_table.CachedTextractTableExtractor 5 | :members: 6 | :show-inheritance: 7 | .. 
autoclass:: sycamore.transforms.extract_table.MissingS3UploadPath 8 | :members: 9 | :show-inheritance: 10 | .. autoclass:: sycamore.transforms.extract_table.TextractTableExtractor 11 | :members: 12 | :show-inheritance: 13 | -------------------------------------------------------------------------------- /docs/source/sycamore/APIs/low_level_transforms/extract_table_properties.rst: -------------------------------------------------------------------------------- 1 | Extract Table Properties 2 | ============= 3 | 4 | .. autoclass:: sycamore.transforms.extract_table_properties.ExtractTableProperties 5 | :members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /docs/source/sycamore/APIs/low_level_transforms/llm_map.rst: -------------------------------------------------------------------------------- 1 | .. _Ref-Data: 2 | 3 | LLM Map 4 | ====== 5 | 6 | .. autoclass:: sycamore.transforms.base_llm.LLMMap 7 | :members: 8 | :show-inheritance: 9 | 10 | .. autoclass:: sycamore.transforms.base_llm.LLMMapElements 11 | :members: 12 | :show-inheritance: 13 | -------------------------------------------------------------------------------- /docs/source/sycamore/APIs/low_level_transforms/llm_query.rst: -------------------------------------------------------------------------------- 1 | LLM Query 2 | ============= 3 | 4 | .. autoclass:: sycamore.transforms.llm_query.LLMTextQueryAgent 5 | :members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /docs/source/sycamore/APIs/low_level_transforms/map.rst: -------------------------------------------------------------------------------- 1 | Map 2 | === 3 | 4 | .. autoclass:: sycamore.transforms.map.FlatMap 5 | :members: 6 | :show-inheritance: 7 | .. autoclass:: sycamore.transforms.map.Map 8 | :members: 9 | :show-inheritance: 10 | .. 
autoclass:: sycamore.transforms.map.MapBatch 11 | :members: 12 | :show-inheritance: 13 | -------------------------------------------------------------------------------- /docs/source/sycamore/APIs/low_level_transforms/mark_misc.rst: -------------------------------------------------------------------------------- 1 | Mark Misc 2 | ========= 3 | 4 | .. autoclass:: sycamore.transforms.mark_misc.MarkBreakByTokens 5 | :members: 6 | :show-inheritance: 7 | .. autoclass:: sycamore.transforms.mark_misc.MarkBreakPage 8 | :members: 9 | :show-inheritance: 10 | .. autoclass:: sycamore.transforms.mark_misc.MarkDropTiny 11 | :members: 12 | :show-inheritance: 13 | -------------------------------------------------------------------------------- /docs/source/sycamore/APIs/low_level_transforms/merge_elements.rst: -------------------------------------------------------------------------------- 1 | Merge Elements 2 | ============== 3 | 4 | .. autoclass:: sycamore.transforms.merge_elements.ElementMerger 5 | :members: 6 | :show-inheritance: 7 | .. autoclass:: sycamore.transforms.merge_elements.GreedySectionMerger 8 | :members: 9 | :show-inheritance: 10 | .. autoclass:: sycamore.transforms.merge_elements.GreedyTextElementMerger 11 | :members: 12 | :show-inheritance: 13 | .. autoclass:: sycamore.transforms.merge_elements.MarkedMerger 14 | :members: 15 | :show-inheritance: 16 | .. autoclass:: sycamore.transforms.merge_elements.TableMerger 17 | :members: 18 | :show-inheritance: 19 | .. autoclass:: sycamore.transforms.merge_elements.HeaderAugmenterMerger 20 | :members: 21 | :show-inheritance: 22 | .. autoclass:: sycamore.transforms.merge_elements.Merge 23 | :members: 24 | :show-inheritance: 25 | -------------------------------------------------------------------------------- /docs/source/sycamore/APIs/low_level_transforms/partition.rst: -------------------------------------------------------------------------------- 1 | Partition 2 | ========= 3 | 4 | .. 
autoclass:: sycamore.transforms.partition.HtmlPartitioner 5 | :members: 6 | :show-inheritance: 7 | .. autoclass:: sycamore.transforms.partition.Partition 8 | :members: 9 | :show-inheritance: 10 | .. autoclass:: sycamore.transforms.partition.ArynPartitioner 11 | :members: 12 | :show-inheritance: 13 | .. autoclass:: sycamore.transforms.partition.SycamorePartitioner 14 | :members: 15 | :show-inheritance: 16 | .. autoclass:: sycamore.transforms.partition.UnstructuredPPTXPartitioner 17 | :members: 18 | :show-inheritance: 19 | .. autoclass:: sycamore.transforms.partition.UnstructuredPdfPartitioner 20 | :members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /docs/source/sycamore/APIs/low_level_transforms/query.rst: -------------------------------------------------------------------------------- 1 | Query 2 | ===== 3 | 4 | .. autoclass:: sycamore.transforms.query.OpenSearchQueryExecutor 5 | :members: 6 | :show-inheritance: 7 | .. autoclass:: sycamore.transforms.query.Query 8 | :members: 9 | :show-inheritance: 10 | -------------------------------------------------------------------------------- /docs/source/sycamore/APIs/low_level_transforms/random_sample.rst: -------------------------------------------------------------------------------- 1 | Random Sample 2 | ============= 3 | 4 | .. autoclass:: sycamore.transforms.random_sample.RandomSample 5 | :members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /docs/source/sycamore/APIs/low_level_transforms/regex_replace.rst: -------------------------------------------------------------------------------- 1 | Regex Replace 2 | ============= 3 | 4 | .. 
Standardizer
============
autoclass:: sycamore.transforms.standardizer.USStateStandardizer 14 | :members: 15 | :show-inheritance: 16 | -------------------------------------------------------------------------------- /docs/source/sycamore/APIs/low_level_transforms/summarize.rst: -------------------------------------------------------------------------------- 1 | Summarize 2 | ========= 3 | 4 | .. autoclass:: sycamore.transforms.summarize.Summarizer 5 | :members: 6 | :show-inheritance: 7 | .. autoclass:: sycamore.transforms.summarize.LLMElementTextSummarizer 8 | :members: 9 | :show-inheritance: 10 | .. autoclass:: sycamore.transforms.summarize.MultiStepDocumentSummarizer 11 | :members: 12 | :show-inheritance: 13 | .. py:data:: sycamore.transforms.summarize.MaxTokensHierarchyPrompt 14 | .. autoclass:: sycamore.transforms.summarize.OneStepDocumentSummarizer 15 | :members: 16 | :show-inheritance: 17 | .. py:data:: sycamore.transforms.summarize.OneStepSummarizerPrompt 18 | .. autoclass:: sycamore.transforms.summarize.EtCetera 19 | .. autoclass:: sycamore.transforms.summarize.Summarize 20 | :members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /docs/source/sycamore/APIs/low_level_transforms/summarize_images.rst: -------------------------------------------------------------------------------- 1 | Summarize Images 2 | ================ 3 | 4 | .. autoclass:: sycamore.transforms.summarize_images.LLMImageSummarizer 5 | :members: 6 | :show-inheritance: 7 | .. autoclass:: sycamore.transforms.summarize_images.GeminiImageSummarizer 8 | :members: 9 | :show-inheritance: 10 | .. autoclass:: sycamore.transforms.summarize_images.OpenAIImageSummarizer 11 | :members: 12 | :show-inheritance: 13 | .. 
autoclass:: sycamore.transforms.summarize_images.SummarizeImages 14 | :members: 15 | :show-inheritance: 16 | -------------------------------------------------------------------------------- /docs/source/sycamore/APIs/node.rst: -------------------------------------------------------------------------------- 1 | .. _Ref-Data: 2 | 3 | Node 4 | ================ 5 | 6 | .. automodule:: sycamore.plan_nodes 7 | :members: 8 | -------------------------------------------------------------------------------- /docs/source/sycamore/APIs/prompts.rst: -------------------------------------------------------------------------------- 1 | Prompts 2 | ======= 3 | 4 | .. autoclass:: sycamore.llms.prompts.prompts.RenderedPrompt 5 | :members: 6 | :show-inheritance: 7 | .. autoclass:: sycamore.llms.prompts.prompts.RenderedMessage 8 | :members: 9 | :show-inheritance: 10 | .. autovariable:: sycamore.llms.prompts.prompts.ResponseFormat 11 | .. autoclass:: sycamore.llms.prompts.prompts.SycamorePrompt 12 | :members: 13 | :show-inheritance: 14 | .. autoclass:: sycamore.llms.prompts.prompts.JinjaPrompt 15 | :members: 16 | :show-inheritance: 17 | .. autoclass:: sycamore.llms.prompts.prompts.JinjaElementPrompt 18 | :members: 19 | :show-inheritance: 20 | .. autoclass:: sycamore.llms.prompts.prompts.StaticPrompt 21 | :members: 22 | :show-inheritance: 23 | .. autoclass:: sycamore.llms.prompts.prompts.ElementPrompt 24 | :members: 25 | :show-inheritance: 26 | -------------------------------------------------------------------------------- /docs/source/sycamore/APIs/query.rst: -------------------------------------------------------------------------------- 1 | .. _Ref-Query: 2 | 3 | Query 4 | ============== 5 | 6 | This package allows you to build sophisticated LLM query-powered pipelines using Sycamore. 7 | 8 | .. automodule:: sycamore.query.client 9 | :members: 10 | 11 | .. automodule:: sycamore.query.planner 12 | :members: 13 | 14 | .. 
automodule:: sycamore.query.logical_plan 15 | :members: 16 | 17 | 18 | -------------------------------------------------------------------------------- /docs/source/sycamore/connectors.rst: -------------------------------------------------------------------------------- 1 | Connectors 2 | ============= 3 | 4 | Writing a DocSet to Target Data Stores 5 | -------------------------------------- 6 | 7 | A final step in a Sycamore processing job is to load the data into a target database for use in your application. This could be a combination of a vector index and term-based index, and includes the enriched metadata from the job. Currently, Pinecone, Weaviate, Elasticsearch, DuckDB, Opensearch and Qdrant are supported as target databases. Users can access this from a unified write interface, from where they can specify their databases and the respective connection and write arguments. 8 | 9 | Further information for each supported database and its relevant documentation is given below: 10 | 11 | .. toctree:: 12 | :maxdepth: 1 13 | 14 | ./connectors/duckdb.md 15 | ./connectors/weaviate.md 16 | ./connectors/pinecone.md 17 | ./connectors/elasticsearch.md 18 | ./connectors/opensearch.md 19 | ./connectors/neo4j.md 20 | ./connectors/qdrant.md 21 | -------------------------------------------------------------------------------- /docs/source/sycamore/get_started/ai_configuration.md: -------------------------------------------------------------------------------- 1 | # AI Configuration 2 | 3 | ## Aryn Partitioner 4 | 5 | The Aryn Partitioner is the recommended way to process PDF documents in Sycamore, and it uses a state-of-the-art, open source deep learning AI model trained on 80k+ enterprise documents. By default, it's configured to use [Aryn DocParse](https://docs.aryn.ai/quickstart), and you will need to set your API key. You can sign-up for free [here](https://www.aryn.ai/get-started) to get an API key for the service. 
6 | 7 | 8 | ## LLM-based Transforms 9 | Sycamore brings generative AI to a variety of stages in an ETL pipeline. You can choose different LLMs for entity extraction, schema extraction, and more. Currently, Sycamore supports OpenAI and Amazon Bedrock, and you will need to set your credentials for these services for Sycamore to use. 10 | 11 | Information on supported generative AI models for each operation are in the specific documentation for it: 12 | 13 | * [Entity extraction](./transforms/extract_entity.md) 14 | * [Schema extraction](./transforms/extract_schema.md) 15 | * [Summarize](./transforms/summarize.md) 16 | 17 | ## Creating Vector Embeddings 18 | A final step before loading a vector database is creating vector embeddings for your data. Currently, Sycamore supports OpenAI and Amazon Bedrock, and you will need to set your credentials for these services for Sycamore to use. 19 | 20 | For more information on creating embeddings, visit [here](./transforms/embed.md). -------------------------------------------------------------------------------- /docs/source/sycamore/get_started/hardware.md: -------------------------------------------------------------------------------- 1 | # Recommended resources to run Sycamore 2 | 3 | We recommend at least 4GB of RAM to run Sycamore. 4 | 5 | We recommend using the Aryn Partitioner for the highest quality PDF processing. By default, the Aryn Partitioner is configured to use Aryn DocParse, a GPU-backed serverless endpoint. You can sign up for free [here](https://www.aryn.ai/get-started), and use it with your Sycamore job. 6 | 7 | If you choose to configure your Aryn Partitioner to run locally, your performance will vary without using an NVIDIA GPU. 
-------------------------------------------------------------------------------- /docs/source/sycamore/querying_data/using_aryn_opensearch_stack/APIs/conversation_memory.rst: -------------------------------------------------------------------------------- 1 | Conversation Memory 2 | ============ 3 | 4 | These APIs are used for interacting with the conversation memory feature set directly. For more information, visit the :doc:`conversation memory overview<../conversation_memory/overview.md>` page in the documentation. 5 | 6 | .. toctree:: 7 | :maxdepth: 2 8 | 9 | /APIs/conversation_memory/functions.md 10 | -------------------------------------------------------------------------------- /docs/source/sycamore/querying_data/using_aryn_opensearch_stack/conversation_memory/imgs/ConversationMemoryMultiAgent.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/docs/source/sycamore/querying_data/using_aryn_opensearch_stack/conversation_memory/imgs/ConversationMemoryMultiAgent.jpg -------------------------------------------------------------------------------- /docs/source/sycamore/querying_data/using_aryn_opensearch_stack/conversation_memory/imgs/ConversationMemoryMultiAgent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/docs/source/sycamore/querying_data/using_aryn_opensearch_stack/conversation_memory/imgs/ConversationMemoryMultiAgent.png -------------------------------------------------------------------------------- /docs/source/sycamore/querying_data/using_aryn_opensearch_stack/conversation_memory/imgs/resource-diagram.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/docs/source/sycamore/querying_data/using_aryn_opensearch_stack/conversation_memory/imgs/resource-diagram.png -------------------------------------------------------------------------------- /docs/source/sycamore/querying_data/using_aryn_opensearch_stack/conversation_memory/storage_for_genai_agents.md: -------------------------------------------------------------------------------- 1 | # Storage for Generative AI Agents 2 | 3 | Developers use generative AI models to do all sorts of tasks, and can combine these models through a technique called Chain-of-Thought (CoT) reasoning. Instead of asking an LLM to solve a problem, you tell it what tools it has at its disposal and ask it to walk through the steps it would take to solve the problem. This LLM can communicate with other generative AI models, called Agents, to carry out these specific tasks. In this way, you can break down complicated tasks into specific tasks, with each task type carried out by a specific agent. 4 | 5 | 6 | ![Untitled](imgs/ConversationMemoryMultiAgent.jpg) 7 | 8 | 9 | With multi-agent applications that are driven by natural language requests, it is essential to have a single source of truth for the conversation history. Multiple agents should be able to read the history of the same conversation, know where each interaction came from, and add their interactions in the CoT pipeline. Not only does this allow an agent to use the context of previous interactions, but also to reference the logic and authority of other agents involved in the conversation. 10 | 11 | Sycamore's conversation memory APIs can be used to store this data for generative AI agents. For an example, visit [this tutorial](../tutorials/conversational_memory_with_langchain.md). 
12 | -------------------------------------------------------------------------------- /docs/source/sycamore/querying_data/using_aryn_opensearch_stack/demo_query_ui.md: -------------------------------------------------------------------------------- 1 | ## Using the Demo Query UI 2 | 3 | Sycamore includes a simple UI to submit queries and display retreival-augmented generation (RAG) answers and hybrid search results. The UI is deployed in the container `sycamore-demo-ui` and is accessed at `http://localhost:3000`. It is an easy way to test the answer quality from your Sycamore stack, and how iterating with your data preparation and enrichment affects the overal quality. 4 | 5 | You can create a new conversation by entering the name in the text box in the "Conversations" panel and hit enter. Make sure the conversation is selected in the left panel. Then, you can submit questions in the middle panel. 6 | 7 | The screenshot below shows the demo query UI being used to answer questions on a 10-K document (public financial data): 8 | 9 | ![Untitled](imgs/xlarge_DemoUI_FollowUpQuestion.png) 10 | -------------------------------------------------------------------------------- /docs/source/sycamore/querying_data/using_aryn_opensearch_stack/imgs/pipeline-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/docs/source/sycamore/querying_data/using_aryn_opensearch_stack/imgs/pipeline-architecture.png -------------------------------------------------------------------------------- /docs/source/sycamore/querying_data/using_aryn_opensearch_stack/imgs/xlarge_DemoUI_FollowUpQuestion.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/docs/source/sycamore/querying_data/using_aryn_opensearch_stack/imgs/xlarge_DemoUI_FollowUpQuestion.png -------------------------------------------------------------------------------- /docs/source/sycamore/transforms.rst: -------------------------------------------------------------------------------- 1 | Transforms 2 | ============ 3 | 4 | In Sycamore, a transform is a method that operates on a ``DocSet`` and returns a new ``DocSet``. Sycamore provides a number of these transforms directly in the ``DocSet`` class to prepare and enhance your unstructured data. In order to support a variety of data types and machine learning models, many of these transforms are customizable with different implementations. 5 | 6 | 7 | .. toctree:: 8 | :maxdepth: 1 9 | 10 | ./transforms/embed.md 11 | ./transforms/explode.md 12 | ./transforms/extract_entity.md 13 | ./transforms/extract_schema.md 14 | ./transforms/filter.md 15 | ./transforms/flatmap.md 16 | ./transforms/llm_query.md 17 | ./transforms/map.md 18 | ./transforms/map_batch.md 19 | ./transforms/materialize.md 20 | ./transforms/merge.md 21 | ./transforms/partition.md 22 | ./transforms/sketch.md 23 | ./transforms/summarize.md 24 | -------------------------------------------------------------------------------- /docs/source/sycamore/transforms/explode.md: -------------------------------------------------------------------------------- 1 | ## Explode 2 | 3 | The Explode transform converts the elements of each document into top-level documents. For example, if you explode a ``DocSet`` with a single document containing two elements, the resulting ``DocSet`` will have three documents -- the original plus a new ``Document`` for each of the elements. 
4 | 5 | ```python 6 | exploded_doc_set = docset.explode() 7 | ``` 8 | 9 | The primary use of the explode transform is to embed and ingest chunks of your document, the elements, as independent records in a data store like OpenSearch. 10 | -------------------------------------------------------------------------------- /docs/source/sycamore/transforms/extract_entity.md: -------------------------------------------------------------------------------- 1 | ## ExtractEntity 2 | The Extract Entity Transform extracts semantically meaningful information from your documents. The ``OpenAIEntityExtractor`` leverages one of OpenAI's LLMs to perform this extraction with just a few examples. These extracted entities are then incorporated as properties into the document structure. The following code shows how to provide an example template for extracting a title using the gpt-3.5-turbo model. 3 | 4 | ```python 5 | openai_llm = OpenAI(OpenAIModels.GPT_3_5_TURBO.value) 6 | title_prompt_template = """ 7 | ELEMENT 1: Jupiter's Moons 8 | ELEMENT 2: Ganymede 2020 9 | ELEMENT 3: by Audi Lauper and Serena K. Goldberg. 2011 10 | ELEMENT 4: From Wikipedia, the free encyclopedia 11 | ELEMENT 5: Ganymede, or Jupiter III, is the largest and most massive natural satellite of Jupiter as well as in the Solar System, being a planetary-mass moon. It is the largest Solar System object without an atmosphere, despite being the only moon of the Solar System with a magnetic field. Like Titan, it is larger than the planet Mercury, but has somewhat less surface gravity than Mercury, Io or the Moon. 
12 | ========= 13 | "Ganymede 2020 14 | """ 15 | 16 | docset = docset.extract_entity(entity_extractor=OpenAIEntityExtractor("title", llm=openai_llm, prompt_template=title_prompt_template)) 17 | ``` 18 | -------------------------------------------------------------------------------- /docs/source/sycamore/transforms/extract_schema.md: -------------------------------------------------------------------------------- 1 | ## Extract Schema 2 | The Extract Schema Transform allows you to extract a semantically meaningful schema for your documents. These schemas can then by populated using the Extract Properties transform. 3 | 4 | The first step is to use Extract Schema to associate each document with a schema. Here, a Schema is a set of metadata from your document. For example, given a credit card agreement PDF, we extract the following: 5 | 6 | ```python 7 | credit_docs = credit_docs.extract_batch_schema( 8 | schema_extractor=OpenAISchemaExtractor("CreditCardContract", llm=openai, num_of_elements=50) 9 | ) 10 | ``` 11 | Which will produce JSON-schema formatted metadata, stored in each Document's `properties["_schema"]`: 12 | ```json 13 | { 14 | "type": "object", 15 | "properties": { 16 | "creditCardName": { 17 | "type": "string" 18 | }, 19 | "aprPurchases": { 20 | "type": "string" 21 | }, 22 | "annualFee": { 23 | "type": "string" 24 | } 25 | }, 26 | } 27 | ``` 28 | 29 | Once a schema is extracted, we can populate the values using the Extract Properties transform. 
30 | ```python 31 | credit_docs.extract_properties(property_extractor=OpenAIPropertyExtractor(llm=openai, num_of_elements=50)) 32 | ``` 33 | The values will be extracted from the document: 34 | ```json 35 | {"creditCardName": "Dollar Bank Secured Credit Card - Variable Rate Line of Credit Agreement", 36 | "aprPurchases": "12.24%", 37 | "annualFee": "$15.00"}, 38 | ``` 39 | -------------------------------------------------------------------------------- /docs/source/sycamore/transforms/flatmap.md: -------------------------------------------------------------------------------- 1 | ## FlatMap 2 | The FlatMap transform takes a function from a single ``Document`` to a list of ``Documents``, and returns then "flattens" the result into a single ``DocSet``. In the following example, the FlatMap transform outputs a new list of documents 3 | where each document includes elements from a single page only. 4 | ```python 5 | def split_and_convert_to_image(doc: Document) -> list[Document]: 6 | if doc.binary_representation is not None: 7 | images = pdf2image.convert_from_bytes(doc.binary_representation) 8 | else: 9 | return [doc] 10 | 11 | elements_by_page: dict[int, list[Element]] = {} 12 | 13 | for e in doc.elements: 14 | page_number = e.properties["page_number"] 15 | elements_by_page.setdefault(page_number, []).append(e) 16 | 17 | new_docs = [] 18 | for page, elements in elements_by_page.items(): 19 | new_doc = Document(elements={"array": elements}) 20 | new_doc.properties.update(doc.properties) 21 | new_doc.properties.update({"page_number": page}) 22 | new_docs.append(new_doc) 23 | return new_docs 24 | 25 | docset = docset.flat_map(split_and_convert_to_image) 26 | ``` 27 | -------------------------------------------------------------------------------- /docs/source/sycamore/transforms/map.md: -------------------------------------------------------------------------------- 1 | ## Map 2 | The Map transform takes a function that takes a ``Document`` and returns a ``Document``, 3 
| and applies it to each document in the ``DocSet``. 4 | -------------------------------------------------------------------------------- /docs/source/sycamore/transforms/map_batch.md: -------------------------------------------------------------------------------- 1 | ## MapBatch 2 | The MapBatch transform is similar to ``Map``, except that it processes a list of documents and returns a list of documents. ``MapBatches`` is ideal for transformations that get performance benefits from batching. 3 | -------------------------------------------------------------------------------- /docs/source/sycamore/transforms/sketch.md: -------------------------------------------------------------------------------- 1 | ## Sketch 2 | The `sketch` transform adds metadata to each Document containing a sketch that can be used to identify near-duplicate documents. This process is the prerequisite for later removing or collapsing near-duplicate documents. Currently, the sketch consists of a set of hash values called `shingles`. These are relatively inexpensive to calculate and can safely be a default part of any ingestion pipeline. Using `sketch` in a Sycamore data prep pipeline is relatively easy: 3 | 4 | ```python 5 | docset = (context.read.binary(...) 6 | .partition(...) 7 | .explode() 8 | .sketch() 9 | .embed(...)) 10 | ``` 11 | 12 | Query-time de-duplication is explained [here](../querying_data/dedup.md). For more information, see the documentation for [Sketcher](../../APIs/transforms/sketcher.html#sycamore.transforms.sketcher.Sketcher). 13 | -------------------------------------------------------------------------------- /docs/source/sycamore/transforms/summarize.md: -------------------------------------------------------------------------------- 1 | ## Summarize 2 | Similar to the extract entity transform, the summarize transform generates summaries of documents or elements. The ``LLMElementTextSummarizer`` summarizes a subset of the elements from each Document. 
It takes an LLM implementation and a callable specifying the subset of elements to summarize. The following example shows how to use this transform to summarize elements that are longer than a certain length. 3 | 4 | ```python 5 | def filter_elements_on_length( 6 | document: Document, 7 | minimum_length: int = 10, 8 | ) -> list[Element]: 9 | def filter_func(element: Element): 10 | if element.text_representation is not None: 11 | return len(element.text_representation) > minimum_length 12 | 13 | return filter_elements(document, filter_func) 14 | 15 | llm = OpenAI(OpenAIModels.GPT_3_5_TURBO.value) 16 | 17 | docset = docset.summarize(LLMElementTextSummarizer(llm, filter_elements_on_length)) 18 | ``` 19 | -------------------------------------------------------------------------------- /docs/source/sycamore/tutorials.rst: -------------------------------------------------------------------------------- 1 | Vector Database Ingestion Examples 2 | ============= 3 | 4 | 5 | Now that you've learned about Sycamore concepts, transforms, and connectors, let's put it all together with some tutorials showing how to write Sycamore processing jobs. 6 | 7 | Some tutorials are located below, and visit the `Aryn blog `_ for more examples. 8 | 9 | .. toctree:: 10 | :maxdepth: 1 11 | 12 | ./tutorials/etl_pinecone_tutorial.md 13 | ./tutorials/etl_for_opensearch.md 14 | ./tutorials/etl_for_weaviate_tutorial.md 15 | -------------------------------------------------------------------------------- /docs/source/sycamore/tutorials/etl_for_weaviate_tutorial.md: -------------------------------------------------------------------------------- 1 | # Loading Weaviate with Sycamore 2 | 3 | [This tutorial](https://github.com/aryn-ai/sycamore/blob/main/notebooks/weaviate-writer.ipynb) shows how to create an ETL pipeline with Sycamore to load a Weaviate vector database. It walks through how to use Sycamore to partition, extract, clean, chunk, embed, and load your data. 
You will need an [Aryn Cloud API key](https://www.aryn.ai/get-started) and an [OpenAI API key](https://platform.openai.com/signup) (for LLM-powered data enrichment and creating vector embeddings). At the time of writing, there are free trial or free tier options for all of these services. 4 | 5 | Run this tutorial [locally with Jupyter](https://github.com/aryn-ai/sycamore/blob/main/notebooks/weaviate-writer.ipynb). 6 | 7 | -------------------------------------------------------------------------------- /docs/source/sycamore/tutorials/etl_pinecone_tutorial.md: -------------------------------------------------------------------------------- 1 | # Loading Pinecone with Sycamore 2 | 3 | [This tutorial](https://colab.research.google.com/drive/1oWi50uqJafBDmLWNO4QFEbiotnU7o75B) is meant to show how to create an ETL pipeline with Sycamore to load a Pinecone vector database. It walks through an intermediate-level ETL flow: partitioning, extraction, cleaning, chunking, embedding, and loading. You will need an [Aryn Cloud API key](https://www.aryn.ai/get-started), [OpenAI API key](https://platform.openai.com/signup) (for LLM-powered data enrichment and creating vector embeddings), and a [Pinecone API key](https://app.pinecone.io/?sessionType=signup) (for creating and using a vector index). At the time of writing, there are free trial or free tier options for all of these services. 4 | 5 | Run this tutorial in a [Colab notebook](https://colab.research.google.com/drive/1oWi50uqJafBDmLWNO4QFEbiotnU7o75B) or [locally with Jupyter](https://github.com/aryn-ai/sycamore/blob/main/notebooks/sycamore-tutorial-intermediate-etl.ipynb). 6 | 7 | Once you have your data loaded in Pinecone, you can use Pinecone's query features for semantic search or a framework like Langchain for RAG. The [Pinecone Writer example notebook](https://github.com/aryn-ai/sycamore/blob/main/notebooks/pinecone-writer.ipynb) has sample Langchain code at the end of the notebook. 
8 | -------------------------------------------------------------------------------- /docs/source/sycamore/using_jupyter.md: -------------------------------------------------------------------------------- 1 | # Using Jupyter notebooks 2 | 3 | Using a [Jupyter notebook](https://jupyter.org/) makes it easy to write and iterate on Sycamore ETL and data prep code. 4 | 5 | For instructions on how to install and configure Jupyter locally for Sycamore, [click here](../tutorials/sycamore-jupyter-dev-example.md#install-jupyter-locally). For an example script, [click here](https://github.com/aryn-ai/sycamore/blob/main/notebooks/jupyter_dev_example.ipynb). 6 | -------------------------------------------------------------------------------- /examples/markdown.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import sycamore 4 | from sycamore.transforms.partition import ArynPartitioner 5 | 6 | docs = ( 7 | sycamore.init(exec_mode=sycamore.EXEC_LOCAL) 8 | .read.binary(sys.argv[1:], binary_format="pdf") 9 | .partition(partitioner=ArynPartitioner(extract_table_structure=True, use_partitioning_service=False)) 10 | .markdown() 11 | .explode() 12 | .take() 13 | ) 14 | 15 | print(docs[1].text_representation) # doc zero isn't real 16 | -------------------------------------------------------------------------------- /examples/ndd_debug.py: -------------------------------------------------------------------------------- 1 | # An example script that exercises the near duplicate detection code. Useful for trying a larger 2 | # execution than the unit tests to look for memory usage problems. 
3 | 4 | import sys 5 | import pyarrow.fs 6 | import os 7 | 8 | # ruff: noqa: E402 9 | root_dir = os.path.normpath(os.path.dirname(__file__) + "/..") 10 | sys.path.append(root_dir + "/lib/sycamore") 11 | 12 | import sycamore 13 | from sycamore.functions.tokenizer import HuggingFaceTokenizer 14 | from sycamore.transforms import COALESCE_WHITESPACE 15 | from sycamore.transforms.merge_elements import MarkedMerger 16 | from sycamore.transforms.partition import UnstructuredPdfPartitioner 17 | from sycamore.transforms.sketcher import SketchDebug 18 | 19 | 20 | paths = ["s3://aryn-public/ntsb/"] 21 | fsys = pyarrow.fs.S3FileSystem(region="us-east-1", anonymous=True) 22 | 23 | tokenizer = HuggingFaceTokenizer("thenlper/gte-small") 24 | 25 | ctx = sycamore.init(exec_mode=sycamore.EXEC_LOCAL) 26 | 27 | ds = ( 28 | ctx.read.binary(paths, binary_format="pdf", filesystem=fsys) 29 | .materialize("tmp/ndd_debug_read", source_mode=sycamore.MATERIALIZE_USE_STORED) 30 | .partition(partitioner=UnstructuredPdfPartitioner()) 31 | .regex_replace(COALESCE_WHITESPACE) 32 | .mark_bbox_preset(tokenizer=tokenizer) 33 | .merge(merger=MarkedMerger()) 34 | .spread_properties(["path", "title"]) 35 | .split_elements(tokenizer=tokenizer, max_tokens=512) 36 | .explode() 37 | .sketch() 38 | .transform(SketchDebug) 39 | ) 40 | 41 | res = ds.take_all() 42 | print(len(res)) 43 | -------------------------------------------------------------------------------- /examples/query/simple_ntsb.py: -------------------------------------------------------------------------------- 1 | from sycamore.query.client import SycamoreQueryClient 2 | from rich.console import Console 3 | 4 | console = Console() 5 | client = SycamoreQueryClient() 6 | 7 | 8 | OS_INDEX = "const_ntsb" 9 | QUERY = "How many airplane incidents were there in Washington in 2023?" 
10 | 11 | schema = client.get_opensearch_schema(OS_INDEX) 12 | # console.print(schema) 13 | 14 | plan = client.generate_plan(QUERY, OS_INDEX, schema) 15 | # from sycamore.query.visualize import visualize_plan 16 | # visualize_plan(plan) 17 | 18 | # WARNING: As of 2024-09-03, the results are inconsistent; you can get different results 19 | # because of differences in the generated query plan and as a result of differences in the 20 | # processing of the pipeline. 21 | query_id, result = client.run_plan(plan) 22 | console.rule("Query result") 23 | console.print(result) 24 | -------------------------------------------------------------------------------- /examples/simple_duckdb.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import sycamore 4 | from sycamore.functions.tokenizer import HuggingFaceTokenizer 5 | from sycamore.llms.openai import OpenAI, OpenAIModels 6 | from sycamore.transforms import COALESCE_WHITESPACE 7 | from sycamore.transforms.merge_elements import MarkedMerger 8 | from sycamore.transforms.partition import SycamorePartitioner 9 | from sycamore.transforms.extract_entity import OpenAIEntityExtractor 10 | from sycamore.transforms.embed import SentenceTransformerEmbedder 11 | from sycamore.tests.config import TEST_DIR 12 | from simple_config import title_template 13 | 14 | sys.path.append("../sycamore") 15 | 16 | table_name = "duckdb_table" 17 | db_url = "tmp.db" 18 | model_name = "sentence-transformers/all-MiniLM-L6-v2" 19 | paths = str(TEST_DIR / "resources/data/pdfs/") 20 | davinci_llm = OpenAI(OpenAIModels.GPT_3_5_TURBO_INSTRUCT.value) 21 | 22 | tokenizer = HuggingFaceTokenizer(model_name) 23 | 24 | ctx = sycamore.init() 25 | 26 | ds = ( 27 | ctx.read.binary(paths, binary_format="pdf") 28 | .partition(partitioner=SycamorePartitioner()) 29 | .regex_replace(COALESCE_WHITESPACE) 30 | .extract_entity(entity_extractor=OpenAIEntityExtractor("title", llm=davinci_llm, 
prompt_template=title_template)) 31 | .mark_bbox_preset(tokenizer=tokenizer) 32 | .merge(merger=MarkedMerger()) 33 | .spread_properties(["path"]) 34 | .split_elements(tokenizer=tokenizer, max_tokens=512) 35 | .explode() 36 | .embed(embedder=SentenceTransformerEmbedder(model_name=model_name, batch_size=100)) 37 | ) 38 | ds_count = ds.count() 39 | ds.write.duckdb(table_name=table_name, db_url=db_url) 40 | -------------------------------------------------------------------------------- /lib/import_timer/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/import_timer/README.md -------------------------------------------------------------------------------- /lib/import_timer/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "import_timer" 3 | version = "0.0.1" 4 | description = "Import timing library" 5 | authors = ["aryn.ai "] 6 | license = "Apache 2.0" 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | python = ">=3.9" 11 | 12 | [build-system] 13 | requires = ["poetry-core"] 14 | build-backend = "poetry.core.masonry.api" 15 | -------------------------------------------------------------------------------- /lib/poetry-lock/README.md: -------------------------------------------------------------------------------- 1 | A fake meta-package to make it easier to force different poetry directories to lock to the same version. 
2 | -------------------------------------------------------------------------------- /lib/poetry-lock/sycamore_poetry_lock/noop.py: -------------------------------------------------------------------------------- 1 | pass 2 | -------------------------------------------------------------------------------- /lib/remote-processors/Makefile: -------------------------------------------------------------------------------- 1 | help: 2 | @echo "install_rps: installs dependencies, builds proto, then installs the package" 3 | @echo "clean: clean up grpc-generated code (by deletion)" 4 | @echo "build_proto: generate code from the .proto files in protocols/proto-remote-processor" 5 | 6 | clean: 7 | -rm remote_processors/*pb2* 8 | 9 | build_proto: 10 | poetry run python -m grpc_tools.protoc -I opensearch-remote-processor/src/main/proto --python_out=remote_processors --pyi_out=remote_processors --grpc_python_out=remote_processors opensearch-remote-processor/src/main/proto/*.proto 11 | # Fix the relative imports 12 | poetry run protol --in-place --python-out remote_processors protoc --proto-path=opensearch-remote-processor/src/main/proto opensearch-remote-processor/src/main/proto/*.proto 13 | 14 | install_rps: 15 | poetry install --no-root 16 | make build_proto 17 | poetry install --only-root 18 | -------------------------------------------------------------------------------- /lib/remote-processors/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "remote-processors" 3 | version = "0.1.0" 4 | description = "A service for hosting search processors extrnal to opensearch" 5 | authors = ["HenryL27 "] 6 | readme = "README.md" 7 | 8 | 9 | [tool.poetry.dependencies] 10 | python = ">=3.9.2,<3.13" 11 | pyyaml = "^6.0.1" 12 | grpcio = "^1.60.0" 13 | cbor2 = "^5.6.0" 14 | sycamore-ai = "^0.1.13" 15 | 16 | [tool.poetry.group.test.dependencies] 17 | pytest = "^7.4" 18 | pytest-mock = "^3.12.0" 19 | docker = "^7.0.0" 
20 | opensearch-py = "^2.4.2" 21 | 22 | [tool.poetry.group.dev.dependencies] 23 | sycamore-ai = { path = "../sycamore", develop = true } 24 | sycamore-poetry-lock = { path = "../../lib/poetry-lock", develop = true } 25 | 26 | [tool.poetry.group.build.dependencies] 27 | grpcio-tools = "^1.60.1" 28 | protoletariat = "^3.2.19" 29 | 30 | [build-system] 31 | requires = ["poetry-core"] 32 | build-backend = "poetry.core.masonry.api" 33 | 34 | 35 | [tool.pytest.ini_options] 36 | markers = [ 37 | "processor_name: marks integ test with name of processor to test", 38 | ] 39 | -------------------------------------------------------------------------------- /lib/remote-processors/remote_processors/__init__.py: -------------------------------------------------------------------------------- 1 | from .search_request import SearchRequest 2 | from .search_response import SearchResponse 3 | from . import processors 4 | from . import server 5 | 6 | __all__ = [ 7 | "SearchResponse", 8 | "SearchRequest", 9 | "processors", 10 | "server", 11 | ] 12 | -------------------------------------------------------------------------------- /lib/remote-processors/remote_processors/processors/__init__.py: -------------------------------------------------------------------------------- 1 | from remote_processors.processors.processor import RequestProcessor, ResponseProcessor 2 | 3 | from remote_processors.processors.debug_processor import DebugResponseProcessor, DebugRequestProcessor 4 | from remote_processors.processors.dedup_processor import DedupResponseProcessor 5 | 6 | __all__ = [ 7 | "RequestProcessor", 8 | "ResponseProcessor", 9 | "DebugRequestProcessor", 10 | "DebugResponseProcessor", 11 | "DedupResponseProcessor", 12 | ] 13 | -------------------------------------------------------------------------------- /lib/remote-processors/remote_processors/processors/processor.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 
| 3 | from remote_processors import SearchRequest 4 | from remote_processors import SearchResponse 5 | 6 | 7 | class ResponseProcessor(ABC): 8 | @staticmethod 9 | @abstractmethod 10 | def from_config(configuration_chunk) -> "ResponseProcessor": 11 | raise NotImplementedError("abstract method `from_config` is not implemented") 12 | 13 | @abstractmethod 14 | def process_response(self, search_request: SearchRequest, search_response: SearchResponse) -> SearchResponse: 15 | raise NotImplementedError("abstract method `process_response` is not implemented") 16 | 17 | @staticmethod 18 | @abstractmethod 19 | def get_class_name() -> str: 20 | raise NotImplementedError("abstract method `get_class_name` is not implemented") 21 | 22 | 23 | class RequestProcessor(ABC): 24 | @staticmethod 25 | @abstractmethod 26 | def from_config(configuration_chunk) -> "RequestProcessor": 27 | raise NotImplementedError("abstract method `from_config` is not implemented") 28 | 29 | @abstractmethod 30 | def process_request(self, search_request: SearchRequest) -> SearchRequest: 31 | raise NotImplementedError("abstract method `process_request` is not implemented") 32 | 33 | @staticmethod 34 | @abstractmethod 35 | def get_class_name() -> str: 36 | raise NotImplementedError("abstract static method `get_class_name` is not implemented") 37 | -------------------------------------------------------------------------------- /lib/remote-processors/remote_processors/search_request.py: -------------------------------------------------------------------------------- 1 | from .search_request_pb2 import SearchRequest 2 | 3 | _ = SearchRequest 4 | -------------------------------------------------------------------------------- /lib/remote-processors/remote_processors/search_response.py: -------------------------------------------------------------------------------- 1 | from .search_response_pb2 import SearchResponse 2 | 3 | _ = SearchResponse 4 | 
-------------------------------------------------------------------------------- /lib/remote-processors/remote_processors/server/__init__.py: -------------------------------------------------------------------------------- 1 | from remote_processors.server.processor_registry import ProcessorRegistry 2 | from remote_processors.server.pipeline import Pipeline 3 | from remote_processors.server.remote_processor_service import RemoteProcessorService 4 | 5 | __all__ = [ 6 | "ProcessorRegistry", 7 | "Pipeline", 8 | "RemoteProcessorService", 9 | ] 10 | -------------------------------------------------------------------------------- /lib/remote-processors/remote_processors/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/remote-processors/remote_processors/test/__init__.py -------------------------------------------------------------------------------- /lib/remote-processors/remote_processors/test/integration/test_integ_debug.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.processor_name("debug") 5 | def test_debug(opensearch_client, upload_jsonl_index, singleton_pipeline): 6 | opensearch_client.search( 7 | index=upload_jsonl_index, params={"search_pipeline": singleton_pipeline}, body={"query": {"match_all": {}}} 8 | ) 9 | -------------------------------------------------------------------------------- /lib/remote-processors/remote_processors/test/resources/configs/malformed/dupe_pipeline_names.yml: -------------------------------------------------------------------------------- 1 | - debug: 2 | processors: 3 | - debug-response: 4 | prefix: "in debug processor" 5 | - debug-response: 6 | prefix: "in second debug processor" 7 | - debug-response: 8 | - debug: 9 | processors: 10 | - debug-response: 11 | 
-------------------------------------------------------------------------------- /lib/remote-processors/remote_processors/test/resources/configs/malformed/not_a_list.yml: -------------------------------------------------------------------------------- 1 | debug: 2 | processors: 3 | - debug-response: 4 | prefix: "in debug processor" 5 | - debug-response: 6 | prefix: "in second debug processor" 7 | - debug-response: 8 | -------------------------------------------------------------------------------- /lib/remote-processors/remote_processors/test/resources/configs/malformed/pipeline_not_a_map.yml: -------------------------------------------------------------------------------- 1 | - debug 2 | -------------------------------------------------------------------------------- /lib/remote-processors/remote_processors/test/resources/configs/malformed/pipeline_with_many_keys.yml: -------------------------------------------------------------------------------- 1 | - debug: 2 | processors: 3 | - debug-response: 4 | prefix: "in debug processor" 5 | - debug-response: 6 | prefix: "in second debug processor" 7 | - debug-response: 8 | another-pipeline: 9 | processors: 10 | - debug-response: 11 | -------------------------------------------------------------------------------- /lib/remote-processors/remote_processors/test/resources/configs/valid.yml: -------------------------------------------------------------------------------- 1 | - debug: 2 | processors: 3 | - debug-response: 4 | prefix: "in debug processor" 5 | - debug-response: 6 | prefix: "in second debug processor" 7 | - debug-response: 8 | -------------------------------------------------------------------------------- /lib/remote-processors/remote_processors/test/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/remote-processors/remote_processors/test/unit/__init__.py 
-------------------------------------------------------------------------------- /lib/remote-processors/remote_processors/test/unit/processors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/remote-processors/remote_processors/test/unit/processors/__init__.py -------------------------------------------------------------------------------- /lib/remote-processors/remote_processors/test/unit/processors/test_debug.py: -------------------------------------------------------------------------------- 1 | from remote_processors.processors.debug_processor import DebugResponseProcessor 2 | from remote_processors.test.utils import dummy_search_request, dummy_search_response 3 | 4 | 5 | class TestDebugProcessor: 6 | def test_debug_processor_does_not_modify_search_response(self): 7 | req = dummy_search_request() 8 | res = dummy_search_response() 9 | debug_processor = DebugResponseProcessor.from_config({}) 10 | processed = debug_processor.process_response(req, res) 11 | assert processed == res 12 | -------------------------------------------------------------------------------- /lib/remote-processors/remote_processors/test/unit/processors/test_library.py: -------------------------------------------------------------------------------- 1 | from remote_processors.server.processor_registry import ProcessorRegistry 2 | 3 | 4 | class TestProcessorLibrary: 5 | def test_that_processor_names_are_unique(self): 6 | """ 7 | The ProcessorRegistry performs this validation 8 | on its own at construction time so just use it here 9 | """ 10 | ProcessorRegistry() 11 | -------------------------------------------------------------------------------- /lib/remote-processors/remote_processors/test/unit/service/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/remote-processors/remote_processors/test/unit/service/__init__.py -------------------------------------------------------------------------------- /lib/remote-processors/remote_processors/test/unit/test_base.py: -------------------------------------------------------------------------------- 1 | from remote_processors.test.utils import dummy_search_request, dummy_search_response 2 | 3 | 4 | class TestBase: 5 | def test_that_testing_works(self): 6 | assert True 7 | 8 | def test_that_utils_work(self): 9 | dummy_search_response() 10 | dummy_search_request() 11 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/README.md: -------------------------------------------------------------------------------- 1 | # Execution 2 | 3 | Execution is the layer taking care of execution plans, we inherit the similar 4 | concept from database. The core of each operator is the `execute` method which 5 | triggers the physical layer execution. The physical layer is currently built 6 | on top of ray Dataset. Dataset itself has an Execution layer, it focuses more 7 | on optimization like fusing pipelining tasks to avoid ray scheduling overhead. 8 | 9 | This has advantages in couple dimensions: 10 | 11 | 1. Show a clear lineage of execution 12 | 2. Lazy execution gives opportunities for preprocessing execution plans, e.g. 13 | adjusting partition size, modifying ray remote parameters to make it easier 14 | to fuse. 15 | 16 | Execution are basically categorized into scans, transforms and writes. 17 | 18 | ## Scans 19 | Taking care of reading data from different data sources. 20 | 21 | ## Transforms 22 | 23 | ## Writes 24 | 25 | ## Kernels 26 | Kernels are low-level primitives which are executed as task/actor in worker 27 | nodes. We extract these blocks out from operators just for potential easier 28 | reuse purpose. 
29 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/__init__.py: -------------------------------------------------------------------------------- 1 | from sycamore.context import init, shutdown, Context, ExecMode 2 | from sycamore.docset import DocSet 3 | from sycamore.executor import Execution 4 | from sycamore.materialize_config import MaterializeSourceMode 5 | 6 | EXEC_RAY = ExecMode.RAY 7 | EXEC_LOCAL = ExecMode.LOCAL 8 | MATERIALIZE_RECOMPUTE = MaterializeSourceMode.RECOMPUTE 9 | MATERIALIZE_USE_STORED = MaterializeSourceMode.USE_STORED 10 | 11 | __all__ = [ 12 | "DocSet", 13 | "init", 14 | "shutdown", 15 | "Context", 16 | "Execution", 17 | "ExecMode", 18 | "MaterializeSourceMode", 19 | ] 20 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/connectors/doc_reconstruct.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | 3 | from sycamore.data import Document 4 | 5 | 6 | class DocumentReconstructor: 7 | def __init__(self, index_name: str, reconstruct_fn: Callable[[str, str], Document]): 8 | self.index_name = index_name 9 | self.reconstruct_fn = reconstruct_fn 10 | 11 | def get_required_source_fields(self) -> list[str]: 12 | return ["parent_id"] 13 | 14 | def get_doc_id(self, data: dict) -> str: 15 | return data["_source"]["parent_id"] or data["_id"] 16 | 17 | def reconstruct(self, data: dict) -> Document: 18 | return self.reconstruct_fn(self.index_name, self.get_doc_id(data)) 19 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/connectors/duckdb/__init__.py: -------------------------------------------------------------------------------- 1 | from sycamore.connectors.duckdb.duckdb_reader import DuckDBReader, DuckDBReaderClientParams, DuckDBReaderQueryParams 2 | from sycamore.connectors.duckdb.duckdb_writer import ( 3 | 
DuckDBWriter, 4 | DuckDBWriterClientParams, 5 | DuckDBWriterTargetParams, 6 | ) 7 | 8 | __all__ = [ 9 | "DuckDBWriter", 10 | "DuckDBWriterClientParams", 11 | "DuckDBWriterTargetParams", 12 | "DuckDBReader", 13 | "DuckDBReaderClientParams", 14 | "DuckDBReaderQueryParams", 15 | ] 16 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/connectors/elasticsearch/__init__.py: -------------------------------------------------------------------------------- 1 | from sycamore.connectors.elasticsearch.elasticsearch_writer import ( 2 | ElasticsearchWriterClient, 3 | ElasticsearchDocumentWriter, 4 | ElasticsearchWriterClientParams, 5 | ElasticsearchWriterTargetParams, 6 | ) 7 | from sycamore.connectors.elasticsearch.elasticsearch_reader import ( 8 | ElasticsearchReaderClient, 9 | ElasticsearchReader, 10 | ElasticsearchReaderClientParams, 11 | ElasticsearchReaderQueryParams, 12 | ) 13 | 14 | __all__ = [ 15 | "ElasticsearchWriterClient", 16 | "ElasticsearchDocumentWriter", 17 | "ElasticsearchWriterClientParams", 18 | "ElasticsearchWriterTargetParams", 19 | "ElasticsearchReaderClient", 20 | "ElasticsearchReader", 21 | "ElasticsearchReaderClientParams", 22 | "ElasticsearchReaderQueryParams", 23 | ] 24 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/connectors/file/__init__.py: -------------------------------------------------------------------------------- 1 | from sycamore.connectors.file.file_scan import BinaryScan, FileScan, JsonScan, JsonDocumentScan 2 | from sycamore.connectors.file.materialized_scan import ArrowScan, DocScan, MaterializedScan, PandasScan 3 | from sycamore.connectors.file.file_writer import FileWriter 4 | 5 | __all__ = [ 6 | "ArrowScan", 7 | "BinaryScan", 8 | "DocScan", 9 | "FileScan", 10 | "JsonScan", 11 | "JsonDocumentScan", 12 | "MaterializedScan", 13 | "PandasScan", 14 | "FileWriter", 15 | ] 16 | 
-------------------------------------------------------------------------------- /lib/sycamore/sycamore/connectors/neo4j/__init__.py: -------------------------------------------------------------------------------- 1 | from sycamore.connectors.neo4j.neo4j_writer import ( 2 | Neo4jWriterClientParams, 3 | Neo4jWriterTargetParams, 4 | Neo4jValidateParams, 5 | Neo4jPrepareCSV, 6 | Neo4jWriteCSV, 7 | Neo4jLoadCSV, 8 | ) 9 | 10 | __all__ = [ 11 | "Neo4jWriterClientParams", 12 | "Neo4jWriterTargetParams", 13 | "Neo4jValidateParams", 14 | "Neo4jPrepareCSV", 15 | "Neo4jWriteCSV", 16 | "Neo4jLoadCSV", 17 | ] 18 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/connectors/opensearch/__init__.py: -------------------------------------------------------------------------------- 1 | from sycamore.connectors.opensearch.opensearch_reader import ( 2 | OpenSearchReader, 3 | OpenSearchReaderClientParams, 4 | OpenSearchReaderQueryParams, 5 | ) 6 | from sycamore.connectors.opensearch.opensearch_writer import ( 7 | OpenSearchWriter, 8 | OpenSearchWriterClientParams, 9 | OpenSearchWriterTargetParams, 10 | OpenSearchWriterRecord, 11 | OpenSearchWriterClient, 12 | ) 13 | 14 | __all__ = [ 15 | "OpenSearchReader", 16 | "OpenSearchReaderClientParams", 17 | "OpenSearchReaderQueryParams", 18 | "OpenSearchWriter", 19 | "OpenSearchWriterClientParams", 20 | "OpenSearchWriterTargetParams", 21 | "OpenSearchWriterRecord", 22 | "OpenSearchWriterClient", 23 | ] 24 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/connectors/pinecone/__init__.py: -------------------------------------------------------------------------------- 1 | from sycamore.connectors.pinecone.pinecone_writer import ( 2 | PineconeWriter, 3 | PineconeWriterClientParams, 4 | PineconeWriterTargetParams, 5 | PineconeWriterClient, 6 | ) 7 | from sycamore.connectors.pinecone.pinecone_reader import ( 8 | PineconeReader, 9 | 
PineconeReaderClientParams, 10 | PineconeReaderQueryParams, 11 | PineconeReaderQueryResponse, 12 | ) 13 | 14 | __all__ = [ 15 | "PineconeWriter", 16 | "PineconeWriterClientParams", 17 | "PineconeWriterTargetParams", 18 | "PineconeWriterClient", 19 | "PineconeReader", 20 | "PineconeReaderClientParams", 21 | "PineconeReaderQueryParams", 22 | "PineconeReaderQueryResponse", 23 | ] 24 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/connectors/qdrant/__init__.py: -------------------------------------------------------------------------------- 1 | from sycamore.connectors.qdrant.qdrant_writer import ( 2 | QdrantWriter, 3 | QdrantWriterClientParams, 4 | QdrantWriterTargetParams, 5 | QdrantWriterClient, 6 | ) 7 | from sycamore.connectors.qdrant.qdrant_reader import ( 8 | QdrantReader, 9 | QdrantReaderClientParams, 10 | QdrantReaderQueryParams, 11 | QdrantReaderQueryResponse, 12 | ) 13 | 14 | __all__ = [ 15 | "QdrantWriter", 16 | "QdrantWriterClientParams", 17 | "QdrantWriterTargetParams", 18 | "QdrantWriterClient", 19 | "QdrantReader", 20 | "QdrantReaderClientParams", 21 | "QdrantReaderQueryParams", 22 | "QdrantReaderQueryResponse", 23 | ] 24 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/connectors/weaviate/__init__.py: -------------------------------------------------------------------------------- 1 | from sycamore.connectors.weaviate.weaviate_writer import ( 2 | WeaviateDocumentWriter, 3 | WeaviateCrossReferenceWriter, 4 | WeaviateClientParams, 5 | WeaviateWriterTargetParams, 6 | ) 7 | from sycamore.connectors.weaviate.weaviate_reader import ( 8 | WeaviateReader, 9 | WeaviateReaderQueryParams, 10 | WeaviateReaderClientParams, 11 | ) 12 | 13 | __all__ = [ 14 | "WeaviateDocumentWriter", 15 | "WeaviateCrossReferenceWriter", 16 | "WeaviateClientParams", 17 | "WeaviateWriterTargetParams", 18 | "WeaviateReader", 19 | "WeaviateReaderQueryParams", 20 | 
"WeaviateReaderClientParams", 21 | ] 22 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/data/__init__.py: -------------------------------------------------------------------------------- 1 | from sycamore.data.bbox import BoundingBox 2 | from sycamore.data.table import Table, TableCell 3 | from sycamore.data.element import Element, ImageElement, TableElement 4 | from sycamore.data.document import ( 5 | Document, 6 | MetadataDocument, 7 | HierarchicalDocument, 8 | OpenSearchQuery, 9 | OpenSearchQueryResult, 10 | ) 11 | from sycamore.data.docid import ( 12 | docid_nanoid_chars, 13 | docid_to_uuid, 14 | mkdocid, 15 | nanoid36, 16 | uuid_to_docid, 17 | ) 18 | 19 | 20 | __all__ = [ 21 | "BoundingBox", 22 | "Document", 23 | "MetadataDocument", 24 | "HierarchicalDocument", 25 | "Element", 26 | "ImageElement", 27 | "TableElement", 28 | "OpenSearchQuery", 29 | "OpenSearchQueryResult", 30 | "Table", 31 | "TableCell", 32 | "docid_nanoid_chars", 33 | "docid_to_uuid", 34 | "mkdocid", 35 | "nanoid36", 36 | "uuid_to_docid", 37 | ] 38 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/data/metadata.py: -------------------------------------------------------------------------------- 1 | from sycamore.data import MetadataDocument 2 | from sycamore.utils.thread_local import ThreadLocalAccess, ADD_METADATA_TO_OUTPUT 3 | 4 | 5 | def add_metadata(**metadata): 6 | ThreadLocalAccess(ADD_METADATA_TO_OUTPUT).get().append(MetadataDocument(**metadata)) 7 | 8 | 9 | # At some point we should define particular forms of metadata like metrics 10 | # Maybe following https://github.com/prometheus/OpenMetrics/blob/main/specification/OpenMetrics.md 11 | # as a structure for the metrics -- too complex for now. 
12 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/decorators.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | 4 | def experimental(cls): 5 | """ 6 | Decorator to mark a class as experimental. 7 | """ 8 | 9 | def wrapper(*args, **kwargs): 10 | warnings.warn( 11 | f"Class {cls.__name__} is experimental and may change in the future.", FutureWarning, stacklevel=2 12 | ) 13 | return cls(*args, **kwargs) 14 | 15 | return wrapper 16 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from sycamore.evaluation.data import EvaluationDataPoint, EvaluationMetric 2 | 3 | __all__ = ["EvaluationDataPoint", "EvaluationMetric"] 4 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/evaluation/datasets.py: -------------------------------------------------------------------------------- 1 | import typing 2 | from typing import Union, Callable, Any 3 | 4 | from ray.data import Dataset, from_huggingface 5 | 6 | from sycamore.evaluation import EvaluationDataPoint 7 | from sycamore import DocSet, Context 8 | from sycamore.connectors.file import MaterializedScan 9 | 10 | if typing.TYPE_CHECKING: 11 | from datasets import IterableDataset 12 | 13 | 14 | class HuggingFaceScan(MaterializedScan): 15 | def __init__( 16 | self, 17 | dataset: Union[Dataset, "IterableDataset"], 18 | doc_extractor: Callable[[dict[str, Any]], dict[str, EvaluationDataPoint]], 19 | **resource_args 20 | ): 21 | super().__init__(**resource_args) 22 | self._dataset = dataset 23 | self._doc_extractor = doc_extractor 24 | 25 | def execute(self, **kwargs) -> Dataset: 26 | ray_ds = from_huggingface(self._dataset) 27 | processed = ray_ds.map(self._doc_extractor) 28 | return processed 29 | 30 | 
def format(self): 31 | return "huggingface" 32 | 33 | 34 | class EvaluationDataSetReader: 35 | def __init__(self, context: Context) -> None: 36 | super().__init__() 37 | self._context = context 38 | 39 | def huggingface( 40 | self, 41 | dataset: Union[Dataset, "IterableDataset"], 42 | doc_extractor: Callable[[dict[str, Any]], dict[str, EvaluationDataPoint]], 43 | **resource_args 44 | ) -> DocSet: 45 | json_scan = HuggingFaceScan(dataset=dataset, doc_extractor=doc_extractor, **resource_args) 46 | return DocSet(self._context, json_scan) 47 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/evaluation/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | from sycamore.evaluation.metrics.generated_answer import rouge_metrics 2 | from sycamore.evaluation.metrics.retrieval import document_retrieval_metrics 3 | 4 | __all__ = ["document_retrieval_metrics", "rouge_metrics"] 5 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/evaluation/metrics/generated_answer.py: -------------------------------------------------------------------------------- 1 | from sycamore.evaluation import EvaluationDataPoint, EvaluationMetric 2 | from sycamore.utils.import_utils import requires_modules 3 | 4 | 5 | class RougeMetrics(EvaluationMetric): 6 | @requires_modules("rouge", extra="eval") 7 | def __init__(self, rouge_metrics_types=None) -> None: 8 | from rouge import Rouge 9 | 10 | super().__init__() 11 | if rouge_metrics_types is None: 12 | rouge_metrics_types = ["rouge-1", "rouge-2", "rouge-l"] 13 | self._rouge_evaluator = Rouge(metrics=rouge_metrics_types) 14 | 15 | def metric_name(self) -> str: 16 | return "GeneratedAnswerMetrics" 17 | 18 | def evaluate(self, datapoint: EvaluationDataPoint) -> dict[str, str]: 19 | scores = self._rouge_evaluator.get_scores(datapoint.generated_answer, datapoint.ground_truth_answer)[0] 20 | result = 
{ 21 | "rouge-1": scores["rouge-1"]["f"], 22 | "rouge-2": scores["rouge-2"]["f"], 23 | "rouge-l": scores["rouge-l"]["f"], 24 | } 25 | return result 26 | 27 | 28 | rouge_metrics = RougeMetrics() 29 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/functions/__init__.py: -------------------------------------------------------------------------------- 1 | from sycamore.functions.elements import reorder_elements, filter_elements 2 | from sycamore.functions.chunker import Chunker, TextOverlapChunker 3 | from sycamore.functions.document import split_and_convert_to_image, DrawBoxes 4 | from sycamore.functions.tokenizer import Tokenizer, CharacterTokenizer, HuggingFaceTokenizer, OpenAITokenizer 5 | 6 | __all__ = [ 7 | "reorder_elements", 8 | "filter_elements", 9 | "Chunker", 10 | "TextOverlapChunker", 11 | "split_and_convert_to_image", 12 | "DrawBoxes", 13 | "Tokenizer", 14 | "CharacterTokenizer", 15 | "HuggingFaceTokenizer", 16 | "OpenAITokenizer", 17 | ] 18 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/functions/elements.py: -------------------------------------------------------------------------------- 1 | import functools 2 | from typing import Any, Callable, Optional 3 | 4 | from sycamore.data import Document, Element 5 | 6 | 7 | def reorder_elements( 8 | document: Document, 9 | *, 10 | comparator: Optional[Callable[[Element, Element], int]] = None, 11 | key: Optional[Callable[[Element], Any]] = None, 12 | ) -> Document: 13 | """Reorders the elements. Must supply comparator or key. 
14 | 15 | Args: 16 | document: Document for which the elements need to be re-ordered 17 | comparator: A comparator function 18 | key: A key as per sorted() 19 | 20 | Returns: 21 | Document with elements re-ordered 22 | """ 23 | if key: 24 | assert not comparator, "passed both comparator and key" 25 | else: 26 | assert comparator, "passed neither comparator nor key" 27 | key = functools.cmp_to_key(comparator) 28 | elements = document.elements 29 | elements.sort(key=key) 30 | document.elements = elements 31 | return document 32 | 33 | 34 | def filter_elements(document: Document, filter_function: Callable[[Element], bool]) -> list[Element]: 35 | """Filters the elements. 36 | 37 | Args: 38 | document: Document for which the elements need to be filtered 39 | filter_function: A filter function 40 | 41 | Returns: 42 | List of filtered elements 43 | """ 44 | elements = document.elements 45 | return list(filter(filter_function, elements)) 46 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/llms/prompts/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: F401 2 | 3 | from sycamore.llms.prompts import default_prompts 4 | 5 | from sycamore.llms.prompts.default_prompts import ( 6 | SimplePrompt, 7 | EntityExtractorZeroShotJinjaPrompt, 8 | EntityExtractorFewShotJinjaPrompt, 9 | TextSummarizerGuidancePrompt, 10 | SchemaZeroShotJinjaPrompt, 11 | PropertiesZeroShotGuidancePrompt, 12 | TaskIdentifierZeroShotGuidancePrompt, 13 | GraphEntityExtractorPrompt, 14 | GraphRelationshipExtractorPrompt, 15 | ExtractTablePropertiesPrompt, 16 | ) 17 | from sycamore.llms.prompts.default_prompts import _deprecated_prompts 18 | from sycamore.llms.prompts.prompts import ( 19 | RenderedPrompt, 20 | RenderedMessage, 21 | SycamorePrompt, 22 | ElementListPrompt, 23 | ElementPrompt, 24 | StaticPrompt, 25 | ) 26 | 27 | prompts = [ 28 | "SimplePrompt", 29 | 
"EntityExtractorZeroShotJinjaPrompt", 30 | "EntityExtractorFewShotJinjaPrompt", 31 | "TextSummarizerGuidancePrompt", 32 | "SchemaZeroShotJinjaPrompt", 33 | "PropertiesZeroShotGuidancePrompt", 34 | "GraphEntityExtractorPrompt", 35 | "GraphRelationshipExtractorPrompt", 36 | "ExtractTablePropertiesPrompt", 37 | ] + list(_deprecated_prompts.keys()) 38 | 39 | _all = prompts + [ 40 | "RenderedPrompt", 41 | "RenderedMessage", 42 | "SycamorePrompt", 43 | "ElementListPrompt", 44 | "ElementPrompt", 45 | "StaticPrompt", 46 | ] 47 | 48 | __all__ = _all 49 | 50 | 51 | def __getattr__(name): 52 | if name in _deprecated_prompts: 53 | return getattr(default_prompts, name) 54 | raise AttributeError(f"module {__name__!r} has no attribute {name!r}") 55 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/query/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/query/__init__.py -------------------------------------------------------------------------------- /lib/sycamore/sycamore/query/execution/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/query/execution/__init__.py -------------------------------------------------------------------------------- /lib/sycamore/sycamore/query/operators/clustering.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from pydantic import Field 4 | 5 | from sycamore.query.logical_plan import Node 6 | 7 | 8 | class KMeanClustering(Node): 9 | """Group documents based on a particular field. 
10 | 11 | Returns a database with ONLY 2 FIELDS: "properties.key" (which corresponds to unique values of 12 | *field*) and "properties.count" (which contains the counts corresponding to unique values 13 | of *field*). 14 | """ 15 | 16 | field: Optional[str] = None 17 | """The database field to find the top K occurences for.""" 18 | 19 | new_field: str = "centroids" 20 | """The centroid field used for clustering""" 21 | 22 | K: Optional[int] = None 23 | """The number of groups.""" 24 | 25 | 26 | class LLMClustering(Node): 27 | """Group documents based on a particular field. 28 | 29 | Returns a database with ONLY 2 FIELDS: "properties.key" (which corresponds to unique values of 30 | *field*) and "properties.count" (which contains the counts corresponding to unique values 31 | of *field*). 32 | """ 33 | 34 | field: str 35 | """The database field to find the top K occurences for.""" 36 | 37 | new_field: str = "_autogen_ClusterAssignment" 38 | """The field for cluster or group assignment""" 39 | 40 | llm_group_instruction: Optional[str] = Field(default=None, json_schema_extra={"exclude_from_comparison": True}) 41 | """An instruction of what the groups should be about E.g. if the 42 | purpose of this operation is to find the top 2 most frequent cities, llm_cluster_instruction 43 | could be 'Form groups of different cities'""" 44 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/query/operators/count.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from sycamore.query.logical_plan import Node 4 | 5 | 6 | class Count(Node): 7 | """Returns a count of the number of database records provided as input. Optionally supports 8 | a distinct_field parameter to count the number of distinct values of a given field. 
For example, 9 | if distinct_field is 'incident_id', the count will return the number of unique incident_id values 10 | in the input database records. Otherwise, the count will return the total number of input records. 11 | 12 | Note that you almost always want to use distinct_field, unless you are certain that each 13 | of the input records represents a unique entity that you wish to count. 14 | 15 | Returns a number. 16 | """ 17 | 18 | distinct_field: Optional[str] = None 19 | """If specified, returns the count of distinct values of this field in the input. 20 | If unspecified, returns the count of all input records. 21 | """ 22 | 23 | @property 24 | def output_type(self) -> type: 25 | return int 26 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/query/operators/field_in.py: -------------------------------------------------------------------------------- 1 | from sycamore.query.logical_plan import Node 2 | 3 | 4 | class FieldIn(Node): 5 | """Joins two databases based on a particular field. 6 | 7 | Values of *field_one* from 8 | database 1 are used to filter records of database 2 based on values of *field_two* 9 | in database 2. For example, consider that database 1 is {"properties.key": 10 | ['Cruise Ship', 'Sailboat'], "properties.count": [3, 2]} and database 2 is 11 | {"properties.entity.shipType": ['Jet ski', 'Canoe', 'Submarine', 'Cruise Ship'], 12 | "properties.entity.country": ['Australia', 'Japan', 'United States', 'Mexico'], 13 | "properties.entity.city": ['Sydney', 'Kyoto', 'San Francisco', 'Cabo']}. A join 14 | operation with *inputs* containing ids of operations that return database 1 and 15 | database 2, respectively, *field_one* being "properties.key", and *field_two* being 16 | "properties.entity.shipType", would return the database {"properties.entity.shipType": 17 | ['Cruise Ship'], "properties.entity.country": ['Mexico'], "properties.entity.city": 18 | ['Cabo']}. 
19 | 20 | Returns a database with fields identical to those in database 2. 21 | """ 22 | 23 | field_one: str 24 | """The field name in the first database to join on.""" 25 | 26 | field_two: str 27 | """The field name in the second database to join on.""" 28 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/query/operators/groupby.py: -------------------------------------------------------------------------------- 1 | from sycamore.query.logical_plan import Node 2 | 3 | 4 | class GroupBy(Node): 5 | """Group documents based on a particular field. 6 | 7 | Returns a database with ONLY 2 FIELDS: "properties.key" (which corresponds to unique values of 8 | *field*) and "properties.count" (which contains the counts corresponding to unique values 9 | of *field*). 10 | """ 11 | 12 | field: str = "properties._autogen_ClusterAssignment" 13 | """The centroid field used for clustering""" 14 | 15 | 16 | class AggregateCount(Node): 17 | """Group documents based on a particular field. 18 | 19 | Returns a database with ONLY 2 FIELDS: "properties.key" (which corresponds to unique values of 20 | *field*) and "properties.count" (which contains the counts corresponding to unique values 21 | of *field*). 22 | """ 23 | 24 | 25 | class AggregateCollect(Node): 26 | """Group documents based on a particular field. 27 | 28 | Returns a database with ONLY 2 FIELDS: "properties.key" (which corresponds to unique values of 29 | *field*) and "properties.count" (which contains the counts corresponding to unique values 30 | of *field*). 31 | """ 32 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/query/operators/limit.py: -------------------------------------------------------------------------------- 1 | from sycamore.query.logical_plan import Node 2 | 3 | 4 | class Limit(Node): 5 | """Limits a database to the first num_records records. 6 | 7 | Returns a database. 
8 | """ 9 | 10 | num_records: int 11 | """The number of records of the database to return.""" 12 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/query/operators/llm_extract_entity.py: -------------------------------------------------------------------------------- 1 | from pydantic import Field 2 | 3 | from sycamore.query.logical_plan import Node 4 | 5 | 6 | class LlmExtractEntity(Node): 7 | """Adds a new field to the input database based on extracting information from an 8 | existing field. 9 | 10 | Returns a database. 11 | """ 12 | 13 | question: str = Field(..., json_schema_extra={"exclude_from_comparison": True}) 14 | """The prompt to the LLM for creating the new field. Be descriptive with the question and 15 | include examples if possible.""" 16 | 17 | field: str 18 | """The name of the existing field for the LLM to use.""" 19 | 20 | new_field: str = Field(..., json_schema_extra={"exclude_from_comparison": True}) 21 | """The name of the new field to add.""" 22 | 23 | new_field_type: str 24 | """The type of the new field, e.g. int or string.""" 25 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/query/operators/math.py: -------------------------------------------------------------------------------- 1 | from pydantic import Field 2 | 3 | from sycamore.query.logical_plan import Node 4 | 5 | 6 | class Math(Node): 7 | """ 8 | Performs an arithmetic operation on two input numbers. 9 | 10 | Returns a number. 11 | """ 12 | 13 | operation: str = Field(pattern="^add$|^subtract$|^multiply$|^divide$") 14 | """The arithmetic operation to perform on the inputs. 
Options are "add", "subtract", 15 | "multiply", or "divide".""" 16 | 17 | @property 18 | def input_types(self) -> set[type]: 19 | return {int, float} 20 | 21 | @property 22 | def output_type(self) -> type: 23 | return float # note: this can be an integer too, we're just using a compatible type here 24 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/query/operators/sort.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from sycamore.query.logical_plan import Node 4 | 5 | 6 | class Sort(Node): 7 | """Sorts a database based on the value of a field. 8 | 9 | Returns a database. 10 | """ 11 | 12 | descending: bool = False 13 | """Determines whether to sort in descending order (greatest value first).""" 14 | 15 | field: str 16 | """The name of the database field to sort based on.""" 17 | 18 | default_value: Any = None 19 | """The default value used when sorting if a document is missing the specified field.""" 20 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/query/operators/summarize_data.py: -------------------------------------------------------------------------------- 1 | from pydantic import Field 2 | 3 | from sycamore import DocSet 4 | from sycamore.query.logical_plan import Node 5 | 6 | 7 | class SummarizeData(Node): 8 | """ 9 | This operation generates an English response to a user query based on the input data provided. 10 | 11 | The response should be in Markdown format. It can contain links, tables, and other 12 | Markdown elements. 13 | 14 | Whenever possible, provide links to relevant data sources and documents. 
15 | """ 16 | 17 | question: str = Field(..., json_schema_extra={"exclude_from_comparison": True}) 18 | """The question to ask the LLM.""" 19 | 20 | @property 21 | def input_types(self) -> set[type]: 22 | return {DocSet, float, int, str} 23 | 24 | @property 25 | def output_type(self) -> type: 26 | return str 27 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/query/operators/unroll.py: -------------------------------------------------------------------------------- 1 | from sycamore.query.logical_plan import Node 2 | 3 | 4 | class Unroll(Node): 5 | """Unroll based on a particular field.""" 6 | 7 | field: str 8 | """The field to be unrolled""" 9 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/rules/__init__.py: -------------------------------------------------------------------------------- 1 | from sycamore.rules.optimize_resource_args import Rule, EnforceResourceUsage, OptimizeResourceArgs 2 | 3 | __all__ = ["Rule", "EnforceResourceUsage", "OptimizeResourceArgs"] 4 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/rules/optimize_resource_args.py: -------------------------------------------------------------------------------- 1 | from sycamore.plan_nodes import Node, NonCPUUser, NonGPUUser, SingleThreadUser 2 | 3 | 4 | class Rule: 5 | def __call__(self, plan: Node) -> Node: 6 | raise NotImplementedError 7 | 8 | 9 | class EnforceResourceUsage(Rule): 10 | def __call__(self, plan: Node) -> Node: 11 | if isinstance(plan, NonCPUUser): 12 | plan.resource_args["num_cpus"] = 0 13 | 14 | if isinstance(plan, SingleThreadUser) and "num_cpus" not in plan.resource_args: 15 | plan.resource_args["num_cpus"] = 1 16 | 17 | if isinstance(plan, NonGPUUser): 18 | assert "num_gpus" not in plan.resource_args 19 | 20 | return plan 21 | 22 | 23 | class OptimizeResourceArgs(Rule): 24 | def __call__(self, plan: Node) -> 
class SchemaField(BaseModel):
    """Represents a field in a DocSet schema."""

    name: str
    """The name of the field."""

    field_type: str
    """The type of the field."""

    default: Optional[Any] = None
    """The default value for the field."""

    description: Optional[str] = None
    """A natural language description of the field."""

    examples: Optional[list[Any]] = None
    """A list of example values for the field."""


class Schema(BaseModel):
    """Represents the schema of a DocSet."""

    fields: list[SchemaField]
    """A list of fields belonging to this schema."""
@pytest.fixture
def read_local_binary(request) -> Document:
    """Build a Document whose binary_representation is the raw bytes of the file
    at ``request.param``; the original path is recorded in properties["path"].
    """
    local = LocalFileSystem()
    path = str(request.param)
    # Close the input stream deterministically instead of leaking it until GC.
    with local.open_input_stream(path) as input_stream:
        data = input_stream.readall()
    document = Document()
    document.binary_representation = data
    document.properties["path"] = path
    return document
@pytest.fixture(params=[mode for mode in ExecMode if mode != ExecMode.UNKNOWN])
def exec_mode(request):
    """
    Use this to run a test against all available execution modes. You will need to pass this as a parameter to
    the Context initialization. e.g.

    Example:
    .. code-block:: python

        def test_example(exec_mode):
            context = sycamore.init(exec_mode=exec_mode)
            ...
    """
    return request.param
def compare_connector_docs(
    gt_docs: list[Document], returned_docs: list[Document], parent_offset: int = 0, doc_reconstruct: bool = False
):
    """Assert that the documents returned by a connector match a ground-truth set.

    Args:
        gt_docs: ground-truth documents. NOTE: mutated in place unless
            doc_reconstruct is True (their SOURCE property is stamped below).
        returned_docs: documents actually returned by the connector.
        parent_offset: number of ground-truth documents the connector is
            allowed to omit; every omitted one must be a parent document.
        doc_reconstruct: skip stamping ground-truth docs as DB_QUERY-sourced —
            presumably because reconstructed docs keep their original SOURCE;
            confirm against callers.
    """
    assert len(gt_docs) == (len(returned_docs) + parent_offset)

    if not doc_reconstruct:
        for doc in gt_docs:
            doc.properties[DocumentPropertyTypes.SOURCE] = DocumentSource.DB_QUERY

    gt_dict = {doc.doc_id: doc for doc in gt_docs}
    returned_dict = {doc.doc_id: doc for doc in returned_docs}

    # Find any unmatched doc_ids
    gt_ids = set(gt_dict.keys())
    returned_ids = set(returned_dict.keys())
    missing_from_returned = gt_ids - returned_ids
    extra_in_returned = returned_ids - gt_ids
    # The connector must never return documents that are not in the ground truth.
    assert len(extra_in_returned) == 0
    # Compare all matched documents
    assert all(compare_docs(gt_dict[doc_id], returned_dict[doc_id]) for doc_id in gt_ids.intersection(returned_ids))
    if missing_from_returned:
        # Only as many documents as parent_offset may be missing, and each one
        # must be a parent (i.e. has no parent_id of its own).
        assert len(missing_from_returned) == parent_offset
        for missing_doc_id in missing_from_returned:
            assert (gt_doc := gt_dict.get(missing_doc_id)) and not gt_doc.parent_id  # is a parent document
/lib/sycamore/sycamore/tests/integration/connectors/file/test_file_writer.py: -------------------------------------------------------------------------------- 1 | import sycamore 2 | from sycamore.tests.unit.connectors.file.test_file_writer import impl_test_json_bytes_with_bbox_image 3 | 4 | 5 | def test_json_bytes_with_bbox_image(): 6 | impl_test_json_bytes_with_bbox_image(sycamore.EXEC_RAY) 7 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/integration/evaluation/test_evaluate.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from sycamore.evaluation.evaluate import QualityAssessment, Evaluate 3 | import sycamore 4 | 5 | 6 | class TestTransformEvaluate: 7 | INDEX = "" 8 | 9 | OS_CLIENT_ARGS = { 10 | "hosts": [{"host": "localhost", "port": 9200}], 11 | "http_compress": True, 12 | "http_auth": ("admin", "admin"), 13 | "use_ssl": False, 14 | "verify_certs": False, 15 | "ssl_assert_hostname": False, 16 | "ssl_show_warn": False, 17 | "timeout": 120, 18 | } 19 | 20 | OS_CONFIG = { 21 | "size": 10, 22 | "neural_search_k": 100, 23 | "embedding_model_id": "SE1lDZABqmytCSGjsh1L", 24 | "search_pipeline": "hybrid_rag_pipeline", 25 | "llm": "gpt-4-turbo", 26 | "context_window": "5", 27 | } 28 | 29 | @pytest.mark.skip(reason="Requires named models to configure os pipeline unless we setup the cluster on each run") 30 | def test_pipeline(self): 31 | context = sycamore.init() 32 | custom_question_augmentation = "{}, The product code is {}." 
def test_split_and_convert_to_image_empty_page():
    """Dropping every element on page 2 must not drop the page itself from the
    per-page output: page 2 should still appear exactly once, with no elements."""

    def _strip_page2(element: Element) -> Optional[Element]:
        # Discard elements that live on page 2; keep everything else.
        return None if element.properties["page_number"] == 2 else element

    pdf_path = TEST_DIR / "resources/data/pdfs/Ray.pdf"

    context = sycamore.init()

    docs = (
        context.read.binary(paths=[str(pdf_path)], binary_format="pdf")
        .partition(partitioner=UnstructuredPdfPartitioner())
        .map_elements(_strip_page2)
        .flat_map(split_and_convert_to_image)
        .take_all()
    )

    # One image-document per page of Ray.pdf.
    assert len(docs) == 17

    page2_docs = [d for d in docs if d.properties["page_number"] == 2]

    assert len(page2_docs) == 1
    assert len(page2_docs[0].elements) == 0
class TestPrepare(unit.TestPrepare):
    """Re-runs the unit-level TestPrepare suite under the Ray execution mode.

    NOTE(review): other integration tests in this tree switch exec mode via a
    class attribute rather than an __init__ override; presumably both styles
    work with the base class -- confirm before unifying.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Overrides whatever exec mode the unit-test base class configured.
        self.exec_mode = ExecMode.RAY
def test_llm_filter_ntsb_temp_q():
    """Exercise llm_filter on materialized NTSB documents: forking the same
    prompt with opposite temperature questions should keep 0 and 1 documents
    respectively."""
    ctx = sycamore.init()
    # Reads pre-materialized documents so the test does not re-run partitioning.
    ds = ctx.read.materialize(TEST_DIR / "resources/data/materialize/llmfilter-ntsb-temp")
    llm = OpenAI(OpenAIModels.GPT_4_1_MINI)
    max_tokens = 128_000
    # NOTE(review): tokenizer is configured for "gpt-4o" while the LLM is
    # GPT_4_1_MINI -- confirm the token accounting is intentionally approximate.
    openai_tokenizer = OpenAITokenizer(model_name="gpt-4o", max_tokens=max_tokens)
    prompt = LlmFilterMessagesJinjaPrompt.fork(filter_question="Is the temperature less than 60F?")
    ds_none = ds.llm_filter(
        llm=llm,
        new_field="_autogen_LLMFilterOutput",
        prompt=prompt,
        use_elements=True,
        max_tokens=max_tokens,
        tokenizer=openai_tokenizer,
    )
    # No document in the fixture reports a temperature below 60F.
    assert ds_none.count() == 0

    ds_one = ds.llm_filter(
        llm=llm,
        new_field="_autogen_LLMFilterOutput",
        prompt=prompt.fork(filter_question="Is the temperature greater than 60F?"),
        use_elements=True,
        max_tokens=max_tokens,
        tokenizer=openai_tokenizer,
    )
    # Exactly one document reports a temperature above 60F.
    assert ds_one.count() == 1
/lib/sycamore/sycamore/tests/pytest.ini: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/pytest.ini -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/docx/aryn_website_sample.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/docx/aryn_website_sample.docx -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/imgs/sample-detr-image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/imgs/sample-detr-image.png -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/materialize/json_writer/3fe9913e-60e2-11ef-90e5-e40d36f1e1ae.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/materialize/json_writer/3fe9913e-60e2-11ef-90e5-e40d36f1e1ae.pickle -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/materialize/json_writer/materialize.success: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/materialize/json_writer/materialize.success 
-------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/materialize/json_writer/md-9e6e68ee-ad8e-4e39-a2e1-7ef5befc588c.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/materialize/json_writer/md-9e6e68ee-ad8e-4e39-a2e1-7ef5befc588c.pickle -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/doc-f-rhcfgmzrgifspjxjnl8vhh8.4fd48370db59b408b2700abd89bfe92e43009fde4ec216cfd112cdf17b7dfb35.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/doc-f-rhcfgmzrgifspjxjnl8vhh8.4fd48370db59b408b2700abd89bfe92e43009fde4ec216cfd112cdf17b7dfb35.pickle -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/materialize.clean: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/materialize.clean -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/materialize.success: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/materialize.success 
-------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-07bgl12pc0intnh2y74po4c.a98b82d885005500fb664fb18283cf80864de0fb6116e5acd9c3060dd91086a7.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-07bgl12pc0intnh2y74po4c.a98b82d885005500fb664fb18283cf80864de0fb6116e5acd9c3060dd91086a7.pickle -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-6wi2aqr0b504zojkdqnyybf.97ae3a37a86700c4601f89551892eb79faeaa0c9afe66f26ee20d8b7f996f929.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-6wi2aqr0b504zojkdqnyybf.97ae3a37a86700c4601f89551892eb79faeaa0c9afe66f26ee20d8b7f996f929.pickle -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-75qp993ysz87aa1c1tsc5o0.0db762bad4ffb81bdde0f754087a65594f0cc8001986729f6ded86fa7ca7803e.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-75qp993ysz87aa1c1tsc5o0.0db762bad4ffb81bdde0f754087a65594f0cc8001986729f6ded86fa7ca7803e.pickle -------------------------------------------------------------------------------- 
/lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-89ww3vta189zsw6ac4vsjem.253ac204a7d81cc730b4a90202afaf3f78dbf27c87a17bcb94a23212b159cce6.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-89ww3vta189zsw6ac4vsjem.253ac204a7d81cc730b4a90202afaf3f78dbf27c87a17bcb94a23212b159cce6.pickle -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-a8pg30d1wqbff3zsf6c4l0w.c75fd5cba213cc7888ffe30f52d9fb7b637acd227a728b07ee7e1c50545c884e.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-a8pg30d1wqbff3zsf6c4l0w.c75fd5cba213cc7888ffe30f52d9fb7b637acd227a728b07ee7e1c50545c884e.pickle -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-ax65fkm5dy3cmkxtcfu6dv6.219182e0864996b3bb9855bd027940ff5989729f1e3ae80db8b2574f64eceb89.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-ax65fkm5dy3cmkxtcfu6dv6.219182e0864996b3bb9855bd027940ff5989729f1e3ae80db8b2574f64eceb89.pickle -------------------------------------------------------------------------------- 
/lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-bx7u8xo31r49lnxi7r3thdr.7d4ab219ffa3c8c219fdcef0f384cac80e8e692a5669da4712f9076489aade58.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-bx7u8xo31r49lnxi7r3thdr.7d4ab219ffa3c8c219fdcef0f384cac80e8e692a5669da4712f9076489aade58.pickle -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-g9dk50t6tir9sxomqvxpwhh.6f10891f3f44b8376061256d2a3d56b91945e3082abec53a5e8f1d3394492a37.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-g9dk50t6tir9sxomqvxpwhh.6f10891f3f44b8376061256d2a3d56b91945e3082abec53a5e8f1d3394492a37.pickle -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-k85dhbsu6n4rtp1vd62vkik.ef6d0f2992ef663296ab39f158b0a648f8bb9f4b679cfffc7df262ce49ef61e8.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-k85dhbsu6n4rtp1vd62vkik.ef6d0f2992ef663296ab39f158b0a648f8bb9f4b679cfffc7df262ce49ef61e8.pickle -------------------------------------------------------------------------------- 
/lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-kl9t1to3b6t73hhclt9ke6q.26bb7d98dbcd581fa0873ea58b4cca247691b0aff527886dd69c56c407e4de6e.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-kl9t1to3b6t73hhclt9ke6q.26bb7d98dbcd581fa0873ea58b4cca247691b0aff527886dd69c56c407e4de6e.pickle -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-m3vkixw9pdmcbrfs3g6hhhf.2cb8ce11ce5feed99eacd754f8279868824d3266a59feaf92325a33a305bedc0.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-m3vkixw9pdmcbrfs3g6hhhf.2cb8ce11ce5feed99eacd754f8279868824d3266a59feaf92325a33a305bedc0.pickle -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-ml5ks4t608vrlz00gjk4fum.8d4b49675fa3c7236c71c93dc2fafd1eb95690c74a18d1594f588e953366887a.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-ml5ks4t608vrlz00gjk4fum.8d4b49675fa3c7236c71c93dc2fafd1eb95690c74a18d1594f588e953366887a.pickle -------------------------------------------------------------------------------- 
/lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-n3oawwwjt1hxatavnpjlbjb.7b9fa911fb9108ed58b8f80ce54106f0c4e98c87299a4df6519e0e9ebf142b07.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-n3oawwwjt1hxatavnpjlbjb.7b9fa911fb9108ed58b8f80ce54106f0c4e98c87299a4df6519e0e9ebf142b07.pickle -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-poffmrto89o7t642owz0uqd.d864e9972c347cbc8900c223ff27a67815bf4cff87e812130969e73ae840ced1.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-poffmrto89o7t642owz0uqd.d864e9972c347cbc8900c223ff27a67815bf4cff87e812130969e73ae840ced1.pickle -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-qasv594a8qoyh0pwlmhljih.f3dc469668aade55c348445e4cbdfe9cec1c205e24506ee4cfc1124bad05dabb.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-qasv594a8qoyh0pwlmhljih.f3dc469668aade55c348445e4cbdfe9cec1c205e24506ee4cfc1124bad05dabb.pickle -------------------------------------------------------------------------------- 
/lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-rve5tvzjsb8qh6hcwdiq6qm.1022022f6d1712c26c86b3d4bc9152aeebc39806b9ea36285b5f3135cc53d672.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-rve5tvzjsb8qh6hcwdiq6qm.1022022f6d1712c26c86b3d4bc9152aeebc39806b9ea36285b5f3135cc53d672.pickle -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-swwvf71e2zvm5due6ongmjx.db0edaa58fcb926c43f191eca5b35e24721662924f3a3f28e544c49ed06c0fc4.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-swwvf71e2zvm5due6ongmjx.db0edaa58fcb926c43f191eca5b35e24721662924f3a3f28e544c49ed06c0fc4.pickle -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-u2q5b6b3k1liz40fdy8ibi9.a1761cb4fab284b222950d77e6deda85b44a607bcc588b9f38bff80d97b2a6dc.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-u2q5b6b3k1liz40fdy8ibi9.a1761cb4fab284b222950d77e6deda85b44a607bcc588b9f38bff80d97b2a6dc.pickle -------------------------------------------------------------------------------- 
/lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-utp720v7v9ufcvce5tro2su.921a82b7ea58fbda1f36ce3aa828ef96efb10836a04560a6e4739b3182db4820.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-utp720v7v9ufcvce5tro2su.921a82b7ea58fbda1f36ce3aa828ef96efb10836a04560a6e4739b3182db4820.pickle -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-xjnfbj6qqys50k9pi988awd.fb4d50848f84e0f145c1d091aee688dbae19b8ef420ac7b4158042785fe24578.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-xjnfbj6qqys50k9pi988awd.fb4d50848f84e0f145c1d091aee688dbae19b8ef420ac7b4158042785fe24578.pickle -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-zd8xb8wbpv6nit9wmujsjpi.d24c185d792566b7731a2d8aad7dc5cca24c4a1ac9fcd814f96fa9d7d0087a53.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/materialize/llmfilter-ntsb-temp/md-d-zd8xb8wbpv6nit9wmujsjpi.d24c185d792566b7731a2d8aad7dc5cca24c4a1ac9fcd814f96fa9d7d0087a53.pickle -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/ocr_pdfs/test_simple_ocr.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/ocr_pdfs/test_simple_ocr.pdf -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/pdfs/Ray.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/pdfs/Ray.pdf -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/pdfs/Ray_page1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/pdfs/Ray_page1.pdf -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/pdfs/Ray_page11.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/pdfs/Ray_page11.pdf -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/pdfs/Transformer.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/pdfs/Transformer.pdf -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/pdfs/basic_table.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/pdfs/basic_table.pdf -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/pdfs/doctor_testimonial.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/pdfs/doctor_testimonial.pdf -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/pdfs/ntsb-report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/pdfs/ntsb-report.pdf -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/pdfs/ntsb0.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/pdfs/ntsb0.pdf -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/pdfs/ntsb1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/pdfs/ntsb1.pdf -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/pdfs/ntsb3.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/pdfs/ntsb3.pdf -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/pdfs/visit_aryn.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/pdfs/visit_aryn.pdf -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/data/pptx/design.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/data/pptx/design.pptx -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/resources/objects/weaviate/collection_params_b.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/resources/objects/weaviate/collection_params_b.pickle -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/unit/__init__.py -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/unit/connectors/common/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/unit/connectors/common/__init__.py -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/unit/data/test_bbox.py: -------------------------------------------------------------------------------- 1 | from sycamore.data import BoundingBox 2 | from sycamore.data.bbox import EMPTY_BBOX 3 | from math import isclose 4 | 5 | # Test bounding boxes. Box 1 and 2 intersect. Box 3 is intersects with Box 1 but not Box 2. Box 4 and 5 6 | # are disjoint from all others and are empty. 7 | bbox1 = BoundingBox(20.0, 20.0, 60.0, 60.0) 8 | bbox2 = BoundingBox(10.0, 40.0, 40.0, 80.0) 9 | bbox3 = BoundingBox(50.0, 10.0, 90.0, 50.0) 10 | bbox4 = BoundingBox(100.0, 100.0, 100.0, 120.0) 11 | bbox5 = BoundingBox(100.0, 100.0, 100.0, 100.0) 12 | 13 | 14 | def test_intersect(): 15 | assert isclose(bbox1.intersect(bbox2).area, 400.0) 16 | assert isclose(bbox2.intersect(bbox1).area, 400.0) 17 | 18 | assert isclose(bbox1.intersect(bbox3).area, 300.0) 19 | assert isclose(bbox2.intersect(bbox3).area, 0.0) 20 | 21 | 22 | def test_union(): 23 | assert isclose(bbox1.union(bbox1).area, bbox1.area) 24 | assert isclose(bbox1.union(bbox2).area, 3000.0) 25 | 26 | assert isclose(EMPTY_BBOX.union(bbox1).area, bbox1.area) 27 | assert isclose(bbox1.union(EMPTY_BBOX).area, bbox1.area) 28 | 29 | 30 | def test_absolute_relative(): 31 | assert bbox1.area == bbox1.to_relative(100, 100).to_absolute(100, 100).area 32 | assert isclose(bbox1.to_relative(100, 100).area, 0.16) 33 | 34 | 35 | def test_empty_bbox(): 36 | assert bbox5.is_empty() 37 | assert bbox4.is_empty() 38 | assert bbox4.area == 0 and bbox5.area == 0 39 | assert bbox4.union(bbox5).is_empty() and bbox4.union(bbox5).area == 0 40 | assert bbox4.intersect(bbox5).is_empty() and bbox4.intersect(bbox5) == EMPTY_BBOX 41 | 
-------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/unit/data/test_element.py: -------------------------------------------------------------------------------- 1 | from sycamore.data.element import create_element, Element, ImageElement, TableElement 2 | from sycamore.data.table import Table, TableCell 3 | 4 | 5 | def test_create_element_bad_type(): 6 | e = create_element(type=None) 7 | assert isinstance(e, Element) 8 | 9 | e = create_element(type={}) 10 | assert isinstance(e, Element) 11 | 12 | e = create_element(type="iMaGE") 13 | assert isinstance(e, ImageElement) 14 | 15 | 16 | def test_field_to_value_table(): 17 | table = Table( 18 | [ 19 | TableCell(content="head1", rows=[0], cols=[0], is_header=True), 20 | TableCell(content="head2", rows=[0], cols=[1], is_header=True), 21 | TableCell(content="3", rows=[1], cols=[0], is_header=False), 22 | TableCell(content="4", rows=[1], cols=[1], is_header=False), 23 | ] 24 | ) 25 | 26 | elem = create_element(type="table", table=table, properties={"parent": {"child1": 1, "child2": 2}}) 27 | assert isinstance(elem, TableElement) 28 | 29 | assert elem.field_to_value("properties.parent.child1") == 1 30 | assert elem.field_to_value("properties.parent.child2") == 2 31 | assert elem.field_to_value("properties.parent.child3") is None 32 | 33 | assert elem.field_to_value("text_representation") == "head1,head2\n3,4\n" 34 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/unit/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/unit/evaluation/__init__.py -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/unit/functions/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/unit/functions/__init__.py -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/unit/functions/test_field_to_value.py: -------------------------------------------------------------------------------- 1 | from sycamore.data.document import Document 2 | 3 | 4 | class TestFieldToValue: 5 | def test_field_to_value(self): 6 | doc = Document( 7 | text_representation="hello", 8 | doc_id=1, 9 | properties={"letter": "A", "animal": "panda", "math": {"pi": 3.14, "e": 2.72, "tanx": "sinx/cosx"}}, 10 | ) 11 | 12 | assert doc.field_to_value("text_representation") == "hello" 13 | assert doc.field_to_value("doc_id") == 1 14 | assert doc.field_to_value("properties.letter") == "A" 15 | assert doc.field_to_value("properties.animal") == "panda" 16 | assert doc.field_to_value("properties.math.pi") == 3.14 17 | assert doc.field_to_value("properties.math.e") == 2.72 18 | assert doc.field_to_value("properties.math.tanx") == "sinx/cosx" 19 | 20 | assert doc.field_to_value("properties.math.log") is None 21 | assert doc.field_to_value("document_id") is None 22 | assert doc.field_to_value("text_representation.text") is None 23 | assert doc.field_to_value("document_id.text") is None 24 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/unit/functions/test_rabin_karp.py: -------------------------------------------------------------------------------- 1 | from sycamore.functions.rabin_karp import RkHash, RkWindow 2 | 3 | 4 | class TestRabinKarp: 5 | def test_smoke(self): 6 | aa = RkHash(3) 7 | aa.hashIn(101) 8 | aa.hashIn(102) 9 | aa.hashIn(103) 10 | 11 | bb = RkHash(3) 12 | bb.hashIn(100) 13 | bb.hashIn(101) 14 | bb.hashIn(102) 15 | bb.hashOut(100) 16 | bb.hashIn(103) 17 | 18 | 
assert aa.get() == bb.get() 19 | 20 | def test_larger(self): 21 | aa = RkHash(32) 22 | for ch in range(101, 133): 23 | aa.hashIn(ch) 24 | 25 | bb = RkHash(32) 26 | for ch in range(100, 132): 27 | bb.hashIn(ch) 28 | bb.hashOutIn(100, 132) 29 | 30 | assert aa.get() == bb.get() 31 | 32 | def test_window(self): 33 | aa = RkHash(32) 34 | for ch in range(101, 133): 35 | aa.hashIn(ch) 36 | 37 | ww = RkWindow(32) 38 | for ch in range(100, 133): 39 | ww.hash(ch) 40 | 41 | assert aa.get() == ww.get() 42 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/unit/functions/test_text_chunker.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from sycamore.functions import TextOverlapChunker 4 | 5 | 6 | class TestTokenOverlapChunker: 7 | @pytest.mark.parametrize( 8 | "chunker, tokens, expected_chunks", 9 | [ 10 | ( 11 | TextOverlapChunker(chunk_token_count=2, chunk_overlap_token_count=1), 12 | ["a", "b", "c", "d", "e"], 13 | [["a", "b"], ["b", "c"], ["c", "d"], ["d", "e"], ["e"]], 14 | ), 15 | ( 16 | TextOverlapChunker(chunk_token_count=2, chunk_overlap_token_count=0), 17 | ["a", "b", "c", "d", "e"], 18 | [["a", "b"], ["c", "d"], ["e"]], 19 | ), 20 | ], 21 | ) 22 | def test_token_overlap_chunker(self, chunker, tokens, expected_chunks): 23 | chunks = chunker.chunk(tokens) 24 | assert chunks == expected_chunks 25 | 26 | def test_token_overlap_is_greater_than_chunk_size(self): 27 | with pytest.raises(Exception) as exception: 28 | TextOverlapChunker(chunk_token_count=2, chunk_overlap_token_count=2) 29 | assert str(exception.value) == "Token overlap count between chunks must be lesser than chunk token count" 30 | 31 | with pytest.raises(Exception) as exception: 32 | TextOverlapChunker(chunk_token_count=2, chunk_overlap_token_count=3) 33 | assert str(exception.value) == "Token overlap count between chunks must be lesser than chunk token count" 34 | 
-------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/unit/functions/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from sycamore.functions import CharacterTokenizer 4 | 5 | 6 | class TestTokenOverlapChunker: 7 | @pytest.mark.parametrize( 8 | "tokenizer, text, expected_tokens", 9 | [(CharacterTokenizer(), "a test", ["a", " ", "t", "e", "s", "t"]), (CharacterTokenizer(), "", [])], 10 | ) 11 | def test_character_tokenizer(self, tokenizer, text, expected_tokens): 12 | tokens = tokenizer.tokenize(text) 13 | assert tokens == expected_tokens 14 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/unit/query/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/unit/query/__init__.py -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/unit/query/execution/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/unit/query/execution/__init__.py -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/unit/scans/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/unit/scans/__init__.py -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/unit/scans/test_materialized_scan.py: 
-------------------------------------------------------------------------------- 1 | import pytest 2 | from pandas import DataFrame 3 | from pyarrow import Table 4 | 5 | from sycamore.connectors.file import ArrowScan, DocScan, PandasScan 6 | from sycamore.data import Document 7 | 8 | 9 | class TestMaterializedScan: 10 | dicts = [{"doc_id": 1, "type": "hello, world!"}, {"doc_id": 2, "type": "你好,世界!"}] 11 | 12 | @pytest.mark.parametrize( 13 | "scanner", 14 | [ArrowScan(Table.from_pylist(dicts)), DocScan([Document(d) for d in dicts]), PandasScan(DataFrame(dicts))], 15 | ) 16 | def test_materialized_scan(self, scanner): 17 | ds = scanner.execute() 18 | assert ds.schema().names == ["doc"] 19 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/unit/test_grouped_data.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import sycamore 4 | from sycamore import DocSet 5 | from sycamore.data import Document 6 | 7 | 8 | class TestGroup: 9 | @pytest.fixture 10 | def fruits_docset(self) -> DocSet: 11 | doc_list = [ 12 | Document(text_representation="apple", parent_id=8, properties={"name": "A"}), 13 | Document(text_representation="banana", parent_id=7, properties={"name": "B"}), 14 | Document(text_representation="apple", parent_id=8, properties={"name": "C"}), 15 | Document(text_representation="banana", parent_id=7, properties={"name": "D"}), 16 | Document(text_representation="cherry", parent_id=6, properties={"name": "E"}), 17 | Document(text_representation="apple", parent_id=9, properties={"name": "F"}), 18 | ] 19 | context = sycamore.init() 20 | return context.read.document(doc_list) 21 | 22 | def test_groupby_count(self, fruits_docset): 23 | aggregated = fruits_docset.groupby("text_representation").count() 24 | assert aggregated.count() == 3 25 | 26 | def test_groupby_collect(self, fruits_docset): 27 | aggregated = 
fruits_docset.groupby("text_representation", entity="properties.name").collect() 28 | assert aggregated.count() == 3 29 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/unit/test_rewriter.py: -------------------------------------------------------------------------------- 1 | from sycamore.rules import EnforceResourceUsage 2 | from sycamore.connectors.file import BinaryScan 3 | from sycamore.transforms import Partition, Explode 4 | from sycamore.transforms.partition import UnstructuredPdfPartitioner 5 | from sycamore.connectors.opensearch import OpenSearchWriterClientParams, OpenSearchWriterTargetParams, OpenSearchWriter 6 | 7 | 8 | class TestRewriter: 9 | def test_enforce_resource_usage(self): 10 | scan = BinaryScan("path", binary_format="pdf") 11 | partition = Partition(scan, UnstructuredPdfPartitioner()) 12 | explode = Explode(partition) 13 | writer = OpenSearchWriter( 14 | explode, OpenSearchWriterClientParams(), OpenSearchWriterTargetParams(index_name="test") 15 | ) 16 | 17 | rule = EnforceResourceUsage() 18 | writer.traverse_down(rule) 19 | assert scan.resource_args["num_cpus"] == 1 and "num_gpus" not in scan.resource_args 20 | assert explode.resource_args["num_cpus"] == 1 and "num_gpus" not in explode.resource_args 21 | assert writer.resource_args["num_cpus"] == 1 and "num_gpus" not in writer.resource_args 22 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/unit/transforms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/lib/sycamore/sycamore/tests/unit/transforms/__init__.py -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/unit/transforms/check_partition_impl.py: 
-------------------------------------------------------------------------------- 1 | from sycamore.data.bbox import BoundingBox 2 | from sycamore.utils.cache import Cache 3 | 4 | 5 | def check_partition(partitioner, path, **kwargs): 6 | with open(path, "rb") as f: 7 | hash_key = Cache.get_hash_context(f.read()).hexdigest() 8 | batched = partitioner._partition_pdf_batched_named(path, hash_key, **kwargs) 9 | assert batched is not None 10 | return batched 11 | 12 | 13 | def check_table_extraction(partitioner, path, **kwargs): 14 | with open(path, "rb") as f: 15 | hash_key = Cache.get_hash_context(f.read()).hexdigest() 16 | batched = partitioner._partition_pdf_batched_named(path, hash_key, **kwargs) 17 | assert all( 18 | ( 19 | d.tokens is not None 20 | and all("bbox" in token and isinstance(token["bbox"], BoundingBox) for token in d.tokens) 21 | if d.type == "table" 22 | else True 23 | ) 24 | for batched_list in batched 25 | for d in batched_list 26 | ) 27 | return batched 28 | 29 | 30 | if __name__ == "__main__": 31 | import sys 32 | from sycamore.transforms.detr_partitioner import ArynPDFPartitioner 33 | 34 | assert len(sys.argv) == 2, "Usage: cmd " 35 | s = ArynPDFPartitioner("Aryn/deformable-detr-DocLayNet") 36 | print(f"Comparing processing of {sys.argv[1]}") 37 | p = check_partition(s, sys.argv[1]) 38 | print(f"Compared {len(p)} pages") 39 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/unit/transforms/test_augment_text.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import textwrap 3 | from sycamore.data import Document 4 | from sycamore.transforms.augment_text import UDFTextAugmentor, JinjaTextAugmentor 5 | 6 | 7 | class TestAugmentText: 8 | doc = Document( 9 | { 10 | "doc_id": "doc_id", 11 | "type": "pdf", 12 | "text_representation": "text", 13 | "properties": {"path": "/docs/foo.txt", "title": "bar"}, 14 | } 15 | ) 16 | 17 | def 
test_udf_augmentation(self): 18 | def f(doc: Document) -> str: 19 | if doc.doc_id == "doc_id": 20 | return "doc_id" 21 | else: 22 | return "not doc id" 23 | 24 | aug = UDFTextAugmentor(f) 25 | text = aug.augment_text(self.doc) 26 | assert text == "doc_id" 27 | text2 = aug.augment_text(Document()) 28 | assert text2 == "not doc id" 29 | 30 | def test_jinja_augmentation(self): 31 | template = textwrap.dedent( 32 | """\ 33 | {% if doc.properties['path'] %}path: {{ pathlib.Path(doc.properties['path']).name }}.{% endif %} 34 | {% if doc.properties['title'] %}Title: {{ doc.properties['title'] }}.{% endif %} 35 | {% if doc.text_representation %}{{ doc.text_representation }}{% endif %}""" 36 | ) 37 | aug = JinjaTextAugmentor(template=template, modules={"pathlib": pathlib}) 38 | text = aug.augment_text(self.doc) 39 | print(text) 40 | assert text == "path: foo.txt.\nTitle: bar.\ntext" 41 | text2 = aug.augment_text(Document()) 42 | assert text2 == "\n\n" 43 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/unit/transforms/test_random_sample.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import pytest 4 | 5 | import sycamore 6 | from sycamore import DocSet 7 | from sycamore.data import Document 8 | 9 | 10 | class TestRandomSample: 11 | exec_mode = sycamore.EXEC_LOCAL 12 | 13 | @pytest.fixture() 14 | def docs(self) -> list[Document]: 15 | print("Generating docs") 16 | return [ 17 | Document(text_representation=f"Document {i}", doc_id=i, properties={"document_number": i}) 18 | for i in range(100) 19 | ] 20 | 21 | @pytest.fixture() 22 | def docset(self, docs: list[Document]) -> DocSet: 23 | context = sycamore.init(exec_mode=self.exec_mode) 24 | return context.read.document(docs) 25 | 26 | def test_empty_sample(self, docset: DocSet): 27 | assert docset.random_sample(0).count() == 0 28 | 29 | def test_complete_sample(self, docset: DocSet): 30 | assert 
docset.random_sample(1).count() == 100 31 | 32 | def test_random_sample(self, docset: DocSet): 33 | actual = docset.random_sample(0.5).count() 34 | math.isclose(actual, 50, rel_tol=2, abs_tol=2) 35 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/unit/transforms/text_extraction/test_ocr_models.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | from sycamore.transforms.text_extraction.ocr_models import EasyOcr 4 | from unittest.mock import patch 5 | 6 | 7 | @patch.dict(os.environ, {"ARYN_AIRGAPPED": "true"}) 8 | def test_air_gap(): 9 | if os.path.exists("/app/models") or os.path.exists("/aryn/models"): 10 | print("One of /app/models and /aryn/models exists; unable to test airgapping") 11 | return 12 | 13 | # fails because the model doesn't exist. 14 | with pytest.raises(AssertionError): 15 | EasyOcr() 16 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/unit/utils/test_deep_eq.py: -------------------------------------------------------------------------------- 1 | from sycamore.utils.deep_eq import deep_eq 2 | 3 | 4 | def test_deep_eq(): 5 | class Tmp: 6 | def __init__(self, x, y): 7 | self.x = x 8 | self.y = y 9 | 10 | def doit(): 11 | pass 12 | 13 | assert deep_eq(Tmp(1, 1), Tmp(1, 1)) 14 | assert not deep_eq(Tmp(1, 1), Tmp(1, 2)) 15 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/unit/utils/test_extract_json.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pytest 3 | 4 | from sycamore.utils.extract_json import extract_json 5 | 6 | 7 | def test_perfect(): 8 | want = {"a": 5, "b": {"c": "y"}} 9 | assert extract_json(json.dumps(want)) == want 10 | 11 | 12 | def test_none(): 13 | want = {"None": None} 14 | input = '{ "None": None }' 15 | assert 
extract_json(input) == want 16 | 17 | 18 | def test_bad_escape(): 19 | want = "\xff" 20 | input = '"\xff"' # json.loads("\xFF") -> error; escaping is \uHHHH 21 | assert extract_json(input) == want 22 | 23 | 24 | def test_code_block(): 25 | want = {"a": 5, "x": "y"} 26 | # Note extract_json does not tolerate any leading whitespace 27 | input = """```json 28 | { "a": 5, "x": "y" } 29 | ``` 30 | """ 31 | assert extract_json(input) == want 32 | 33 | 34 | def test_fails(): 35 | with pytest.raises(ValueError): 36 | extract_json("1-2") 37 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/unit/utils/test_fileformat_tools.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import shutil 4 | from io import BytesIO 5 | 6 | from pypdf import PdfReader 7 | 8 | import sycamore 9 | from sycamore.tests.config import TEST_DIR 10 | from sycamore.utils.fileformat_tools import binary_representation_to_pdf, get_file_extension 11 | 12 | 13 | def test_binary_representation_to_pdf(): 14 | # Run this test locally only if libreoffice is installed 15 | if shutil.which("libreoffice") is None: 16 | assert "GITHUB_ACTIONS" not in os.environ 17 | logging.warning("Skipping test ...; /usr/bin/libreoffice is not installed") 18 | return 19 | paths = str(TEST_DIR / "resources/data/docx/aryn_website_sample.docx") 20 | 21 | context = sycamore.init() 22 | doc = context.read.binary(paths, binary_format="docx").take(1)[0] 23 | result = binary_representation_to_pdf(doc) 24 | 25 | pdf_bytes = BytesIO(result.binary_representation) 26 | reader = PdfReader(pdf_bytes) 27 | assert len(reader.pages) == 2 28 | 29 | 30 | def test_get_file_extension(): 31 | data = { 32 | "file:///tmp/filename.txt": ".txt", 33 | "filename.docx": ".docx", 34 | "local/dir/filename.doc": ".doc", 35 | "s3://bucket/prefix/filename.xml": ".xml", 36 | "/home/ec2-user/random_file.some_extension": 
".some_extension", 37 | "/home/ec2-user/random_file": "", 38 | "unknown": "", 39 | } 40 | for k, v in data.items(): 41 | assert get_file_extension(k) == v 42 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/unit/utils/test_import_utils.py: -------------------------------------------------------------------------------- 1 | from typing_extensions import assert_type 2 | from sycamore.utils.import_utils import requires_modules 3 | 4 | 5 | # The actual library doesn't matter here. 6 | @requires_modules("apted", extra="eval") 7 | def require_fn() -> int: 8 | return 42 9 | 10 | 11 | # This test fails prior to adding generic (ParamSpec and TypeVar) type annotations to the 12 | # requires_modules decorator, as the revealed type is "Any". 13 | def test_mypy_type() -> None: 14 | res = require_fn() 15 | assert_type(res, int) # type: ignore[assert-type] 16 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/unit/utils/test_jupyter.py: -------------------------------------------------------------------------------- 1 | from sycamore.utils.jupyter import LocalFileViewer 2 | import socket 3 | import time 4 | import gc 5 | 6 | 7 | def try_start_server(port: int, xfail: bool): 8 | if xfail: 9 | conflicted = False 10 | for _ in range(100): 11 | time.sleep(0.1) 12 | try: 13 | socket.create_server(("localhost", port)).close() 14 | except OSError: 15 | conflicted = True 16 | break 17 | assert conflicted 18 | else: 19 | # This branch is only really meaningful if called after 20 | # xfail=True, to test that stuff was cleaned up properly. 
21 | time.sleep(0.1) 22 | socket.create_server(("localhost", port)).close() 23 | 24 | 25 | def test_localfileviewer_no_leaks(): 26 | port = 2647 27 | viewpdf = LocalFileViewer(port=port) 28 | try_start_server(port, xfail=True) 29 | 30 | newport = 2648 31 | viewpdf = LocalFileViewer(port=newport) 32 | # gc the original viewpdf 33 | gc.collect() 34 | try_start_server(port, xfail=False) 35 | try_start_server(newport, xfail=True) 36 | 37 | del viewpdf 38 | gc.collect() 39 | try_start_server(newport, xfail=False) 40 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/unit/utils/test_nested.py: -------------------------------------------------------------------------------- 1 | from sycamore.utils.nested import nested_lookup, dotted_lookup 2 | 3 | 4 | def test_nested_lookup(): 5 | v = {"a": {"b": 1, "c": 2, "": 5}, "d": {}} 6 | assert nested_lookup(v, ["a", "b"]) == 1 7 | assert nested_lookup(v, ["a", "b", "c"]) is None 8 | assert nested_lookup(v, ["a", "c"]) == 2 9 | assert nested_lookup(v, ["a", ""]) == 5 10 | assert nested_lookup(v, ["a", "d"]) is None 11 | assert nested_lookup(v, ["d"]) == {} 12 | assert nested_lookup(v, ["d", 1]) is None 13 | assert nested_lookup(v, [3]) is None 14 | 15 | 16 | def test_dotted_lookup(): 17 | v = {"a": {"b": 1, "c": 2, "": 5}, "d": {}} 18 | assert dotted_lookup(v, "a.b") == 1 19 | assert dotted_lookup(v, "a.b.c") is None 20 | assert dotted_lookup(v, "a.c") == 2 21 | assert dotted_lookup(v, "a.") == 5 22 | assert dotted_lookup(v, "a.d") is None 23 | assert dotted_lookup(v, "d") == {} 24 | assert dotted_lookup(v, "d.1") is None 25 | assert dotted_lookup(v, "3") is None 26 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/unit/utils/test_pydantic_pickling.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from typing import Optional 3 | from 
sycamore.utils.pickle_pydantic import safe_cloudpickle, safe_cloudunpickle 4 | from pydantic import BaseModel, Field 5 | 6 | 7 | def test_pydantic_picklng() -> None: 8 | class BoardMember(BaseModel): 9 | name: str 10 | votes_for: Optional[int] 11 | votes_against: Optional[int] 12 | votes_abstentions: Optional[int] 13 | 14 | class Company(BaseModel): 15 | name: str 16 | founded: Optional[str] = Field(description="The date a company was founded") 17 | 18 | company_pickled = safe_cloudpickle(Company) 19 | board_member_pickled = safe_cloudpickle(BoardMember) 20 | 21 | company_unpickled = safe_cloudunpickle(company_pickled) 22 | board_member_unpickled = safe_cloudunpickle(board_member_pickled) 23 | 24 | assert sys.getsizeof(company_unpickled) == sys.getsizeof(Company) 25 | assert sys.getsizeof(board_member_unpickled) == sys.getsizeof(BoardMember) 26 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/tests/unit/utils/test_ray_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from sycamore.utils.ray_utils import check_serializable, handle_serialization_exception 4 | 5 | 6 | def test_non_serializable(): 7 | import threading 8 | 9 | lock1 = threading.Lock() 10 | lock2 = threading.Lock() 11 | with pytest.raises(ValueError): 12 | # Make sure this works with passing multiple objects. 
def test_make_element_sorter_fn_no_similarity_query():
    """Without a similarity query, the factory returns a no-op sorter."""
    noop_sorter = make_element_sorter_fn("test_field", None, Mock())
    assert noop_sorter({}) is None


def test_make_element_sorter_fn_no_similarity_scorer():
    """Supplying a query without a scorer is a usage error and must be rejected."""
    with pytest.raises(AssertionError, match="Similarity sorting requires a scorer"):
        make_element_sorter_fn("test_field", "query", None)
class TestTimeTrace:
    """Smoke tests for TimeTrace; traces land under a throwaway TIMETRACE directory."""

    @pytest.fixture(autouse=True)
    def set_env(self):
        # Point TIMETRACE at a fresh temp dir so each test writes its trace
        # files in isolation and they are cleaned up automatically.
        with tempfile.TemporaryDirectory() as trace_dir:
            os.environ["TIMETRACE"] = f"{trace_dir}/tt"
            yield

    def test_with(self):
        """TimeTrace works as a context manager."""
        with TimeTrace("test_with"):
            time.sleep(0.01)

    def test_start_end(self):
        """TimeTrace works via explicit start()/end() calls."""
        tracer = TimeTrace("test_start_end")
        tracer.start()
        time.sleep(0.01)
        tracer.end()


class TestLogTime:
    def test_simple(self):
        """measure() reports at least 1s of wall time for a 1s sleep, with negligible CPU."""
        timer = LogTime("simple")
        timer.start()
        time.sleep(1)
        duration = timer.measure()
        assert duration.wall_s() >= 1
        assert duration.user_s() <= 0.1
        assert duration.sys_s() <= 0.1

    def test_point(self):
        """A point-in-time log should construct without crashing."""
        LogTime("point", point=True)

    def test_with_start(self):
        """log_start=True should log on entry without crashing."""
        with LogTime("start", log_start=True):
            pass
11 | """ 12 | 13 | def __init__(self, dataset: "Dataset", **resource_args): 14 | super().__init__(**resource_args) 15 | self._dataset = dataset 16 | 17 | def execute(self, **kwargs) -> "Dataset": 18 | return self._dataset 19 | 20 | def format(self): 21 | return "dataset" 22 | 23 | def __str__(self): 24 | return f"DatasetScan({self._dataset})" 25 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/transforms/detr_partitioner_config.py: -------------------------------------------------------------------------------- 1 | ARYN_DETR_MODEL = "Aryn/deformable-detr-DocLayNet" 2 | DEFAULT_ARYN_PARTITIONER_ADDRESS = "https://api.aryn.cloud/v1/document/partition" 3 | DEFAULT_LOCAL_THRESHOLD = 0.35 4 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/transforms/random_sample.py: -------------------------------------------------------------------------------- 1 | import random 2 | from typing import Optional, TYPE_CHECKING 3 | 4 | from sycamore.plan_nodes import Node, Transform 5 | from sycamore.data import Document 6 | 7 | if TYPE_CHECKING: 8 | from ray.data import Dataset 9 | 10 | 11 | class RandomSample(Transform): 12 | """ 13 | Generates a random sample of documents in a collection. 14 | 15 | Args: 16 | child: The plan node providing the dataset. 17 | fraction: The fraction of documents to retain. 18 | seed: The seed to use to initialize the RNG. 19 | resource_args: Additional resource-related arguments to pass to the execution env. 
20 | """ 21 | 22 | def __init__(self, child: Node, fraction: float, seed: Optional[int] = None, **resource_args): 23 | super().__init__(child, **resource_args) 24 | self.fraction = fraction 25 | self.seed = seed 26 | 27 | def execute(self, **kwargs) -> "Dataset": 28 | dataset = self.child().execute() 29 | return dataset.random_sample(self.fraction, seed=self.seed) 30 | 31 | def local_execute(self, all_docs: list[Document]) -> list[Document]: 32 | if self.seed is not None: 33 | random.seed(self.seed) 34 | return random.sample(all_docs, int(len(all_docs) * self.fraction)) 35 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/transforms/text_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | from sycamore.transforms.text_extraction.ocr_models import OcrModel, PaddleOcr, LegacyOcr, Tesseract, EasyOcr 2 | from sycamore.transforms.text_extraction.pdf_miner import PdfMinerExtractor 3 | from sycamore.transforms.text_extraction.text_extractor import TextExtractor 4 | 5 | EXTRACTOR_DICT = { 6 | "paddle": PaddleOcr, 7 | "legacy": LegacyOcr, 8 | "tesseract": Tesseract, 9 | "easyocr": EasyOcr, 10 | "pdfminer": PdfMinerExtractor, 11 | } 12 | 13 | 14 | def get_text_extractor(extractor_type: str, **kwargs) -> TextExtractor: 15 | if extractor_type not in EXTRACTOR_DICT: 16 | raise ValueError(f"Invalid TextExtractor type {extractor_type}") 17 | return EXTRACTOR_DICT[extractor_type](**kwargs) 18 | 19 | 20 | __all__ = [ 21 | "PaddleOcr", 22 | "LegacyOcr", 23 | "Tesseract", 24 | "EasyOcr", 25 | "PdfMinerExtractor", 26 | "OcrModel", 27 | "TextExtractor", 28 | ] 29 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/transforms/text_extraction/text_extractor.py: -------------------------------------------------------------------------------- 1 | from typing import Union, TYPE_CHECKING, Optional, Any 2 | from abc import 
def choose_device(want: Optional[str], *, detr=False) -> str:
    """Pick a torch device string ("cpu", "cuda", or a caller-requested one).

    Resolution order: the DISABLE_GPU=1 environment override wins over
    everything, then an explicit ``want``, then "cuda" when available,
    otherwise "cpu".
    """
    # Environment override beats everything, including an explicit `want`.
    if os.environ.get("DISABLE_GPU") == "1":
        return "cpu"
    if want:
        return want

    from sycamore.utils.import_utils import import_modules

    import_modules("torch.cuda", extra="local-inference")
    import torch.cuda

    return "cuda" if torch.cuda.is_available() else "cpu"
    # !!! as of 6/17/2024 on macs cpu is faster than mps

    # Deliberately unreachable: the mps path below is parked until mps
    # outperforms cpu; re-enable by removing the early return above.
    import torch.backends.mps

    if torch.backends.mps.is_available():
        if detr:
            import torch

            if torch.__version__ < "2.3":
                return "cpu"  # Older torch doesn't support DETR on MPS
        return "mps"

    return "cpu"
def deep_eq(a, b):
    """Boolean wrapper around assert_deep_eq: True when a and b are deep-equal.

    On mismatch, prints the failing assertion's message and returns False
    instead of raising.
    """
    try:
        assert_deep_eq(a, b, [])
    except AssertionError as err:
        print(f"Equality failed: {err}")
        return False
    return True
Reason: {reason}" 18 | 19 | @wraps(fn) 20 | def wrapper(*args: P.args, **kwargs: P.kwargs) -> T: 21 | warnings.warn(warn_msg, category=FutureWarning) 22 | return fn(*args, **kwargs) 23 | 24 | return wrapper 25 | 26 | return decorator 27 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/utils/extract_json.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | from json import JSONDecodeError 4 | from typing import Any 5 | 6 | 7 | def extract_json(payload: str) -> Any: 8 | """Given the provided payload, extract the JSON block from it.""" 9 | 10 | # Replace Python's None with JSON's null, being careful to not replace 11 | # strings that might contain "None" as part of their content 12 | payload = re.sub(r":\s*None\b", ": null", payload) 13 | 14 | try: 15 | return json.loads(payload) 16 | except (ValueError, TypeError, JSONDecodeError) as exc: 17 | # Sometimes the LLM makes up an escape code. In that case, 18 | # replace the escape char with its representation (e.g. \\x07) 19 | # and recurse. 20 | if isinstance(exc, JSONDecodeError) and "Invalid \\escape" in exc.msg: 21 | c = payload[exc.pos] 22 | payload = payload[: exc.pos] + repr(c)[1:-1] + payload[exc.pos + 1 :] 23 | return extract_json(payload) 24 | # It is possible that the LLM response includes a code block with JSON data. 25 | # Pull the JSON content out from it. 
26 | pattern = r"```json([\s\S]*?)```" 27 | match = re.match(pattern, payload) 28 | if match: 29 | return json.loads(match.group(1)) 30 | else: 31 | raise ValueError("JSON block not found in LLM response: " + str(payload)) from exc 32 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/utils/html_utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from sycamore.data import Document, TableElement 4 | from sycamore.data.document import DocumentPropertyTypes 5 | 6 | 7 | def to_html_tables(doc: Document) -> list[Document]: 8 | new_docs = [] 9 | for table_num, e in enumerate(el for el in doc.elements if el.type == "table"): 10 | if not isinstance(e, TableElement) or e.table is None: 11 | raise ValueError(f"Unable to generate html string for element {e}") 12 | 13 | new_text = e.table.to_html(pretty=True, wrap_in_html=True) 14 | 15 | new_doc = Document(text_representation=new_text) 16 | 17 | new_doc.properties["path"] = doc.properties["path"] 18 | 19 | if DocumentPropertyTypes.PAGE_NUMBER in doc.properties: 20 | new_doc.properties[DocumentPropertyTypes.PAGE_NUMBER] = doc.properties[DocumentPropertyTypes.PAGE_NUMBER] 21 | new_doc.properties["table_num"] = table_num 22 | new_docs.append(new_doc) 23 | 24 | return new_docs 25 | 26 | 27 | def html_table_filename_fn(doc: Document) -> str: 28 | path = Path(doc.properties["path"]) 29 | base_name = ".".join(path.name.split(".")[0:-1]) 30 | if "table_num" in doc.properties: 31 | suffix = doc.properties["table_num"] 32 | else: 33 | suffix = doc.doc_id 34 | return f"{base_name}_table_{suffix}.html" 35 | -------------------------------------------------------------------------------- /lib/sycamore/sycamore/utils/lineage_utils.py: -------------------------------------------------------------------------------- 1 | from sycamore.data import Document, MetadataDocument 2 | 3 | 4 | def update_lineage(from_docs: 
def display_top(snapshot, key_type="lineno", limit=10):
    """Print the ``limit`` largest allocation sites from a tracemalloc snapshot."""
    # NOTE(review): these filter patterns look truncated in this copy (the
    # usual idiom excludes "<frozen importlib._bootstrap>" and "<unknown>")
    # -- confirm against upstream before relying on the filtering.
    snapshot = snapshot.filter_traces(
        (
            tracemalloc.Filter(False, ""),
            tracemalloc.Filter(False, ""),
        )
    )
    top_stats = snapshot.statistics(key_type)

    print(f"Top {limit} lines")
    for index, stat in enumerate(top_stats[:limit], 1):
        frame = stat.traceback[0]
        print(f"#{index}: {frame.filename}:{frame.lineno}: {stat.size / 1024:.1f} KiB")
        line = linecache.getline(frame.filename, frame.lineno).strip()
        if line:
            print(f" {line}")

    remainder = top_stats[limit:]
    if remainder:
        remainder_size = sum(stat.size for stat in remainder)
        print(f"{len(remainder)} other: {remainder_size / 1024:.1f} KiB")
    total = sum(stat.size for stat in top_stats)
    print(f"Total allocated size: {total / 1024:.1f} KiB")


def gc_tensor_dump():
    """Count live torch tensors; only runs while tracemalloc is tracing."""
    if not tracemalloc.is_tracing():
        return
    import torch
    import gc

    count = 0
    for obj in gc.get_objects():
        try:
            if torch.is_tensor(obj) or (hasattr(obj, "data") and torch.is_tensor(obj.data)):
                # print(type(obj), obj.size())
                count += 1
        except Exception as e:
            print(f"Exception {e}")
    print(f"Found {count} tensors")
def combine_strs_min_newline(*strs: Union[str, None]) -> str:
    """Concatenate strings, inserting a newline between parts only when needed.

    None and empty strings are skipped. Adjacent parts are joined directly
    when the boundary already carries a newline (the left part ends with one
    or the right part starts with one); otherwise a single "\\n" separator is
    inserted. With no non-empty parts the result is "".
    """

    def _join(left: str, right: str) -> str:
        # Avoid doubling separators at a boundary that already has a newline.
        if left.endswith("\n") or right.startswith("\n"):
            return left + right
        return left + "\n" + right

    # Idiom fix: the original routed filtering through a hand-rolled generator
    # (`safe_filter`) whose only job was to yield "" when everything was
    # filtered out; a comprehension plus an explicit empty guard says the
    # same thing directly.
    parts = [s for s in strs if s]
    if not parts:
        return ""
    return reduce(_join, parts)
def nested_lookup(d: Any, keys: list[str]) -> Any:
    """Follow ``keys`` through nested mappings/objects; return None on any miss."""
    # Eventually we can support integer indexes into tuples and lists also
    current = d
    for key in keys:
        if current is None:
            return None
        try:
            if isinstance(key, str) and hasattr(current, key):
                # This is necessary to handle attributes with a property
                # getter that returns something different than what's in the
                # underlying dict. For example the text_representation for
                # a TableElement.
                current = getattr(current, key)
            else:
                current = current.get(key)
        except (AttributeError, ValueError):
            return None
    return current


def dotted_lookup(d: Any, keys: str) -> Any:
    """Like nested_lookup, with the path given as a dot-separated string."""
    return nested_lookup(d, keys.split("."))
def check_serializable(*objects):
    """Raise ValueError if any of ``objects`` cannot be serialized by Ray.

    The full inspection log is included in the error message to help
    pinpoint the offending object.
    """
    from ray.util import inspect_serializability
    import io

    log = io.StringIO()
    ok, s = inspect_serializability(objects, print_file=log)
    if not ok:
        raise ValueError(f"Something isn't serializable: {s}\nLog: {log.getvalue()}")


def handle_serialization_exception(*objects):
    """Decorator: on TypeError, diagnose whether the named ``self`` attributes
    are serialization culprits.

    ``objects`` are attribute names looked up on the wrapped method's
    ``self``. If one of them is unserializable, check_serializable raises a
    ValueError naming it; otherwise the original TypeError is re-raised.
    """

    def decorator(func):
        @functools.wraps(func)
        def wrapper(self, *args, **kwargs):
            try:
                return func(self, *args, **kwargs)
            except TypeError:
                attrs = [getattr(self, attr) for attr in objects]
                check_serializable(*attrs)
                # Bug fix: if every attribute IS serializable, the TypeError
                # was not a serialization problem -- the old code silently
                # swallowed it and returned None. Propagate it instead.
                raise

        return wrapper

    return decorator
https://raw.githubusercontent.com/aryn-ai/sycamore/c0e562d49b855c344d93c16c714b713c4491aa91/notebooks/EBGaramond-Bold.ttf --------------------------------------------------------------------------------